i8259: Clear ELCR on reset
[qemu/ar7.git] / fpu / softfloat.c
blob3aafa81d58b441e78d6344942092357c0498ecd2
1 /*
2 * QEMU float support
4 * Derived from SoftFloat.
5 */
7 /*============================================================================
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10 Package, Release 2b.
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
36 =============================================================================*/
38 /* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
41 #include "config.h"
43 #include "softfloat.h"
45 /*----------------------------------------------------------------------------
46 | Primitive arithmetic functions, including multi-word arithmetic, and
47 | division and square root approximations. (Can be specialized to target if
48 | desired.)
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-macros.h"
52 /*----------------------------------------------------------------------------
53 | Functions and definitions to determine: (1) whether tininess for underflow
54 | is detected before or after rounding by default, (2) what (if anything)
55 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
56 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57 | are propagated from function inputs to output. These details are target-
58 | specific.
59 *----------------------------------------------------------------------------*/
60 #include "softfloat-specialize.h"
62 void set_float_rounding_mode(int val STATUS_PARAM)
64 STATUS(float_rounding_mode) = val;
67 void set_float_exception_flags(int val STATUS_PARAM)
69 STATUS(float_exception_flags) = val;
72 void set_floatx80_rounding_precision(int val STATUS_PARAM)
74 STATUS(floatx80_rounding_precision) = val;
77 /*----------------------------------------------------------------------------
78 | Returns the fraction bits of the half-precision floating-point value `a'.
79 *----------------------------------------------------------------------------*/
81 INLINE uint32_t extractFloat16Frac(float16 a)
83 return float16_val(a) & 0x3ff;
86 /*----------------------------------------------------------------------------
87 | Returns the exponent bits of the half-precision floating-point value `a'.
88 *----------------------------------------------------------------------------*/
90 INLINE int16 extractFloat16Exp(float16 a)
92 return (float16_val(a) >> 10) & 0x1f;
95 /*----------------------------------------------------------------------------
96 | Returns the sign bit of the single-precision floating-point value `a'.
97 *----------------------------------------------------------------------------*/
99 INLINE flag extractFloat16Sign(float16 a)
101 return float16_val(a)>>15;
104 /*----------------------------------------------------------------------------
105 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
106 | and 7, and returns the properly rounded 32-bit integer corresponding to the
107 | input. If `zSign' is 1, the input is negated before being converted to an
108 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
109 | is simply rounded to an integer, with the inexact exception raised if the
110 | input cannot be represented exactly as an integer. However, if the fixed-
111 | point input is too large, the invalid exception is raised and the largest
112 | positive or negative integer is returned.
113 *----------------------------------------------------------------------------*/
115 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
117 int8 roundingMode;
118 flag roundNearestEven;
119 int8 roundIncrement, roundBits;
120 int32 z;
122 roundingMode = STATUS(float_rounding_mode);
123 roundNearestEven = ( roundingMode == float_round_nearest_even );
124 roundIncrement = 0x40;
125 if ( ! roundNearestEven ) {
126 if ( roundingMode == float_round_to_zero ) {
127 roundIncrement = 0;
129 else {
130 roundIncrement = 0x7F;
131 if ( zSign ) {
132 if ( roundingMode == float_round_up ) roundIncrement = 0;
134 else {
135 if ( roundingMode == float_round_down ) roundIncrement = 0;
139 roundBits = absZ & 0x7F;
140 absZ = ( absZ + roundIncrement )>>7;
141 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
142 z = absZ;
143 if ( zSign ) z = - z;
144 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
145 float_raise( float_flag_invalid STATUS_VAR);
146 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
148 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
149 return z;
153 /*----------------------------------------------------------------------------
154 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155 | `absZ1', with binary point between bits 63 and 64 (between the input words),
156 | and returns the properly rounded 64-bit integer corresponding to the input.
157 | If `zSign' is 1, the input is negated before being converted to an integer.
158 | Ordinarily, the fixed-point input is simply rounded to an integer, with
159 | the inexact exception raised if the input cannot be represented exactly as
160 | an integer. However, if the fixed-point input is too large, the invalid
161 | exception is raised and the largest positive or negative integer is
162 | returned.
163 *----------------------------------------------------------------------------*/
165 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
167 int8 roundingMode;
168 flag roundNearestEven, increment;
169 int64 z;
171 roundingMode = STATUS(float_rounding_mode);
172 roundNearestEven = ( roundingMode == float_round_nearest_even );
173 increment = ( (int64_t) absZ1 < 0 );
174 if ( ! roundNearestEven ) {
175 if ( roundingMode == float_round_to_zero ) {
176 increment = 0;
178 else {
179 if ( zSign ) {
180 increment = ( roundingMode == float_round_down ) && absZ1;
182 else {
183 increment = ( roundingMode == float_round_up ) && absZ1;
187 if ( increment ) {
188 ++absZ0;
189 if ( absZ0 == 0 ) goto overflow;
190 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
192 z = absZ0;
193 if ( zSign ) z = - z;
194 if ( z && ( ( z < 0 ) ^ zSign ) ) {
195 overflow:
196 float_raise( float_flag_invalid STATUS_VAR);
197 return
198 zSign ? (int64_t) LIT64( 0x8000000000000000 )
199 : LIT64( 0x7FFFFFFFFFFFFFFF );
201 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
202 return z;
206 /*----------------------------------------------------------------------------
207 | Returns the fraction bits of the single-precision floating-point value `a'.
208 *----------------------------------------------------------------------------*/
210 INLINE uint32_t extractFloat32Frac( float32 a )
213 return float32_val(a) & 0x007FFFFF;
217 /*----------------------------------------------------------------------------
218 | Returns the exponent bits of the single-precision floating-point value `a'.
219 *----------------------------------------------------------------------------*/
221 INLINE int16 extractFloat32Exp( float32 a )
224 return ( float32_val(a)>>23 ) & 0xFF;
228 /*----------------------------------------------------------------------------
229 | Returns the sign bit of the single-precision floating-point value `a'.
230 *----------------------------------------------------------------------------*/
232 INLINE flag extractFloat32Sign( float32 a )
235 return float32_val(a)>>31;
239 /*----------------------------------------------------------------------------
240 | If `a' is denormal and we are in flush-to-zero mode then set the
241 | input-denormal exception and return zero. Otherwise just return the value.
242 *----------------------------------------------------------------------------*/
243 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
245 if (STATUS(flush_inputs_to_zero)) {
246 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
247 float_raise(float_flag_input_denormal STATUS_VAR);
248 return make_float32(float32_val(a) & 0x80000000);
251 return a;
254 /*----------------------------------------------------------------------------
255 | Normalizes the subnormal single-precision floating-point value represented
256 | by the denormalized significand `aSig'. The normalized exponent and
257 | significand are stored at the locations pointed to by `zExpPtr' and
258 | `zSigPtr', respectively.
259 *----------------------------------------------------------------------------*/
261 static void
262 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
264 int8 shiftCount;
266 shiftCount = countLeadingZeros32( aSig ) - 8;
267 *zSigPtr = aSig<<shiftCount;
268 *zExpPtr = 1 - shiftCount;
272 /*----------------------------------------------------------------------------
273 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
274 | single-precision floating-point value, returning the result. After being
275 | shifted into the proper positions, the three fields are simply added
276 | together to form the result. This means that any integer portion of `zSig'
277 | will be added into the exponent. Since a properly normalized significand
278 | will have an integer portion equal to 1, the `zExp' input should be 1 less
279 | than the desired result exponent whenever `zSig' is a complete, normalized
280 | significand.
281 *----------------------------------------------------------------------------*/
283 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
286 return make_float32(
287 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
291 /*----------------------------------------------------------------------------
292 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
293 | and significand `zSig', and returns the proper single-precision floating-
294 | point value corresponding to the abstract input. Ordinarily, the abstract
295 | value is simply rounded and packed into the single-precision format, with
296 | the inexact exception raised if the abstract input cannot be represented
297 | exactly. However, if the abstract value is too large, the overflow and
298 | inexact exceptions are raised and an infinity or maximal finite value is
299 | returned. If the abstract value is too small, the input value is rounded to
300 | a subnormal number, and the underflow and inexact exceptions are raised if
301 | the abstract input cannot be represented exactly as a subnormal single-
302 | precision floating-point number.
303 | The input significand `zSig' has its binary point between bits 30
304 | and 29, which is 7 bits to the left of the usual location. This shifted
305 | significand must be normalized or smaller. If `zSig' is not normalized,
306 | `zExp' must be 0; in that case, the result returned is a subnormal number,
307 | and it must not require rounding. In the usual case that `zSig' is
308 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
309 | The handling of underflow and overflow follows the IEC/IEEE Standard for
310 | Binary Floating-Point Arithmetic.
311 *----------------------------------------------------------------------------*/
313 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
315 int8 roundingMode;
316 flag roundNearestEven;
317 int8 roundIncrement, roundBits;
318 flag isTiny;
320 roundingMode = STATUS(float_rounding_mode);
321 roundNearestEven = ( roundingMode == float_round_nearest_even );
322 roundIncrement = 0x40;
323 if ( ! roundNearestEven ) {
324 if ( roundingMode == float_round_to_zero ) {
325 roundIncrement = 0;
327 else {
328 roundIncrement = 0x7F;
329 if ( zSign ) {
330 if ( roundingMode == float_round_up ) roundIncrement = 0;
332 else {
333 if ( roundingMode == float_round_down ) roundIncrement = 0;
337 roundBits = zSig & 0x7F;
338 if ( 0xFD <= (uint16_t) zExp ) {
339 if ( ( 0xFD < zExp )
340 || ( ( zExp == 0xFD )
341 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
343 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
344 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
346 if ( zExp < 0 ) {
347 if (STATUS(flush_to_zero)) {
348 float_raise(float_flag_output_denormal STATUS_VAR);
349 return packFloat32(zSign, 0, 0);
351 isTiny =
352 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
353 || ( zExp < -1 )
354 || ( zSig + roundIncrement < 0x80000000 );
355 shift32RightJamming( zSig, - zExp, &zSig );
356 zExp = 0;
357 roundBits = zSig & 0x7F;
358 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
361 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
362 zSig = ( zSig + roundIncrement )>>7;
363 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
364 if ( zSig == 0 ) zExp = 0;
365 return packFloat32( zSign, zExp, zSig );
369 /*----------------------------------------------------------------------------
370 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
371 | and significand `zSig', and returns the proper single-precision floating-
372 | point value corresponding to the abstract input. This routine is just like
373 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
374 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
375 | floating-point exponent.
376 *----------------------------------------------------------------------------*/
378 static float32
379 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
381 int8 shiftCount;
383 shiftCount = countLeadingZeros32( zSig ) - 1;
384 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
388 /*----------------------------------------------------------------------------
389 | Returns the fraction bits of the double-precision floating-point value `a'.
390 *----------------------------------------------------------------------------*/
392 INLINE uint64_t extractFloat64Frac( float64 a )
395 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
399 /*----------------------------------------------------------------------------
400 | Returns the exponent bits of the double-precision floating-point value `a'.
401 *----------------------------------------------------------------------------*/
403 INLINE int16 extractFloat64Exp( float64 a )
406 return ( float64_val(a)>>52 ) & 0x7FF;
410 /*----------------------------------------------------------------------------
411 | Returns the sign bit of the double-precision floating-point value `a'.
412 *----------------------------------------------------------------------------*/
414 INLINE flag extractFloat64Sign( float64 a )
417 return float64_val(a)>>63;
421 /*----------------------------------------------------------------------------
422 | If `a' is denormal and we are in flush-to-zero mode then set the
423 | input-denormal exception and return zero. Otherwise just return the value.
424 *----------------------------------------------------------------------------*/
425 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
427 if (STATUS(flush_inputs_to_zero)) {
428 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
429 float_raise(float_flag_input_denormal STATUS_VAR);
430 return make_float64(float64_val(a) & (1ULL << 63));
433 return a;
436 /*----------------------------------------------------------------------------
437 | Normalizes the subnormal double-precision floating-point value represented
438 | by the denormalized significand `aSig'. The normalized exponent and
439 | significand are stored at the locations pointed to by `zExpPtr' and
440 | `zSigPtr', respectively.
441 *----------------------------------------------------------------------------*/
443 static void
444 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
446 int8 shiftCount;
448 shiftCount = countLeadingZeros64( aSig ) - 11;
449 *zSigPtr = aSig<<shiftCount;
450 *zExpPtr = 1 - shiftCount;
454 /*----------------------------------------------------------------------------
455 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
456 | double-precision floating-point value, returning the result. After being
457 | shifted into the proper positions, the three fields are simply added
458 | together to form the result. This means that any integer portion of `zSig'
459 | will be added into the exponent. Since a properly normalized significand
460 | will have an integer portion equal to 1, the `zExp' input should be 1 less
461 | than the desired result exponent whenever `zSig' is a complete, normalized
462 | significand.
463 *----------------------------------------------------------------------------*/
465 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
468 return make_float64(
469 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
473 /*----------------------------------------------------------------------------
474 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
475 | and significand `zSig', and returns the proper double-precision floating-
476 | point value corresponding to the abstract input. Ordinarily, the abstract
477 | value is simply rounded and packed into the double-precision format, with
478 | the inexact exception raised if the abstract input cannot be represented
479 | exactly. However, if the abstract value is too large, the overflow and
480 | inexact exceptions are raised and an infinity or maximal finite value is
481 | returned. If the abstract value is too small, the input value is rounded
482 | to a subnormal number, and the underflow and inexact exceptions are raised
483 | if the abstract input cannot be represented exactly as a subnormal double-
484 | precision floating-point number.
485 | The input significand `zSig' has its binary point between bits 62
486 | and 61, which is 10 bits to the left of the usual location. This shifted
487 | significand must be normalized or smaller. If `zSig' is not normalized,
488 | `zExp' must be 0; in that case, the result returned is a subnormal number,
489 | and it must not require rounding. In the usual case that `zSig' is
490 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
491 | The handling of underflow and overflow follows the IEC/IEEE Standard for
492 | Binary Floating-Point Arithmetic.
493 *----------------------------------------------------------------------------*/
495 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
497 int8 roundingMode;
498 flag roundNearestEven;
499 int16 roundIncrement, roundBits;
500 flag isTiny;
502 roundingMode = STATUS(float_rounding_mode);
503 roundNearestEven = ( roundingMode == float_round_nearest_even );
504 roundIncrement = 0x200;
505 if ( ! roundNearestEven ) {
506 if ( roundingMode == float_round_to_zero ) {
507 roundIncrement = 0;
509 else {
510 roundIncrement = 0x3FF;
511 if ( zSign ) {
512 if ( roundingMode == float_round_up ) roundIncrement = 0;
514 else {
515 if ( roundingMode == float_round_down ) roundIncrement = 0;
519 roundBits = zSig & 0x3FF;
520 if ( 0x7FD <= (uint16_t) zExp ) {
521 if ( ( 0x7FD < zExp )
522 || ( ( zExp == 0x7FD )
523 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
525 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
526 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
528 if ( zExp < 0 ) {
529 if (STATUS(flush_to_zero)) {
530 float_raise(float_flag_output_denormal STATUS_VAR);
531 return packFloat64(zSign, 0, 0);
533 isTiny =
534 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
535 || ( zExp < -1 )
536 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
537 shift64RightJamming( zSig, - zExp, &zSig );
538 zExp = 0;
539 roundBits = zSig & 0x3FF;
540 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
543 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
544 zSig = ( zSig + roundIncrement )>>10;
545 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
546 if ( zSig == 0 ) zExp = 0;
547 return packFloat64( zSign, zExp, zSig );
551 /*----------------------------------------------------------------------------
552 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
553 | and significand `zSig', and returns the proper double-precision floating-
554 | point value corresponding to the abstract input. This routine is just like
555 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
556 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
557 | floating-point exponent.
558 *----------------------------------------------------------------------------*/
560 static float64
561 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
563 int8 shiftCount;
565 shiftCount = countLeadingZeros64( zSig ) - 1;
566 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
570 /*----------------------------------------------------------------------------
571 | Returns the fraction bits of the extended double-precision floating-point
572 | value `a'.
573 *----------------------------------------------------------------------------*/
575 INLINE uint64_t extractFloatx80Frac( floatx80 a )
578 return a.low;
582 /*----------------------------------------------------------------------------
583 | Returns the exponent bits of the extended double-precision floating-point
584 | value `a'.
585 *----------------------------------------------------------------------------*/
587 INLINE int32 extractFloatx80Exp( floatx80 a )
590 return a.high & 0x7FFF;
594 /*----------------------------------------------------------------------------
595 | Returns the sign bit of the extended double-precision floating-point value
596 | `a'.
597 *----------------------------------------------------------------------------*/
599 INLINE flag extractFloatx80Sign( floatx80 a )
602 return a.high>>15;
606 /*----------------------------------------------------------------------------
607 | Normalizes the subnormal extended double-precision floating-point value
608 | represented by the denormalized significand `aSig'. The normalized exponent
609 | and significand are stored at the locations pointed to by `zExpPtr' and
610 | `zSigPtr', respectively.
611 *----------------------------------------------------------------------------*/
613 static void
614 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
616 int8 shiftCount;
618 shiftCount = countLeadingZeros64( aSig );
619 *zSigPtr = aSig<<shiftCount;
620 *zExpPtr = 1 - shiftCount;
624 /*----------------------------------------------------------------------------
625 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
626 | extended double-precision floating-point value, returning the result.
627 *----------------------------------------------------------------------------*/
629 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
631 floatx80 z;
633 z.low = zSig;
634 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
635 return z;
639 /*----------------------------------------------------------------------------
640 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
641 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
642 | and returns the proper extended double-precision floating-point value
643 | corresponding to the abstract input. Ordinarily, the abstract value is
644 | rounded and packed into the extended double-precision format, with the
645 | inexact exception raised if the abstract input cannot be represented
646 | exactly. However, if the abstract value is too large, the overflow and
647 | inexact exceptions are raised and an infinity or maximal finite value is
648 | returned. If the abstract value is too small, the input value is rounded to
649 | a subnormal number, and the underflow and inexact exceptions are raised if
650 | the abstract input cannot be represented exactly as a subnormal extended
651 | double-precision floating-point number.
652 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
653 | number of bits as single or double precision, respectively. Otherwise, the
654 | result is rounded to the full precision of the extended double-precision
655 | format.
656 | The input significand must be normalized or smaller. If the input
657 | significand is not normalized, `zExp' must be 0; in that case, the result
658 | returned is a subnormal number, and it must not require rounding. The
659 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
660 | Floating-Point Arithmetic.
661 *----------------------------------------------------------------------------*/
663 static floatx80
664 roundAndPackFloatx80(
665 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
666 STATUS_PARAM)
668 int8 roundingMode;
669 flag roundNearestEven, increment, isTiny;
670 int64 roundIncrement, roundMask, roundBits;
672 roundingMode = STATUS(float_rounding_mode);
673 roundNearestEven = ( roundingMode == float_round_nearest_even );
674 if ( roundingPrecision == 80 ) goto precision80;
675 if ( roundingPrecision == 64 ) {
676 roundIncrement = LIT64( 0x0000000000000400 );
677 roundMask = LIT64( 0x00000000000007FF );
679 else if ( roundingPrecision == 32 ) {
680 roundIncrement = LIT64( 0x0000008000000000 );
681 roundMask = LIT64( 0x000000FFFFFFFFFF );
683 else {
684 goto precision80;
686 zSig0 |= ( zSig1 != 0 );
687 if ( ! roundNearestEven ) {
688 if ( roundingMode == float_round_to_zero ) {
689 roundIncrement = 0;
691 else {
692 roundIncrement = roundMask;
693 if ( zSign ) {
694 if ( roundingMode == float_round_up ) roundIncrement = 0;
696 else {
697 if ( roundingMode == float_round_down ) roundIncrement = 0;
701 roundBits = zSig0 & roundMask;
702 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
703 if ( ( 0x7FFE < zExp )
704 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
706 goto overflow;
708 if ( zExp <= 0 ) {
709 if (STATUS(flush_to_zero)) {
710 float_raise(float_flag_output_denormal STATUS_VAR);
711 return packFloatx80(zSign, 0, 0);
713 isTiny =
714 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
715 || ( zExp < 0 )
716 || ( zSig0 <= zSig0 + roundIncrement );
717 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
718 zExp = 0;
719 roundBits = zSig0 & roundMask;
720 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
721 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
722 zSig0 += roundIncrement;
723 if ( (int64_t) zSig0 < 0 ) zExp = 1;
724 roundIncrement = roundMask + 1;
725 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
726 roundMask |= roundIncrement;
728 zSig0 &= ~ roundMask;
729 return packFloatx80( zSign, zExp, zSig0 );
732 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
733 zSig0 += roundIncrement;
734 if ( zSig0 < roundIncrement ) {
735 ++zExp;
736 zSig0 = LIT64( 0x8000000000000000 );
738 roundIncrement = roundMask + 1;
739 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
740 roundMask |= roundIncrement;
742 zSig0 &= ~ roundMask;
743 if ( zSig0 == 0 ) zExp = 0;
744 return packFloatx80( zSign, zExp, zSig0 );
745 precision80:
746 increment = ( (int64_t) zSig1 < 0 );
747 if ( ! roundNearestEven ) {
748 if ( roundingMode == float_round_to_zero ) {
749 increment = 0;
751 else {
752 if ( zSign ) {
753 increment = ( roundingMode == float_round_down ) && zSig1;
755 else {
756 increment = ( roundingMode == float_round_up ) && zSig1;
760 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
761 if ( ( 0x7FFE < zExp )
762 || ( ( zExp == 0x7FFE )
763 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
764 && increment
767 roundMask = 0;
768 overflow:
769 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
770 if ( ( roundingMode == float_round_to_zero )
771 || ( zSign && ( roundingMode == float_round_up ) )
772 || ( ! zSign && ( roundingMode == float_round_down ) )
774 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
776 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
778 if ( zExp <= 0 ) {
779 isTiny =
780 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
781 || ( zExp < 0 )
782 || ! increment
783 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
784 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
785 zExp = 0;
786 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
787 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
788 if ( roundNearestEven ) {
789 increment = ( (int64_t) zSig1 < 0 );
791 else {
792 if ( zSign ) {
793 increment = ( roundingMode == float_round_down ) && zSig1;
795 else {
796 increment = ( roundingMode == float_round_up ) && zSig1;
799 if ( increment ) {
800 ++zSig0;
801 zSig0 &=
802 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
803 if ( (int64_t) zSig0 < 0 ) zExp = 1;
805 return packFloatx80( zSign, zExp, zSig0 );
808 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
809 if ( increment ) {
810 ++zSig0;
811 if ( zSig0 == 0 ) {
812 ++zExp;
813 zSig0 = LIT64( 0x8000000000000000 );
815 else {
816 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
819 else {
820 if ( zSig0 == 0 ) zExp = 0;
822 return packFloatx80( zSign, zExp, zSig0 );
826 /*----------------------------------------------------------------------------
827 | Takes an abstract floating-point value having sign `zSign', exponent
828 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
829 | and returns the proper extended double-precision floating-point value
830 | corresponding to the abstract input. This routine is just like
831 | `roundAndPackFloatx80' except that the input significand does not have to be
832 | normalized.
833 *----------------------------------------------------------------------------*/
835 static floatx80
836 normalizeRoundAndPackFloatx80(
837 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
838 STATUS_PARAM)
840 int8 shiftCount;
842 if ( zSig0 == 0 ) {
843 zSig0 = zSig1;
844 zSig1 = 0;
845 zExp -= 64;
847 shiftCount = countLeadingZeros64( zSig0 );
848 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
849 zExp -= shiftCount;
850 return
851 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
855 /*----------------------------------------------------------------------------
856 | Returns the least-significant 64 fraction bits of the quadruple-precision
857 | floating-point value `a'.
858 *----------------------------------------------------------------------------*/
860 INLINE uint64_t extractFloat128Frac1( float128 a )
863 return a.low;
867 /*----------------------------------------------------------------------------
868 | Returns the most-significant 48 fraction bits of the quadruple-precision
869 | floating-point value `a'.
870 *----------------------------------------------------------------------------*/
872 INLINE uint64_t extractFloat128Frac0( float128 a )
875 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
879 /*----------------------------------------------------------------------------
880 | Returns the exponent bits of the quadruple-precision floating-point value
881 | `a'.
882 *----------------------------------------------------------------------------*/
884 INLINE int32 extractFloat128Exp( float128 a )
887 return ( a.high>>48 ) & 0x7FFF;
891 /*----------------------------------------------------------------------------
892 | Returns the sign bit of the quadruple-precision floating-point value `a'.
893 *----------------------------------------------------------------------------*/
895 INLINE flag extractFloat128Sign( float128 a )
898 return a.high>>63;
902 /*----------------------------------------------------------------------------
903 | Normalizes the subnormal quadruple-precision floating-point value
904 | represented by the denormalized significand formed by the concatenation of
905 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
906 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
907 | significand are stored at the location pointed to by `zSig0Ptr', and the
908 | least significant 64 bits of the normalized significand are stored at the
909 | location pointed to by `zSig1Ptr'.
910 *----------------------------------------------------------------------------*/
912 static void
913 normalizeFloat128Subnormal(
914 uint64_t aSig0,
915 uint64_t aSig1,
916 int32 *zExpPtr,
917 uint64_t *zSig0Ptr,
918 uint64_t *zSig1Ptr
921 int8 shiftCount;
923 if ( aSig0 == 0 ) {
924 shiftCount = countLeadingZeros64( aSig1 ) - 15;
925 if ( shiftCount < 0 ) {
926 *zSig0Ptr = aSig1>>( - shiftCount );
927 *zSig1Ptr = aSig1<<( shiftCount & 63 );
929 else {
930 *zSig0Ptr = aSig1<<shiftCount;
931 *zSig1Ptr = 0;
933 *zExpPtr = - shiftCount - 63;
935 else {
936 shiftCount = countLeadingZeros64( aSig0 ) - 15;
937 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
938 *zExpPtr = 1 - shiftCount;
943 /*----------------------------------------------------------------------------
944 | Packs the sign `zSign', the exponent `zExp', and the significand formed
945 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
946 | floating-point value, returning the result. After being shifted into the
947 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
948 | added together to form the most significant 32 bits of the result. This
949 | means that any integer portion of `zSig0' will be added into the exponent.
950 | Since a properly normalized significand will have an integer portion equal
951 | to 1, the `zExp' input should be 1 less than the desired result exponent
952 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
953 | significand.
954 *----------------------------------------------------------------------------*/
956 INLINE float128
957 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
959 float128 z;
961 z.low = zSig1;
962 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
963 return z;
967 /*----------------------------------------------------------------------------
968 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
969 | and extended significand formed by the concatenation of `zSig0', `zSig1',
970 | and `zSig2', and returns the proper quadruple-precision floating-point value
971 | corresponding to the abstract input. Ordinarily, the abstract value is
972 | simply rounded and packed into the quadruple-precision format, with the
973 | inexact exception raised if the abstract input cannot be represented
974 | exactly. However, if the abstract value is too large, the overflow and
975 | inexact exceptions are raised and an infinity or maximal finite value is
976 | returned. If the abstract value is too small, the input value is rounded to
977 | a subnormal number, and the underflow and inexact exceptions are raised if
978 | the abstract input cannot be represented exactly as a subnormal quadruple-
979 | precision floating-point number.
980 | The input significand must be normalized or smaller. If the input
981 | significand is not normalized, `zExp' must be 0; in that case, the result
982 | returned is a subnormal number, and it must not require rounding. In the
983 | usual case that the input significand is normalized, `zExp' must be 1 less
984 | than the ``true'' floating-point exponent. The handling of underflow and
985 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
986 *----------------------------------------------------------------------------*/
988 static float128
989 roundAndPackFloat128(
990 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
992 int8 roundingMode;
993 flag roundNearestEven, increment, isTiny;
995 roundingMode = STATUS(float_rounding_mode);
996 roundNearestEven = ( roundingMode == float_round_nearest_even );
997 increment = ( (int64_t) zSig2 < 0 );
998 if ( ! roundNearestEven ) {
999 if ( roundingMode == float_round_to_zero ) {
1000 increment = 0;
1002 else {
1003 if ( zSign ) {
1004 increment = ( roundingMode == float_round_down ) && zSig2;
1006 else {
1007 increment = ( roundingMode == float_round_up ) && zSig2;
1011 if ( 0x7FFD <= (uint32_t) zExp ) {
1012 if ( ( 0x7FFD < zExp )
1013 || ( ( zExp == 0x7FFD )
1014 && eq128(
1015 LIT64( 0x0001FFFFFFFFFFFF ),
1016 LIT64( 0xFFFFFFFFFFFFFFFF ),
1017 zSig0,
1018 zSig1
1020 && increment
1023 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1024 if ( ( roundingMode == float_round_to_zero )
1025 || ( zSign && ( roundingMode == float_round_up ) )
1026 || ( ! zSign && ( roundingMode == float_round_down ) )
1028 return
1029 packFloat128(
1030 zSign,
1031 0x7FFE,
1032 LIT64( 0x0000FFFFFFFFFFFF ),
1033 LIT64( 0xFFFFFFFFFFFFFFFF )
1036 return packFloat128( zSign, 0x7FFF, 0, 0 );
1038 if ( zExp < 0 ) {
1039 if (STATUS(flush_to_zero)) {
1040 float_raise(float_flag_output_denormal STATUS_VAR);
1041 return packFloat128(zSign, 0, 0, 0);
1043 isTiny =
1044 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1045 || ( zExp < -1 )
1046 || ! increment
1047 || lt128(
1048 zSig0,
1049 zSig1,
1050 LIT64( 0x0001FFFFFFFFFFFF ),
1051 LIT64( 0xFFFFFFFFFFFFFFFF )
1053 shift128ExtraRightJamming(
1054 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1055 zExp = 0;
1056 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1057 if ( roundNearestEven ) {
1058 increment = ( (int64_t) zSig2 < 0 );
1060 else {
1061 if ( zSign ) {
1062 increment = ( roundingMode == float_round_down ) && zSig2;
1064 else {
1065 increment = ( roundingMode == float_round_up ) && zSig2;
1070 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1071 if ( increment ) {
1072 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1073 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1075 else {
1076 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1078 return packFloat128( zSign, zExp, zSig0, zSig1 );
1082 /*----------------------------------------------------------------------------
1083 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1085 | returns the proper quadruple-precision floating-point value corresponding
1086 | to the abstract input. This routine is just like `roundAndPackFloat128'
1087 | except that the input significand has fewer bits and does not have to be
1088 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1089 | point exponent.
1090 *----------------------------------------------------------------------------*/
1092 static float128
1093 normalizeRoundAndPackFloat128(
1094 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1096 int8 shiftCount;
1097 uint64_t zSig2;
1099 if ( zSig0 == 0 ) {
1100 zSig0 = zSig1;
1101 zSig1 = 0;
1102 zExp -= 64;
1104 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105 if ( 0 <= shiftCount ) {
1106 zSig2 = 0;
1107 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1109 else {
1110 shift128ExtraRightJamming(
1111 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1113 zExp -= shiftCount;
1114 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1118 /*----------------------------------------------------------------------------
1119 | Returns the result of converting the 32-bit two's complement integer `a'
1120 | to the single-precision floating-point format. The conversion is performed
1121 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122 *----------------------------------------------------------------------------*/
1124 float32 int32_to_float32( int32 a STATUS_PARAM )
1126 flag zSign;
1128 if ( a == 0 ) return float32_zero;
1129 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1130 zSign = ( a < 0 );
1131 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1135 /*----------------------------------------------------------------------------
1136 | Returns the result of converting the 32-bit two's complement integer `a'
1137 | to the double-precision floating-point format. The conversion is performed
1138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1141 float64 int32_to_float64( int32 a STATUS_PARAM )
1143 flag zSign;
1144 uint32 absA;
1145 int8 shiftCount;
1146 uint64_t zSig;
1148 if ( a == 0 ) return float64_zero;
1149 zSign = ( a < 0 );
1150 absA = zSign ? - a : a;
1151 shiftCount = countLeadingZeros32( absA ) + 21;
1152 zSig = absA;
1153 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1157 /*----------------------------------------------------------------------------
1158 | Returns the result of converting the 32-bit two's complement integer `a'
1159 | to the extended double-precision floating-point format. The conversion
1160 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1161 | Arithmetic.
1162 *----------------------------------------------------------------------------*/
1164 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1166 flag zSign;
1167 uint32 absA;
1168 int8 shiftCount;
1169 uint64_t zSig;
1171 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1172 zSign = ( a < 0 );
1173 absA = zSign ? - a : a;
1174 shiftCount = countLeadingZeros32( absA ) + 32;
1175 zSig = absA;
1176 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1180 /*----------------------------------------------------------------------------
1181 | Returns the result of converting the 32-bit two's complement integer `a' to
1182 | the quadruple-precision floating-point format. The conversion is performed
1183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184 *----------------------------------------------------------------------------*/
1186 float128 int32_to_float128( int32 a STATUS_PARAM )
1188 flag zSign;
1189 uint32 absA;
1190 int8 shiftCount;
1191 uint64_t zSig0;
1193 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1194 zSign = ( a < 0 );
1195 absA = zSign ? - a : a;
1196 shiftCount = countLeadingZeros32( absA ) + 17;
1197 zSig0 = absA;
1198 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1202 /*----------------------------------------------------------------------------
1203 | Returns the result of converting the 64-bit two's complement integer `a'
1204 | to the single-precision floating-point format. The conversion is performed
1205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1206 *----------------------------------------------------------------------------*/
1208 float32 int64_to_float32( int64 a STATUS_PARAM )
1210 flag zSign;
1211 uint64 absA;
1212 int8 shiftCount;
1214 if ( a == 0 ) return float32_zero;
1215 zSign = ( a < 0 );
1216 absA = zSign ? - a : a;
1217 shiftCount = countLeadingZeros64( absA ) - 40;
1218 if ( 0 <= shiftCount ) {
1219 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1221 else {
1222 shiftCount += 7;
1223 if ( shiftCount < 0 ) {
1224 shift64RightJamming( absA, - shiftCount, &absA );
1226 else {
1227 absA <<= shiftCount;
1229 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1234 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1236 int8 shiftCount;
1238 if ( a == 0 ) return float32_zero;
1239 shiftCount = countLeadingZeros64( a ) - 40;
1240 if ( 0 <= shiftCount ) {
1241 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1243 else {
1244 shiftCount += 7;
1245 if ( shiftCount < 0 ) {
1246 shift64RightJamming( a, - shiftCount, &a );
1248 else {
1249 a <<= shiftCount;
1251 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1255 /*----------------------------------------------------------------------------
1256 | Returns the result of converting the 64-bit two's complement integer `a'
1257 | to the double-precision floating-point format. The conversion is performed
1258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259 *----------------------------------------------------------------------------*/
1261 float64 int64_to_float64( int64 a STATUS_PARAM )
1263 flag zSign;
1265 if ( a == 0 ) return float64_zero;
1266 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1267 return packFloat64( 1, 0x43E, 0 );
1269 zSign = ( a < 0 );
1270 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1274 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1276 if ( a == 0 ) return float64_zero;
1277 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1281 /*----------------------------------------------------------------------------
1282 | Returns the result of converting the 64-bit two's complement integer `a'
1283 | to the extended double-precision floating-point format. The conversion
1284 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1285 | Arithmetic.
1286 *----------------------------------------------------------------------------*/
1288 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1290 flag zSign;
1291 uint64 absA;
1292 int8 shiftCount;
1294 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1295 zSign = ( a < 0 );
1296 absA = zSign ? - a : a;
1297 shiftCount = countLeadingZeros64( absA );
1298 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1302 /*----------------------------------------------------------------------------
1303 | Returns the result of converting the 64-bit two's complement integer `a' to
1304 | the quadruple-precision floating-point format. The conversion is performed
1305 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1306 *----------------------------------------------------------------------------*/
1308 float128 int64_to_float128( int64 a STATUS_PARAM )
1310 flag zSign;
1311 uint64 absA;
1312 int8 shiftCount;
1313 int32 zExp;
1314 uint64_t zSig0, zSig1;
1316 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1317 zSign = ( a < 0 );
1318 absA = zSign ? - a : a;
1319 shiftCount = countLeadingZeros64( absA ) + 49;
1320 zExp = 0x406E - shiftCount;
1321 if ( 64 <= shiftCount ) {
1322 zSig1 = 0;
1323 zSig0 = absA;
1324 shiftCount -= 64;
1326 else {
1327 zSig1 = absA;
1328 zSig0 = 0;
1330 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1331 return packFloat128( zSign, zExp, zSig0, zSig1 );
1335 /*----------------------------------------------------------------------------
1336 | Returns the result of converting the single-precision floating-point value
1337 | `a' to the 32-bit two's complement integer format. The conversion is
1338 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1339 | Arithmetic---which means in particular that the conversion is rounded
1340 | according to the current rounding mode. If `a' is a NaN, the largest
1341 | positive integer is returned. Otherwise, if the conversion overflows, the
1342 | largest integer with the same sign as `a' is returned.
1343 *----------------------------------------------------------------------------*/
1345 int32 float32_to_int32( float32 a STATUS_PARAM )
1347 flag aSign;
1348 int16 aExp, shiftCount;
1349 uint32_t aSig;
1350 uint64_t aSig64;
1352 a = float32_squash_input_denormal(a STATUS_VAR);
1353 aSig = extractFloat32Frac( a );
1354 aExp = extractFloat32Exp( a );
1355 aSign = extractFloat32Sign( a );
1356 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1357 if ( aExp ) aSig |= 0x00800000;
1358 shiftCount = 0xAF - aExp;
1359 aSig64 = aSig;
1360 aSig64 <<= 32;
1361 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1362 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1366 /*----------------------------------------------------------------------------
1367 | Returns the result of converting the single-precision floating-point value
1368 | `a' to the 32-bit two's complement integer format. The conversion is
1369 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1370 | Arithmetic, except that the conversion is always rounded toward zero.
1371 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1372 | the conversion overflows, the largest integer with the same sign as `a' is
1373 | returned.
1374 *----------------------------------------------------------------------------*/
1376 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1378 flag aSign;
1379 int16 aExp, shiftCount;
1380 uint32_t aSig;
1381 int32 z;
1382 a = float32_squash_input_denormal(a STATUS_VAR);
1384 aSig = extractFloat32Frac( a );
1385 aExp = extractFloat32Exp( a );
1386 aSign = extractFloat32Sign( a );
1387 shiftCount = aExp - 0x9E;
1388 if ( 0 <= shiftCount ) {
1389 if ( float32_val(a) != 0xCF000000 ) {
1390 float_raise( float_flag_invalid STATUS_VAR);
1391 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1393 return (int32_t) 0x80000000;
1395 else if ( aExp <= 0x7E ) {
1396 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1397 return 0;
1399 aSig = ( aSig | 0x00800000 )<<8;
1400 z = aSig>>( - shiftCount );
1401 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1402 STATUS(float_exception_flags) |= float_flag_inexact;
1404 if ( aSign ) z = - z;
1405 return z;
1409 /*----------------------------------------------------------------------------
1410 | Returns the result of converting the single-precision floating-point value
1411 | `a' to the 16-bit two's complement integer format. The conversion is
1412 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1413 | Arithmetic, except that the conversion is always rounded toward zero.
1414 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1415 | the conversion overflows, the largest integer with the same sign as `a' is
1416 | returned.
1417 *----------------------------------------------------------------------------*/
1419 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1421 flag aSign;
1422 int16 aExp, shiftCount;
1423 uint32_t aSig;
1424 int32 z;
1426 aSig = extractFloat32Frac( a );
1427 aExp = extractFloat32Exp( a );
1428 aSign = extractFloat32Sign( a );
1429 shiftCount = aExp - 0x8E;
1430 if ( 0 <= shiftCount ) {
1431 if ( float32_val(a) != 0xC7000000 ) {
1432 float_raise( float_flag_invalid STATUS_VAR);
1433 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1434 return 0x7FFF;
1437 return (int32_t) 0xffff8000;
1439 else if ( aExp <= 0x7E ) {
1440 if ( aExp | aSig ) {
1441 STATUS(float_exception_flags) |= float_flag_inexact;
1443 return 0;
1445 shiftCount -= 0x10;
1446 aSig = ( aSig | 0x00800000 )<<8;
1447 z = aSig>>( - shiftCount );
1448 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1449 STATUS(float_exception_flags) |= float_flag_inexact;
1451 if ( aSign ) {
1452 z = - z;
1454 return z;
1458 /*----------------------------------------------------------------------------
1459 | Returns the result of converting the single-precision floating-point value
1460 | `a' to the 64-bit two's complement integer format. The conversion is
1461 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1462 | Arithmetic---which means in particular that the conversion is rounded
1463 | according to the current rounding mode. If `a' is a NaN, the largest
1464 | positive integer is returned. Otherwise, if the conversion overflows, the
1465 | largest integer with the same sign as `a' is returned.
1466 *----------------------------------------------------------------------------*/
1468 int64 float32_to_int64( float32 a STATUS_PARAM )
1470 flag aSign;
1471 int16 aExp, shiftCount;
1472 uint32_t aSig;
1473 uint64_t aSig64, aSigExtra;
1474 a = float32_squash_input_denormal(a STATUS_VAR);
1476 aSig = extractFloat32Frac( a );
1477 aExp = extractFloat32Exp( a );
1478 aSign = extractFloat32Sign( a );
1479 shiftCount = 0xBE - aExp;
1480 if ( shiftCount < 0 ) {
1481 float_raise( float_flag_invalid STATUS_VAR);
1482 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1483 return LIT64( 0x7FFFFFFFFFFFFFFF );
1485 return (int64_t) LIT64( 0x8000000000000000 );
1487 if ( aExp ) aSig |= 0x00800000;
1488 aSig64 = aSig;
1489 aSig64 <<= 40;
1490 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1491 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1495 /*----------------------------------------------------------------------------
1496 | Returns the result of converting the single-precision floating-point value
1497 | `a' to the 64-bit two's complement integer format. The conversion is
1498 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1499 | Arithmetic, except that the conversion is always rounded toward zero. If
1500 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1501 | conversion overflows, the largest integer with the same sign as `a' is
1502 | returned.
1503 *----------------------------------------------------------------------------*/
1505 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1507 flag aSign;
1508 int16 aExp, shiftCount;
1509 uint32_t aSig;
1510 uint64_t aSig64;
1511 int64 z;
1512 a = float32_squash_input_denormal(a STATUS_VAR);
1514 aSig = extractFloat32Frac( a );
1515 aExp = extractFloat32Exp( a );
1516 aSign = extractFloat32Sign( a );
1517 shiftCount = aExp - 0xBE;
1518 if ( 0 <= shiftCount ) {
1519 if ( float32_val(a) != 0xDF000000 ) {
1520 float_raise( float_flag_invalid STATUS_VAR);
1521 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1522 return LIT64( 0x7FFFFFFFFFFFFFFF );
1525 return (int64_t) LIT64( 0x8000000000000000 );
1527 else if ( aExp <= 0x7E ) {
1528 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1529 return 0;
1531 aSig64 = aSig | 0x00800000;
1532 aSig64 <<= 40;
1533 z = aSig64>>( - shiftCount );
1534 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1535 STATUS(float_exception_flags) |= float_flag_inexact;
1537 if ( aSign ) z = - z;
1538 return z;
1542 /*----------------------------------------------------------------------------
1543 | Returns the result of converting the single-precision floating-point value
1544 | `a' to the double-precision floating-point format. The conversion is
1545 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1546 | Arithmetic.
1547 *----------------------------------------------------------------------------*/
1549 float64 float32_to_float64( float32 a STATUS_PARAM )
1551 flag aSign;
1552 int16 aExp;
1553 uint32_t aSig;
1554 a = float32_squash_input_denormal(a STATUS_VAR);
1556 aSig = extractFloat32Frac( a );
1557 aExp = extractFloat32Exp( a );
1558 aSign = extractFloat32Sign( a );
1559 if ( aExp == 0xFF ) {
1560 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1561 return packFloat64( aSign, 0x7FF, 0 );
1563 if ( aExp == 0 ) {
1564 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1565 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1566 --aExp;
1568 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1572 /*----------------------------------------------------------------------------
1573 | Returns the result of converting the single-precision floating-point value
1574 | `a' to the extended double-precision floating-point format. The conversion
1575 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1576 | Arithmetic.
1577 *----------------------------------------------------------------------------*/
1579 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1581 flag aSign;
1582 int16 aExp;
1583 uint32_t aSig;
1585 a = float32_squash_input_denormal(a STATUS_VAR);
1586 aSig = extractFloat32Frac( a );
1587 aExp = extractFloat32Exp( a );
1588 aSign = extractFloat32Sign( a );
1589 if ( aExp == 0xFF ) {
1590 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1591 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1593 if ( aExp == 0 ) {
1594 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1595 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1597 aSig |= 0x00800000;
1598 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1602 /*----------------------------------------------------------------------------
1603 | Returns the result of converting the single-precision floating-point value
1604 | `a' to the double-precision floating-point format. The conversion is
1605 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1606 | Arithmetic.
1607 *----------------------------------------------------------------------------*/
1609 float128 float32_to_float128( float32 a STATUS_PARAM )
1611 flag aSign;
1612 int16 aExp;
1613 uint32_t aSig;
1615 a = float32_squash_input_denormal(a STATUS_VAR);
1616 aSig = extractFloat32Frac( a );
1617 aExp = extractFloat32Exp( a );
1618 aSign = extractFloat32Sign( a );
1619 if ( aExp == 0xFF ) {
1620 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1621 return packFloat128( aSign, 0x7FFF, 0, 0 );
1623 if ( aExp == 0 ) {
1624 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1625 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1626 --aExp;
1628 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1632 /*----------------------------------------------------------------------------
1633 | Rounds the single-precision floating-point value `a' to an integer, and
1634 | returns the result as a single-precision floating-point value. The
1635 | operation is performed according to the IEC/IEEE Standard for Binary
1636 | Floating-Point Arithmetic.
1637 *----------------------------------------------------------------------------*/
1639 float32 float32_round_to_int( float32 a STATUS_PARAM)
1641 flag aSign;
1642 int16 aExp;
1643 uint32_t lastBitMask, roundBitsMask;
1644 int8 roundingMode;
1645 uint32_t z;
1646 a = float32_squash_input_denormal(a STATUS_VAR);
1648 aExp = extractFloat32Exp( a );
1649 if ( 0x96 <= aExp ) {
1650 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1651 return propagateFloat32NaN( a, a STATUS_VAR );
1653 return a;
1655 if ( aExp <= 0x7E ) {
1656 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1657 STATUS(float_exception_flags) |= float_flag_inexact;
1658 aSign = extractFloat32Sign( a );
1659 switch ( STATUS(float_rounding_mode) ) {
1660 case float_round_nearest_even:
1661 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1662 return packFloat32( aSign, 0x7F, 0 );
1664 break;
1665 case float_round_down:
1666 return make_float32(aSign ? 0xBF800000 : 0);
1667 case float_round_up:
1668 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1670 return packFloat32( aSign, 0, 0 );
1672 lastBitMask = 1;
1673 lastBitMask <<= 0x96 - aExp;
1674 roundBitsMask = lastBitMask - 1;
1675 z = float32_val(a);
1676 roundingMode = STATUS(float_rounding_mode);
1677 if ( roundingMode == float_round_nearest_even ) {
1678 z += lastBitMask>>1;
1679 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1681 else if ( roundingMode != float_round_to_zero ) {
1682 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1683 z += roundBitsMask;
1686 z &= ~ roundBitsMask;
1687 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1688 return make_float32(z);
1692 /*----------------------------------------------------------------------------
1693 | Returns the result of adding the absolute values of the single-precision
1694 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1695 | before being returned. `zSign' is ignored if the result is a NaN.
1696 | The addition is performed according to the IEC/IEEE Standard for Binary
1697 | Floating-Point Arithmetic.
1698 *----------------------------------------------------------------------------*/
1700 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1702 int16 aExp, bExp, zExp;
1703 uint32_t aSig, bSig, zSig;
1704 int16 expDiff;
1706 aSig = extractFloat32Frac( a );
1707 aExp = extractFloat32Exp( a );
1708 bSig = extractFloat32Frac( b );
1709 bExp = extractFloat32Exp( b );
1710 expDiff = aExp - bExp;
1711 aSig <<= 6;
1712 bSig <<= 6;
1713 if ( 0 < expDiff ) {
1714 if ( aExp == 0xFF ) {
1715 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1716 return a;
1718 if ( bExp == 0 ) {
1719 --expDiff;
1721 else {
1722 bSig |= 0x20000000;
1724 shift32RightJamming( bSig, expDiff, &bSig );
1725 zExp = aExp;
1727 else if ( expDiff < 0 ) {
1728 if ( bExp == 0xFF ) {
1729 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1730 return packFloat32( zSign, 0xFF, 0 );
1732 if ( aExp == 0 ) {
1733 ++expDiff;
1735 else {
1736 aSig |= 0x20000000;
1738 shift32RightJamming( aSig, - expDiff, &aSig );
1739 zExp = bExp;
1741 else {
1742 if ( aExp == 0xFF ) {
1743 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1744 return a;
1746 if ( aExp == 0 ) {
1747 if (STATUS(flush_to_zero)) {
1748 if (aSig | bSig) {
1749 float_raise(float_flag_output_denormal STATUS_VAR);
1751 return packFloat32(zSign, 0, 0);
1753 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1755 zSig = 0x40000000 + aSig + bSig;
1756 zExp = aExp;
1757 goto roundAndPack;
1759 aSig |= 0x20000000;
1760 zSig = ( aSig + bSig )<<1;
1761 --zExp;
1762 if ( (int32_t) zSig < 0 ) {
1763 zSig = aSig + bSig;
1764 ++zExp;
1766 roundAndPack:
1767 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1771 /*----------------------------------------------------------------------------
1772 | Returns the result of subtracting the absolute values of the single-
1773 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1774 | difference is negated before being returned. `zSign' is ignored if the
1775 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1776 | Standard for Binary Floating-Point Arithmetic.
1777 *----------------------------------------------------------------------------*/
1779 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1781 int16 aExp, bExp, zExp;
1782 uint32_t aSig, bSig, zSig;
1783 int16 expDiff;
1785 aSig = extractFloat32Frac( a );
1786 aExp = extractFloat32Exp( a );
1787 bSig = extractFloat32Frac( b );
1788 bExp = extractFloat32Exp( b );
1789 expDiff = aExp - bExp;
1790 aSig <<= 7;
1791 bSig <<= 7;
1792 if ( 0 < expDiff ) goto aExpBigger;
1793 if ( expDiff < 0 ) goto bExpBigger;
1794 if ( aExp == 0xFF ) {
1795 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1796 float_raise( float_flag_invalid STATUS_VAR);
1797 return float32_default_nan;
1799 if ( aExp == 0 ) {
1800 aExp = 1;
1801 bExp = 1;
1803 if ( bSig < aSig ) goto aBigger;
1804 if ( aSig < bSig ) goto bBigger;
1805 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1806 bExpBigger:
1807 if ( bExp == 0xFF ) {
1808 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1809 return packFloat32( zSign ^ 1, 0xFF, 0 );
1811 if ( aExp == 0 ) {
1812 ++expDiff;
1814 else {
1815 aSig |= 0x40000000;
1817 shift32RightJamming( aSig, - expDiff, &aSig );
1818 bSig |= 0x40000000;
1819 bBigger:
1820 zSig = bSig - aSig;
1821 zExp = bExp;
1822 zSign ^= 1;
1823 goto normalizeRoundAndPack;
1824 aExpBigger:
1825 if ( aExp == 0xFF ) {
1826 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827 return a;
1829 if ( bExp == 0 ) {
1830 --expDiff;
1832 else {
1833 bSig |= 0x40000000;
1835 shift32RightJamming( bSig, expDiff, &bSig );
1836 aSig |= 0x40000000;
1837 aBigger:
1838 zSig = aSig - bSig;
1839 zExp = aExp;
1840 normalizeRoundAndPack:
1841 --zExp;
1842 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1846 /*----------------------------------------------------------------------------
1847 | Returns the result of adding the single-precision floating-point values `a'
1848 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1849 | Binary Floating-Point Arithmetic.
1850 *----------------------------------------------------------------------------*/
1852 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1854 flag aSign, bSign;
1855 a = float32_squash_input_denormal(a STATUS_VAR);
1856 b = float32_squash_input_denormal(b STATUS_VAR);
1858 aSign = extractFloat32Sign( a );
1859 bSign = extractFloat32Sign( b );
1860 if ( aSign == bSign ) {
1861 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1863 else {
1864 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1869 /*----------------------------------------------------------------------------
1870 | Returns the result of subtracting the single-precision floating-point values
1871 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1872 | for Binary Floating-Point Arithmetic.
1873 *----------------------------------------------------------------------------*/
1875 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1877 flag aSign, bSign;
1878 a = float32_squash_input_denormal(a STATUS_VAR);
1879 b = float32_squash_input_denormal(b STATUS_VAR);
1881 aSign = extractFloat32Sign( a );
1882 bSign = extractFloat32Sign( b );
1883 if ( aSign == bSign ) {
1884 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1886 else {
1887 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1892 /*----------------------------------------------------------------------------
1893 | Returns the result of multiplying the single-precision floating-point values
1894 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1895 | for Binary Floating-Point Arithmetic.
1896 *----------------------------------------------------------------------------*/
1898 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1900 flag aSign, bSign, zSign;
1901 int16 aExp, bExp, zExp;
1902 uint32_t aSig, bSig;
1903 uint64_t zSig64;
1904 uint32_t zSig;
1906 a = float32_squash_input_denormal(a STATUS_VAR);
1907 b = float32_squash_input_denormal(b STATUS_VAR);
1909 aSig = extractFloat32Frac( a );
1910 aExp = extractFloat32Exp( a );
1911 aSign = extractFloat32Sign( a );
1912 bSig = extractFloat32Frac( b );
1913 bExp = extractFloat32Exp( b );
1914 bSign = extractFloat32Sign( b );
1915 zSign = aSign ^ bSign;
1916 if ( aExp == 0xFF ) {
1917 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1918 return propagateFloat32NaN( a, b STATUS_VAR );
1920 if ( ( bExp | bSig ) == 0 ) {
1921 float_raise( float_flag_invalid STATUS_VAR);
1922 return float32_default_nan;
1924 return packFloat32( zSign, 0xFF, 0 );
1926 if ( bExp == 0xFF ) {
1927 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1928 if ( ( aExp | aSig ) == 0 ) {
1929 float_raise( float_flag_invalid STATUS_VAR);
1930 return float32_default_nan;
1932 return packFloat32( zSign, 0xFF, 0 );
1934 if ( aExp == 0 ) {
1935 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1936 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1938 if ( bExp == 0 ) {
1939 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1940 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1942 zExp = aExp + bExp - 0x7F;
1943 aSig = ( aSig | 0x00800000 )<<7;
1944 bSig = ( bSig | 0x00800000 )<<8;
1945 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1946 zSig = zSig64;
1947 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1948 zSig <<= 1;
1949 --zExp;
1951 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1955 /*----------------------------------------------------------------------------
1956 | Returns the result of dividing the single-precision floating-point value `a'
1957 | by the corresponding value `b'. The operation is performed according to the
1958 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1959 *----------------------------------------------------------------------------*/
1961 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1963 flag aSign, bSign, zSign;
1964 int16 aExp, bExp, zExp;
1965 uint32_t aSig, bSig, zSig;
1966 a = float32_squash_input_denormal(a STATUS_VAR);
1967 b = float32_squash_input_denormal(b STATUS_VAR);
1969 aSig = extractFloat32Frac( a );
1970 aExp = extractFloat32Exp( a );
1971 aSign = extractFloat32Sign( a );
1972 bSig = extractFloat32Frac( b );
1973 bExp = extractFloat32Exp( b );
1974 bSign = extractFloat32Sign( b );
1975 zSign = aSign ^ bSign;
1976 if ( aExp == 0xFF ) {
1977 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1978 if ( bExp == 0xFF ) {
1979 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1980 float_raise( float_flag_invalid STATUS_VAR);
1981 return float32_default_nan;
1983 return packFloat32( zSign, 0xFF, 0 );
1985 if ( bExp == 0xFF ) {
1986 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1987 return packFloat32( zSign, 0, 0 );
1989 if ( bExp == 0 ) {
1990 if ( bSig == 0 ) {
1991 if ( ( aExp | aSig ) == 0 ) {
1992 float_raise( float_flag_invalid STATUS_VAR);
1993 return float32_default_nan;
1995 float_raise( float_flag_divbyzero STATUS_VAR);
1996 return packFloat32( zSign, 0xFF, 0 );
1998 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2000 if ( aExp == 0 ) {
2001 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2002 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2004 zExp = aExp - bExp + 0x7D;
2005 aSig = ( aSig | 0x00800000 )<<7;
2006 bSig = ( bSig | 0x00800000 )<<8;
2007 if ( bSig <= ( aSig + aSig ) ) {
2008 aSig >>= 1;
2009 ++zExp;
2011 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2012 if ( ( zSig & 0x3F ) == 0 ) {
2013 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2015 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2019 /*----------------------------------------------------------------------------
2020 | Returns the remainder of the single-precision floating-point value `a'
2021 | with respect to the corresponding value `b'. The operation is performed
2022 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2023 *----------------------------------------------------------------------------*/
2025 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2027 flag aSign, zSign;
2028 int16 aExp, bExp, expDiff;
2029 uint32_t aSig, bSig;
2030 uint32_t q;
2031 uint64_t aSig64, bSig64, q64;
2032 uint32_t alternateASig;
2033 int32_t sigMean;
2034 a = float32_squash_input_denormal(a STATUS_VAR);
2035 b = float32_squash_input_denormal(b STATUS_VAR);
2037 aSig = extractFloat32Frac( a );
2038 aExp = extractFloat32Exp( a );
2039 aSign = extractFloat32Sign( a );
2040 bSig = extractFloat32Frac( b );
2041 bExp = extractFloat32Exp( b );
2042 if ( aExp == 0xFF ) {
2043 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2044 return propagateFloat32NaN( a, b STATUS_VAR );
2046 float_raise( float_flag_invalid STATUS_VAR);
2047 return float32_default_nan;
2049 if ( bExp == 0xFF ) {
2050 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2051 return a;
2053 if ( bExp == 0 ) {
2054 if ( bSig == 0 ) {
2055 float_raise( float_flag_invalid STATUS_VAR);
2056 return float32_default_nan;
2058 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2060 if ( aExp == 0 ) {
2061 if ( aSig == 0 ) return a;
2062 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2064 expDiff = aExp - bExp;
2065 aSig |= 0x00800000;
2066 bSig |= 0x00800000;
2067 if ( expDiff < 32 ) {
2068 aSig <<= 8;
2069 bSig <<= 8;
2070 if ( expDiff < 0 ) {
2071 if ( expDiff < -1 ) return a;
2072 aSig >>= 1;
2074 q = ( bSig <= aSig );
2075 if ( q ) aSig -= bSig;
2076 if ( 0 < expDiff ) {
2077 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2078 q >>= 32 - expDiff;
2079 bSig >>= 2;
2080 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2082 else {
2083 aSig >>= 2;
2084 bSig >>= 2;
2087 else {
2088 if ( bSig <= aSig ) aSig -= bSig;
2089 aSig64 = ( (uint64_t) aSig )<<40;
2090 bSig64 = ( (uint64_t) bSig )<<40;
2091 expDiff -= 64;
2092 while ( 0 < expDiff ) {
2093 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2094 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2095 aSig64 = - ( ( bSig * q64 )<<38 );
2096 expDiff -= 62;
2098 expDiff += 64;
2099 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2100 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2101 q = q64>>( 64 - expDiff );
2102 bSig <<= 6;
2103 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2105 do {
2106 alternateASig = aSig;
2107 ++q;
2108 aSig -= bSig;
2109 } while ( 0 <= (int32_t) aSig );
2110 sigMean = aSig + alternateASig;
2111 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2112 aSig = alternateASig;
2114 zSign = ( (int32_t) aSig < 0 );
2115 if ( zSign ) aSig = - aSig;
2116 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2120 /*----------------------------------------------------------------------------
2121 | Returns the square root of the single-precision floating-point value `a'.
2122 | The operation is performed according to the IEC/IEEE Standard for Binary
2123 | Floating-Point Arithmetic.
2124 *----------------------------------------------------------------------------*/
2126 float32 float32_sqrt( float32 a STATUS_PARAM )
2128 flag aSign;
2129 int16 aExp, zExp;
2130 uint32_t aSig, zSig;
2131 uint64_t rem, term;
2132 a = float32_squash_input_denormal(a STATUS_VAR);
2134 aSig = extractFloat32Frac( a );
2135 aExp = extractFloat32Exp( a );
2136 aSign = extractFloat32Sign( a );
2137 if ( aExp == 0xFF ) {
2138 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2139 if ( ! aSign ) return a;
2140 float_raise( float_flag_invalid STATUS_VAR);
2141 return float32_default_nan;
2143 if ( aSign ) {
2144 if ( ( aExp | aSig ) == 0 ) return a;
2145 float_raise( float_flag_invalid STATUS_VAR);
2146 return float32_default_nan;
2148 if ( aExp == 0 ) {
2149 if ( aSig == 0 ) return float32_zero;
2150 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2152 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2153 aSig = ( aSig | 0x00800000 )<<8;
2154 zSig = estimateSqrt32( aExp, aSig ) + 2;
2155 if ( ( zSig & 0x7F ) <= 5 ) {
2156 if ( zSig < 2 ) {
2157 zSig = 0x7FFFFFFF;
2158 goto roundAndPack;
2160 aSig >>= aExp & 1;
2161 term = ( (uint64_t) zSig ) * zSig;
2162 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2163 while ( (int64_t) rem < 0 ) {
2164 --zSig;
2165 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2167 zSig |= ( rem != 0 );
2169 shift32RightJamming( zSig, 1, &zSig );
2170 roundAndPack:
2171 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2175 /*----------------------------------------------------------------------------
2176 | Returns the binary exponential of the single-precision floating-point value
2177 | `a'. The operation is performed according to the IEC/IEEE Standard for
2178 | Binary Floating-Point Arithmetic.
2180 | Uses the following identities:
2182 | 1. -------------------------------------------------------------------------
2183 | x x*ln(2)
2184 | 2 = e
2186 | 2. -------------------------------------------------------------------------
2187 | 2 3 4 5 n
2188 | x x x x x x x
2189 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2190 | 1! 2! 3! 4! 5! n!
2191 *----------------------------------------------------------------------------*/
2193 static const float64 float32_exp2_coefficients[15] =
2195 const_float64( 0x3ff0000000000000ll ), /* 1 */
2196 const_float64( 0x3fe0000000000000ll ), /* 2 */
2197 const_float64( 0x3fc5555555555555ll ), /* 3 */
2198 const_float64( 0x3fa5555555555555ll ), /* 4 */
2199 const_float64( 0x3f81111111111111ll ), /* 5 */
2200 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2201 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2202 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2203 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2204 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2205 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2206 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2207 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2208 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2209 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2212 float32 float32_exp2( float32 a STATUS_PARAM )
2214 flag aSign;
2215 int16 aExp;
2216 uint32_t aSig;
2217 float64 r, x, xn;
2218 int i;
2219 a = float32_squash_input_denormal(a STATUS_VAR);
2221 aSig = extractFloat32Frac( a );
2222 aExp = extractFloat32Exp( a );
2223 aSign = extractFloat32Sign( a );
2225 if ( aExp == 0xFF) {
2226 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2227 return (aSign) ? float32_zero : a;
2229 if (aExp == 0) {
2230 if (aSig == 0) return float32_one;
2233 float_raise( float_flag_inexact STATUS_VAR);
2235 /* ******************************* */
2236 /* using float64 for approximation */
2237 /* ******************************* */
2238 x = float32_to_float64(a STATUS_VAR);
2239 x = float64_mul(x, float64_ln2 STATUS_VAR);
2241 xn = x;
2242 r = float64_one;
2243 for (i = 0 ; i < 15 ; i++) {
2244 float64 f;
2246 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2247 r = float64_add(r, f STATUS_VAR);
2249 xn = float64_mul(xn, x STATUS_VAR);
2252 return float64_to_float32(r, status);
2255 /*----------------------------------------------------------------------------
2256 | Returns the binary log of the single-precision floating-point value `a'.
2257 | The operation is performed according to the IEC/IEEE Standard for Binary
2258 | Floating-Point Arithmetic.
2259 *----------------------------------------------------------------------------*/
2260 float32 float32_log2( float32 a STATUS_PARAM )
2262 flag aSign, zSign;
2263 int16 aExp;
2264 uint32_t aSig, zSig, i;
2266 a = float32_squash_input_denormal(a STATUS_VAR);
2267 aSig = extractFloat32Frac( a );
2268 aExp = extractFloat32Exp( a );
2269 aSign = extractFloat32Sign( a );
2271 if ( aExp == 0 ) {
2272 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2273 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2275 if ( aSign ) {
2276 float_raise( float_flag_invalid STATUS_VAR);
2277 return float32_default_nan;
2279 if ( aExp == 0xFF ) {
2280 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2281 return a;
2284 aExp -= 0x7F;
2285 aSig |= 0x00800000;
2286 zSign = aExp < 0;
2287 zSig = aExp << 23;
2289 for (i = 1 << 22; i > 0; i >>= 1) {
2290 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2291 if ( aSig & 0x01000000 ) {
2292 aSig >>= 1;
2293 zSig |= i;
2297 if ( zSign )
2298 zSig = -zSig;
2300 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2303 /*----------------------------------------------------------------------------
2304 | Returns 1 if the single-precision floating-point value `a' is equal to
2305 | the corresponding value `b', and 0 otherwise. The invalid exception is
2306 | raised if either operand is a NaN. Otherwise, the comparison is performed
2307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2308 *----------------------------------------------------------------------------*/
2310 int float32_eq( float32 a, float32 b STATUS_PARAM )
2312 uint32_t av, bv;
2313 a = float32_squash_input_denormal(a STATUS_VAR);
2314 b = float32_squash_input_denormal(b STATUS_VAR);
2316 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2317 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2319 float_raise( float_flag_invalid STATUS_VAR);
2320 return 0;
2322 av = float32_val(a);
2323 bv = float32_val(b);
2324 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2327 /*----------------------------------------------------------------------------
2328 | Returns 1 if the single-precision floating-point value `a' is less than
2329 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2330 | exception is raised if either operand is a NaN. The comparison is performed
2331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2332 *----------------------------------------------------------------------------*/
2334 int float32_le( float32 a, float32 b STATUS_PARAM )
2336 flag aSign, bSign;
2337 uint32_t av, bv;
2338 a = float32_squash_input_denormal(a STATUS_VAR);
2339 b = float32_squash_input_denormal(b STATUS_VAR);
2341 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2342 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2344 float_raise( float_flag_invalid STATUS_VAR);
2345 return 0;
2347 aSign = extractFloat32Sign( a );
2348 bSign = extractFloat32Sign( b );
2349 av = float32_val(a);
2350 bv = float32_val(b);
2351 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2352 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2356 /*----------------------------------------------------------------------------
2357 | Returns 1 if the single-precision floating-point value `a' is less than
2358 | the corresponding value `b', and 0 otherwise. The invalid exception is
2359 | raised if either operand is a NaN. The comparison is performed according
2360 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2361 *----------------------------------------------------------------------------*/
2363 int float32_lt( float32 a, float32 b STATUS_PARAM )
2365 flag aSign, bSign;
2366 uint32_t av, bv;
2367 a = float32_squash_input_denormal(a STATUS_VAR);
2368 b = float32_squash_input_denormal(b STATUS_VAR);
2370 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2371 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2373 float_raise( float_flag_invalid STATUS_VAR);
2374 return 0;
2376 aSign = extractFloat32Sign( a );
2377 bSign = extractFloat32Sign( b );
2378 av = float32_val(a);
2379 bv = float32_val(b);
2380 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2381 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2385 /*----------------------------------------------------------------------------
2386 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2387 | be compared, and 0 otherwise. The invalid exception is raised if either
2388 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2389 | Standard for Binary Floating-Point Arithmetic.
2390 *----------------------------------------------------------------------------*/
2392 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2394 a = float32_squash_input_denormal(a STATUS_VAR);
2395 b = float32_squash_input_denormal(b STATUS_VAR);
2397 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2398 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2400 float_raise( float_flag_invalid STATUS_VAR);
2401 return 1;
2403 return 0;
2406 /*----------------------------------------------------------------------------
2407 | Returns 1 if the single-precision floating-point value `a' is equal to
2408 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2409 | exception. The comparison is performed according to the IEC/IEEE Standard
2410 | for Binary Floating-Point Arithmetic.
2411 *----------------------------------------------------------------------------*/
2413 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2415 a = float32_squash_input_denormal(a STATUS_VAR);
2416 b = float32_squash_input_denormal(b STATUS_VAR);
2418 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2419 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2421 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2422 float_raise( float_flag_invalid STATUS_VAR);
2424 return 0;
2426 return ( float32_val(a) == float32_val(b) ) ||
2427 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2430 /*----------------------------------------------------------------------------
2431 | Returns 1 if the single-precision floating-point value `a' is less than or
2432 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2433 | cause an exception. Otherwise, the comparison is performed according to the
2434 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2435 *----------------------------------------------------------------------------*/
2437 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2439 flag aSign, bSign;
2440 uint32_t av, bv;
2441 a = float32_squash_input_denormal(a STATUS_VAR);
2442 b = float32_squash_input_denormal(b STATUS_VAR);
2444 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2445 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2447 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2448 float_raise( float_flag_invalid STATUS_VAR);
2450 return 0;
2452 aSign = extractFloat32Sign( a );
2453 bSign = extractFloat32Sign( b );
2454 av = float32_val(a);
2455 bv = float32_val(b);
2456 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2457 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2461 /*----------------------------------------------------------------------------
2462 | Returns 1 if the single-precision floating-point value `a' is less than
2463 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2464 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2465 | Standard for Binary Floating-Point Arithmetic.
2466 *----------------------------------------------------------------------------*/
2468 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2470 flag aSign, bSign;
2471 uint32_t av, bv;
2472 a = float32_squash_input_denormal(a STATUS_VAR);
2473 b = float32_squash_input_denormal(b STATUS_VAR);
2475 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2476 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2478 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2479 float_raise( float_flag_invalid STATUS_VAR);
2481 return 0;
2483 aSign = extractFloat32Sign( a );
2484 bSign = extractFloat32Sign( b );
2485 av = float32_val(a);
2486 bv = float32_val(b);
2487 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2488 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2492 /*----------------------------------------------------------------------------
2493 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2494 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2495 | comparison is performed according to the IEC/IEEE Standard for Binary
2496 | Floating-Point Arithmetic.
2497 *----------------------------------------------------------------------------*/
2499 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2501 a = float32_squash_input_denormal(a STATUS_VAR);
2502 b = float32_squash_input_denormal(b STATUS_VAR);
2504 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2505 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2507 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2508 float_raise( float_flag_invalid STATUS_VAR);
2510 return 1;
2512 return 0;
2515 /*----------------------------------------------------------------------------
2516 | Returns the result of converting the double-precision floating-point value
2517 | `a' to the 32-bit two's complement integer format. The conversion is
2518 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2519 | Arithmetic---which means in particular that the conversion is rounded
2520 | according to the current rounding mode. If `a' is a NaN, the largest
2521 | positive integer is returned. Otherwise, if the conversion overflows, the
2522 | largest integer with the same sign as `a' is returned.
2523 *----------------------------------------------------------------------------*/
2525 int32 float64_to_int32( float64 a STATUS_PARAM )
2527 flag aSign;
2528 int16 aExp, shiftCount;
2529 uint64_t aSig;
2530 a = float64_squash_input_denormal(a STATUS_VAR);
2532 aSig = extractFloat64Frac( a );
2533 aExp = extractFloat64Exp( a );
2534 aSign = extractFloat64Sign( a );
2535 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2536 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2537 shiftCount = 0x42C - aExp;
2538 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2539 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2543 /*----------------------------------------------------------------------------
2544 | Returns the result of converting the double-precision floating-point value
2545 | `a' to the 32-bit two's complement integer format. The conversion is
2546 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2547 | Arithmetic, except that the conversion is always rounded toward zero.
2548 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2549 | the conversion overflows, the largest integer with the same sign as `a' is
2550 | returned.
2551 *----------------------------------------------------------------------------*/
2553 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2555 flag aSign;
2556 int16 aExp, shiftCount;
2557 uint64_t aSig, savedASig;
2558 int32 z;
2559 a = float64_squash_input_denormal(a STATUS_VAR);
2561 aSig = extractFloat64Frac( a );
2562 aExp = extractFloat64Exp( a );
2563 aSign = extractFloat64Sign( a );
2564 if ( 0x41E < aExp ) {
2565 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2566 goto invalid;
2568 else if ( aExp < 0x3FF ) {
2569 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2570 return 0;
2572 aSig |= LIT64( 0x0010000000000000 );
2573 shiftCount = 0x433 - aExp;
2574 savedASig = aSig;
2575 aSig >>= shiftCount;
2576 z = aSig;
2577 if ( aSign ) z = - z;
2578 if ( ( z < 0 ) ^ aSign ) {
2579 invalid:
2580 float_raise( float_flag_invalid STATUS_VAR);
2581 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2583 if ( ( aSig<<shiftCount ) != savedASig ) {
2584 STATUS(float_exception_flags) |= float_flag_inexact;
2586 return z;
2590 /*----------------------------------------------------------------------------
2591 | Returns the result of converting the double-precision floating-point value
2592 | `a' to the 16-bit two's complement integer format. The conversion is
2593 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2594 | Arithmetic, except that the conversion is always rounded toward zero.
2595 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2596 | the conversion overflows, the largest integer with the same sign as `a' is
2597 | returned.
2598 *----------------------------------------------------------------------------*/
2600 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2602 flag aSign;
2603 int16 aExp, shiftCount;
2604 uint64_t aSig, savedASig;
2605 int32 z;
2607 aSig = extractFloat64Frac( a );
2608 aExp = extractFloat64Exp( a );
2609 aSign = extractFloat64Sign( a );
2610 if ( 0x40E < aExp ) {
2611 if ( ( aExp == 0x7FF ) && aSig ) {
2612 aSign = 0;
2614 goto invalid;
2616 else if ( aExp < 0x3FF ) {
2617 if ( aExp || aSig ) {
2618 STATUS(float_exception_flags) |= float_flag_inexact;
2620 return 0;
2622 aSig |= LIT64( 0x0010000000000000 );
2623 shiftCount = 0x433 - aExp;
2624 savedASig = aSig;
2625 aSig >>= shiftCount;
2626 z = aSig;
2627 if ( aSign ) {
2628 z = - z;
2630 if ( ( (int16_t)z < 0 ) ^ aSign ) {
2631 invalid:
2632 float_raise( float_flag_invalid STATUS_VAR);
2633 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2635 if ( ( aSig<<shiftCount ) != savedASig ) {
2636 STATUS(float_exception_flags) |= float_flag_inexact;
2638 return z;
2641 /*----------------------------------------------------------------------------
2642 | Returns the result of converting the double-precision floating-point value
2643 | `a' to the 64-bit two's complement integer format. The conversion is
2644 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2645 | Arithmetic---which means in particular that the conversion is rounded
2646 | according to the current rounding mode. If `a' is a NaN, the largest
2647 | positive integer is returned. Otherwise, if the conversion overflows, the
2648 | largest integer with the same sign as `a' is returned.
2649 *----------------------------------------------------------------------------*/
2651 int64 float64_to_int64( float64 a STATUS_PARAM )
2653 flag aSign;
2654 int16 aExp, shiftCount;
2655 uint64_t aSig, aSigExtra;
2656 a = float64_squash_input_denormal(a STATUS_VAR);
2658 aSig = extractFloat64Frac( a );
2659 aExp = extractFloat64Exp( a );
2660 aSign = extractFloat64Sign( a );
2661 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2662 shiftCount = 0x433 - aExp;
2663 if ( shiftCount <= 0 ) {
2664 if ( 0x43E < aExp ) {
2665 float_raise( float_flag_invalid STATUS_VAR);
2666 if ( ! aSign
2667 || ( ( aExp == 0x7FF )
2668 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2670 return LIT64( 0x7FFFFFFFFFFFFFFF );
2672 return (int64_t) LIT64( 0x8000000000000000 );
2674 aSigExtra = 0;
2675 aSig <<= - shiftCount;
2677 else {
2678 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2680 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2684 /*----------------------------------------------------------------------------
2685 | Returns the result of converting the double-precision floating-point value
2686 | `a' to the 64-bit two's complement integer format. The conversion is
2687 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2688 | Arithmetic, except that the conversion is always rounded toward zero.
2689 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2690 | the conversion overflows, the largest integer with the same sign as `a' is
2691 | returned.
2692 *----------------------------------------------------------------------------*/
2694 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2696 flag aSign;
2697 int16 aExp, shiftCount;
2698 uint64_t aSig;
2699 int64 z;
2700 a = float64_squash_input_denormal(a STATUS_VAR);
2702 aSig = extractFloat64Frac( a );
2703 aExp = extractFloat64Exp( a );
2704 aSign = extractFloat64Sign( a );
2705 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2706 shiftCount = aExp - 0x433;
2707 if ( 0 <= shiftCount ) {
2708 if ( 0x43E <= aExp ) {
2709 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2710 float_raise( float_flag_invalid STATUS_VAR);
2711 if ( ! aSign
2712 || ( ( aExp == 0x7FF )
2713 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2715 return LIT64( 0x7FFFFFFFFFFFFFFF );
2718 return (int64_t) LIT64( 0x8000000000000000 );
2720 z = aSig<<shiftCount;
2722 else {
2723 if ( aExp < 0x3FE ) {
2724 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2725 return 0;
2727 z = aSig>>( - shiftCount );
2728 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2729 STATUS(float_exception_flags) |= float_flag_inexact;
2732 if ( aSign ) z = - z;
2733 return z;
2737 /*----------------------------------------------------------------------------
2738 | Returns the result of converting the double-precision floating-point value
2739 | `a' to the single-precision floating-point format. The conversion is
2740 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2741 | Arithmetic.
2742 *----------------------------------------------------------------------------*/
2744 float32 float64_to_float32( float64 a STATUS_PARAM )
2746 flag aSign;
2747 int16 aExp;
2748 uint64_t aSig;
2749 uint32_t zSig;
2750 a = float64_squash_input_denormal(a STATUS_VAR);
2752 aSig = extractFloat64Frac( a );
2753 aExp = extractFloat64Exp( a );
2754 aSign = extractFloat64Sign( a );
2755 if ( aExp == 0x7FF ) {
2756 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2757 return packFloat32( aSign, 0xFF, 0 );
2759 shift64RightJamming( aSig, 22, &aSig );
2760 zSig = aSig;
2761 if ( aExp || zSig ) {
2762 zSig |= 0x40000000;
2763 aExp -= 0x381;
2765 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2770 /*----------------------------------------------------------------------------
2771 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2772 | half-precision floating-point value, returning the result. After being
2773 | shifted into the proper positions, the three fields are simply added
2774 | together to form the result. This means that any integer portion of `zSig'
2775 | will be added into the exponent. Since a properly normalized significand
2776 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2777 | than the desired result exponent whenever `zSig' is a complete, normalized
2778 | significand.
2779 *----------------------------------------------------------------------------*/
2780 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2782 return make_float16(
2783 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2786 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2787 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2789 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2791 flag aSign;
2792 int16 aExp;
2793 uint32_t aSig;
2795 aSign = extractFloat16Sign(a);
2796 aExp = extractFloat16Exp(a);
2797 aSig = extractFloat16Frac(a);
2799 if (aExp == 0x1f && ieee) {
2800 if (aSig) {
2801 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2803 return packFloat32(aSign, 0xff, aSig << 13);
2805 if (aExp == 0) {
2806 int8 shiftCount;
2808 if (aSig == 0) {
2809 return packFloat32(aSign, 0, 0);
2812 shiftCount = countLeadingZeros32( aSig ) - 21;
2813 aSig = aSig << shiftCount;
2814 aExp = -shiftCount;
2816 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2819 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2821 flag aSign;
2822 int16 aExp;
2823 uint32_t aSig;
2824 uint32_t mask;
2825 uint32_t increment;
2826 int8 roundingMode;
2827 a = float32_squash_input_denormal(a STATUS_VAR);
2829 aSig = extractFloat32Frac( a );
2830 aExp = extractFloat32Exp( a );
2831 aSign = extractFloat32Sign( a );
2832 if ( aExp == 0xFF ) {
2833 if (aSig) {
2834 /* Input is a NaN */
2835 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2836 if (!ieee) {
2837 return packFloat16(aSign, 0, 0);
2839 return r;
2841 /* Infinity */
2842 if (!ieee) {
2843 float_raise(float_flag_invalid STATUS_VAR);
2844 return packFloat16(aSign, 0x1f, 0x3ff);
2846 return packFloat16(aSign, 0x1f, 0);
2848 if (aExp == 0 && aSig == 0) {
2849 return packFloat16(aSign, 0, 0);
2851 /* Decimal point between bits 22 and 23. */
2852 aSig |= 0x00800000;
2853 aExp -= 0x7f;
2854 if (aExp < -14) {
2855 mask = 0x00ffffff;
2856 if (aExp >= -24) {
2857 mask >>= 25 + aExp;
2859 } else {
2860 mask = 0x00001fff;
2862 if (aSig & mask) {
2863 float_raise( float_flag_underflow STATUS_VAR );
2864 roundingMode = STATUS(float_rounding_mode);
2865 switch (roundingMode) {
2866 case float_round_nearest_even:
2867 increment = (mask + 1) >> 1;
2868 if ((aSig & mask) == increment) {
2869 increment = aSig & (increment << 1);
2871 break;
2872 case float_round_up:
2873 increment = aSign ? 0 : mask;
2874 break;
2875 case float_round_down:
2876 increment = aSign ? mask : 0;
2877 break;
2878 default: /* round_to_zero */
2879 increment = 0;
2880 break;
2882 aSig += increment;
2883 if (aSig >= 0x01000000) {
2884 aSig >>= 1;
2885 aExp++;
2887 } else if (aExp < -14
2888 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2889 float_raise( float_flag_underflow STATUS_VAR);
2892 if (ieee) {
2893 if (aExp > 15) {
2894 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2895 return packFloat16(aSign, 0x1f, 0);
2897 } else {
2898 if (aExp > 16) {
2899 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2900 return packFloat16(aSign, 0x1f, 0x3ff);
2903 if (aExp < -24) {
2904 return packFloat16(aSign, 0, 0);
2906 if (aExp < -14) {
2907 aSig >>= -14 - aExp;
2908 aExp = -14;
2910 return packFloat16(aSign, aExp + 14, aSig >> 13);
2913 /*----------------------------------------------------------------------------
2914 | Returns the result of converting the double-precision floating-point value
2915 | `a' to the extended double-precision floating-point format. The conversion
2916 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2917 | Arithmetic.
2918 *----------------------------------------------------------------------------*/
2920 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2922 flag aSign;
2923 int16 aExp;
2924 uint64_t aSig;
2926 a = float64_squash_input_denormal(a STATUS_VAR);
2927 aSig = extractFloat64Frac( a );
2928 aExp = extractFloat64Exp( a );
2929 aSign = extractFloat64Sign( a );
2930 if ( aExp == 0x7FF ) {
2931 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2932 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2934 if ( aExp == 0 ) {
2935 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2936 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2938 return
2939 packFloatx80(
2940 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2944 /*----------------------------------------------------------------------------
2945 | Returns the result of converting the double-precision floating-point value
2946 | `a' to the quadruple-precision floating-point format. The conversion is
2947 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2948 | Arithmetic.
2949 *----------------------------------------------------------------------------*/
2951 float128 float64_to_float128( float64 a STATUS_PARAM )
2953 flag aSign;
2954 int16 aExp;
2955 uint64_t aSig, zSig0, zSig1;
2957 a = float64_squash_input_denormal(a STATUS_VAR);
2958 aSig = extractFloat64Frac( a );
2959 aExp = extractFloat64Exp( a );
2960 aSign = extractFloat64Sign( a );
2961 if ( aExp == 0x7FF ) {
2962 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2963 return packFloat128( aSign, 0x7FFF, 0, 0 );
2965 if ( aExp == 0 ) {
2966 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2967 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2968 --aExp;
2970 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2971 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2975 /*----------------------------------------------------------------------------
2976 | Rounds the double-precision floating-point value `a' to an integer, and
2977 | returns the result as a double-precision floating-point value. The
2978 | operation is performed according to the IEC/IEEE Standard for Binary
2979 | Floating-Point Arithmetic.
2980 *----------------------------------------------------------------------------*/
2982 float64 float64_round_to_int( float64 a STATUS_PARAM )
2984 flag aSign;
2985 int16 aExp;
2986 uint64_t lastBitMask, roundBitsMask;
2987 int8 roundingMode;
2988 uint64_t z;
2989 a = float64_squash_input_denormal(a STATUS_VAR);
2991 aExp = extractFloat64Exp( a );
2992 if ( 0x433 <= aExp ) {
2993 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2994 return propagateFloat64NaN( a, a STATUS_VAR );
2996 return a;
2998 if ( aExp < 0x3FF ) {
2999 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3000 STATUS(float_exception_flags) |= float_flag_inexact;
3001 aSign = extractFloat64Sign( a );
3002 switch ( STATUS(float_rounding_mode) ) {
3003 case float_round_nearest_even:
3004 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3005 return packFloat64( aSign, 0x3FF, 0 );
3007 break;
3008 case float_round_down:
3009 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3010 case float_round_up:
3011 return make_float64(
3012 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3014 return packFloat64( aSign, 0, 0 );
3016 lastBitMask = 1;
3017 lastBitMask <<= 0x433 - aExp;
3018 roundBitsMask = lastBitMask - 1;
3019 z = float64_val(a);
3020 roundingMode = STATUS(float_rounding_mode);
3021 if ( roundingMode == float_round_nearest_even ) {
3022 z += lastBitMask>>1;
3023 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3025 else if ( roundingMode != float_round_to_zero ) {
3026 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3027 z += roundBitsMask;
3030 z &= ~ roundBitsMask;
3031 if ( z != float64_val(a) )
3032 STATUS(float_exception_flags) |= float_flag_inexact;
3033 return make_float64(z);
3037 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3039 int oldmode;
3040 float64 res;
3041 oldmode = STATUS(float_rounding_mode);
3042 STATUS(float_rounding_mode) = float_round_to_zero;
3043 res = float64_round_to_int(a STATUS_VAR);
3044 STATUS(float_rounding_mode) = oldmode;
3045 return res;
3048 /*----------------------------------------------------------------------------
3049 | Returns the result of adding the absolute values of the double-precision
3050 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3051 | before being returned. `zSign' is ignored if the result is a NaN.
3052 | The addition is performed according to the IEC/IEEE Standard for Binary
3053 | Floating-Point Arithmetic.
3054 *----------------------------------------------------------------------------*/
3056 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3058 int16 aExp, bExp, zExp;
3059 uint64_t aSig, bSig, zSig;
3060 int16 expDiff;
3062 aSig = extractFloat64Frac( a );
3063 aExp = extractFloat64Exp( a );
3064 bSig = extractFloat64Frac( b );
3065 bExp = extractFloat64Exp( b );
3066 expDiff = aExp - bExp;
3067 aSig <<= 9;
3068 bSig <<= 9;
3069 if ( 0 < expDiff ) {
3070 if ( aExp == 0x7FF ) {
3071 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3072 return a;
3074 if ( bExp == 0 ) {
3075 --expDiff;
3077 else {
3078 bSig |= LIT64( 0x2000000000000000 );
3080 shift64RightJamming( bSig, expDiff, &bSig );
3081 zExp = aExp;
3083 else if ( expDiff < 0 ) {
3084 if ( bExp == 0x7FF ) {
3085 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3086 return packFloat64( zSign, 0x7FF, 0 );
3088 if ( aExp == 0 ) {
3089 ++expDiff;
3091 else {
3092 aSig |= LIT64( 0x2000000000000000 );
3094 shift64RightJamming( aSig, - expDiff, &aSig );
3095 zExp = bExp;
3097 else {
3098 if ( aExp == 0x7FF ) {
3099 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3100 return a;
3102 if ( aExp == 0 ) {
3103 if (STATUS(flush_to_zero)) {
3104 if (aSig | bSig) {
3105 float_raise(float_flag_output_denormal STATUS_VAR);
3107 return packFloat64(zSign, 0, 0);
3109 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3111 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3112 zExp = aExp;
3113 goto roundAndPack;
3115 aSig |= LIT64( 0x2000000000000000 );
3116 zSig = ( aSig + bSig )<<1;
3117 --zExp;
3118 if ( (int64_t) zSig < 0 ) {
3119 zSig = aSig + bSig;
3120 ++zExp;
3122 roundAndPack:
3123 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3127 /*----------------------------------------------------------------------------
3128 | Returns the result of subtracting the absolute values of the double-
3129 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3130 | difference is negated before being returned. `zSign' is ignored if the
3131 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3132 | Standard for Binary Floating-Point Arithmetic.
3133 *----------------------------------------------------------------------------*/
3135 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3137 int16 aExp, bExp, zExp;
3138 uint64_t aSig, bSig, zSig;
3139 int16 expDiff;
3141 aSig = extractFloat64Frac( a );
3142 aExp = extractFloat64Exp( a );
3143 bSig = extractFloat64Frac( b );
3144 bExp = extractFloat64Exp( b );
3145 expDiff = aExp - bExp;
3146 aSig <<= 10;
3147 bSig <<= 10;
3148 if ( 0 < expDiff ) goto aExpBigger;
3149 if ( expDiff < 0 ) goto bExpBigger;
3150 if ( aExp == 0x7FF ) {
3151 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3152 float_raise( float_flag_invalid STATUS_VAR);
3153 return float64_default_nan;
3155 if ( aExp == 0 ) {
3156 aExp = 1;
3157 bExp = 1;
3159 if ( bSig < aSig ) goto aBigger;
3160 if ( aSig < bSig ) goto bBigger;
3161 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3162 bExpBigger:
3163 if ( bExp == 0x7FF ) {
3164 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3165 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3167 if ( aExp == 0 ) {
3168 ++expDiff;
3170 else {
3171 aSig |= LIT64( 0x4000000000000000 );
3173 shift64RightJamming( aSig, - expDiff, &aSig );
3174 bSig |= LIT64( 0x4000000000000000 );
3175 bBigger:
3176 zSig = bSig - aSig;
3177 zExp = bExp;
3178 zSign ^= 1;
3179 goto normalizeRoundAndPack;
3180 aExpBigger:
3181 if ( aExp == 0x7FF ) {
3182 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3183 return a;
3185 if ( bExp == 0 ) {
3186 --expDiff;
3188 else {
3189 bSig |= LIT64( 0x4000000000000000 );
3191 shift64RightJamming( bSig, expDiff, &bSig );
3192 aSig |= LIT64( 0x4000000000000000 );
3193 aBigger:
3194 zSig = aSig - bSig;
3195 zExp = aExp;
3196 normalizeRoundAndPack:
3197 --zExp;
3198 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3202 /*----------------------------------------------------------------------------
3203 | Returns the result of adding the double-precision floating-point values `a'
3204 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3205 | Binary Floating-Point Arithmetic.
3206 *----------------------------------------------------------------------------*/
3208 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3210 flag aSign, bSign;
3211 a = float64_squash_input_denormal(a STATUS_VAR);
3212 b = float64_squash_input_denormal(b STATUS_VAR);
3214 aSign = extractFloat64Sign( a );
3215 bSign = extractFloat64Sign( b );
3216 if ( aSign == bSign ) {
3217 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3219 else {
3220 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3225 /*----------------------------------------------------------------------------
3226 | Returns the result of subtracting the double-precision floating-point values
3227 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3228 | for Binary Floating-Point Arithmetic.
3229 *----------------------------------------------------------------------------*/
3231 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3233 flag aSign, bSign;
3234 a = float64_squash_input_denormal(a STATUS_VAR);
3235 b = float64_squash_input_denormal(b STATUS_VAR);
3237 aSign = extractFloat64Sign( a );
3238 bSign = extractFloat64Sign( b );
3239 if ( aSign == bSign ) {
3240 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3242 else {
3243 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3248 /*----------------------------------------------------------------------------
3249 | Returns the result of multiplying the double-precision floating-point values
3250 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3251 | for Binary Floating-Point Arithmetic.
3252 *----------------------------------------------------------------------------*/
3254 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3256 flag aSign, bSign, zSign;
3257 int16 aExp, bExp, zExp;
3258 uint64_t aSig, bSig, zSig0, zSig1;
3260 a = float64_squash_input_denormal(a STATUS_VAR);
3261 b = float64_squash_input_denormal(b STATUS_VAR);
3263 aSig = extractFloat64Frac( a );
3264 aExp = extractFloat64Exp( a );
3265 aSign = extractFloat64Sign( a );
3266 bSig = extractFloat64Frac( b );
3267 bExp = extractFloat64Exp( b );
3268 bSign = extractFloat64Sign( b );
3269 zSign = aSign ^ bSign;
3270 if ( aExp == 0x7FF ) {
3271 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3272 return propagateFloat64NaN( a, b STATUS_VAR );
3274 if ( ( bExp | bSig ) == 0 ) {
3275 float_raise( float_flag_invalid STATUS_VAR);
3276 return float64_default_nan;
3278 return packFloat64( zSign, 0x7FF, 0 );
3280 if ( bExp == 0x7FF ) {
3281 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3282 if ( ( aExp | aSig ) == 0 ) {
3283 float_raise( float_flag_invalid STATUS_VAR);
3284 return float64_default_nan;
3286 return packFloat64( zSign, 0x7FF, 0 );
3288 if ( aExp == 0 ) {
3289 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3290 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3292 if ( bExp == 0 ) {
3293 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3294 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3296 zExp = aExp + bExp - 0x3FF;
3297 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3298 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3299 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3300 zSig0 |= ( zSig1 != 0 );
3301 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3302 zSig0 <<= 1;
3303 --zExp;
3305 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3309 /*----------------------------------------------------------------------------
3310 | Returns the result of dividing the double-precision floating-point value `a'
3311 | by the corresponding value `b'. The operation is performed according to
3312 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3313 *----------------------------------------------------------------------------*/
3315 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3317 flag aSign, bSign, zSign;
3318 int16 aExp, bExp, zExp;
3319 uint64_t aSig, bSig, zSig;
3320 uint64_t rem0, rem1;
3321 uint64_t term0, term1;
3322 a = float64_squash_input_denormal(a STATUS_VAR);
3323 b = float64_squash_input_denormal(b STATUS_VAR);
3325 aSig = extractFloat64Frac( a );
3326 aExp = extractFloat64Exp( a );
3327 aSign = extractFloat64Sign( a );
3328 bSig = extractFloat64Frac( b );
3329 bExp = extractFloat64Exp( b );
3330 bSign = extractFloat64Sign( b );
3331 zSign = aSign ^ bSign;
3332 if ( aExp == 0x7FF ) {
3333 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3334 if ( bExp == 0x7FF ) {
3335 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3336 float_raise( float_flag_invalid STATUS_VAR);
3337 return float64_default_nan;
3339 return packFloat64( zSign, 0x7FF, 0 );
3341 if ( bExp == 0x7FF ) {
3342 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3343 return packFloat64( zSign, 0, 0 );
3345 if ( bExp == 0 ) {
3346 if ( bSig == 0 ) {
3347 if ( ( aExp | aSig ) == 0 ) {
3348 float_raise( float_flag_invalid STATUS_VAR);
3349 return float64_default_nan;
3351 float_raise( float_flag_divbyzero STATUS_VAR);
3352 return packFloat64( zSign, 0x7FF, 0 );
3354 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3356 if ( aExp == 0 ) {
3357 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3358 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3360 zExp = aExp - bExp + 0x3FD;
3361 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3362 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3363 if ( bSig <= ( aSig + aSig ) ) {
3364 aSig >>= 1;
3365 ++zExp;
3367 zSig = estimateDiv128To64( aSig, 0, bSig );
3368 if ( ( zSig & 0x1FF ) <= 2 ) {
3369 mul64To128( bSig, zSig, &term0, &term1 );
3370 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3371 while ( (int64_t) rem0 < 0 ) {
3372 --zSig;
3373 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3375 zSig |= ( rem1 != 0 );
3377 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3381 /*----------------------------------------------------------------------------
3382 | Returns the remainder of the double-precision floating-point value `a'
3383 | with respect to the corresponding value `b'. The operation is performed
3384 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3385 *----------------------------------------------------------------------------*/
3387 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3389 flag aSign, zSign;
3390 int16 aExp, bExp, expDiff;
3391 uint64_t aSig, bSig;
3392 uint64_t q, alternateASig;
3393 int64_t sigMean;
3395 a = float64_squash_input_denormal(a STATUS_VAR);
3396 b = float64_squash_input_denormal(b STATUS_VAR);
3397 aSig = extractFloat64Frac( a );
3398 aExp = extractFloat64Exp( a );
3399 aSign = extractFloat64Sign( a );
3400 bSig = extractFloat64Frac( b );
3401 bExp = extractFloat64Exp( b );
3402 if ( aExp == 0x7FF ) {
3403 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3404 return propagateFloat64NaN( a, b STATUS_VAR );
3406 float_raise( float_flag_invalid STATUS_VAR);
3407 return float64_default_nan;
3409 if ( bExp == 0x7FF ) {
3410 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3411 return a;
3413 if ( bExp == 0 ) {
3414 if ( bSig == 0 ) {
3415 float_raise( float_flag_invalid STATUS_VAR);
3416 return float64_default_nan;
3418 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3420 if ( aExp == 0 ) {
3421 if ( aSig == 0 ) return a;
3422 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3424 expDiff = aExp - bExp;
3425 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3426 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3427 if ( expDiff < 0 ) {
3428 if ( expDiff < -1 ) return a;
3429 aSig >>= 1;
3431 q = ( bSig <= aSig );
3432 if ( q ) aSig -= bSig;
3433 expDiff -= 64;
3434 while ( 0 < expDiff ) {
3435 q = estimateDiv128To64( aSig, 0, bSig );
3436 q = ( 2 < q ) ? q - 2 : 0;
3437 aSig = - ( ( bSig>>2 ) * q );
3438 expDiff -= 62;
3440 expDiff += 64;
3441 if ( 0 < expDiff ) {
3442 q = estimateDiv128To64( aSig, 0, bSig );
3443 q = ( 2 < q ) ? q - 2 : 0;
3444 q >>= 64 - expDiff;
3445 bSig >>= 2;
3446 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3448 else {
3449 aSig >>= 2;
3450 bSig >>= 2;
3452 do {
3453 alternateASig = aSig;
3454 ++q;
3455 aSig -= bSig;
3456 } while ( 0 <= (int64_t) aSig );
3457 sigMean = aSig + alternateASig;
3458 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3459 aSig = alternateASig;
3461 zSign = ( (int64_t) aSig < 0 );
3462 if ( zSign ) aSig = - aSig;
3463 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3467 /*----------------------------------------------------------------------------
3468 | Returns the square root of the double-precision floating-point value `a'.
3469 | The operation is performed according to the IEC/IEEE Standard for Binary
3470 | Floating-Point Arithmetic.
3471 *----------------------------------------------------------------------------*/
3473 float64 float64_sqrt( float64 a STATUS_PARAM )
3475 flag aSign;
3476 int16 aExp, zExp;
3477 uint64_t aSig, zSig, doubleZSig;
3478 uint64_t rem0, rem1, term0, term1;
3479 a = float64_squash_input_denormal(a STATUS_VAR);
3481 aSig = extractFloat64Frac( a );
3482 aExp = extractFloat64Exp( a );
3483 aSign = extractFloat64Sign( a );
3484 if ( aExp == 0x7FF ) {
3485 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3486 if ( ! aSign ) return a;
3487 float_raise( float_flag_invalid STATUS_VAR);
3488 return float64_default_nan;
3490 if ( aSign ) {
3491 if ( ( aExp | aSig ) == 0 ) return a;
3492 float_raise( float_flag_invalid STATUS_VAR);
3493 return float64_default_nan;
3495 if ( aExp == 0 ) {
3496 if ( aSig == 0 ) return float64_zero;
3497 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3499 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3500 aSig |= LIT64( 0x0010000000000000 );
3501 zSig = estimateSqrt32( aExp, aSig>>21 );
3502 aSig <<= 9 - ( aExp & 1 );
3503 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3504 if ( ( zSig & 0x1FF ) <= 5 ) {
3505 doubleZSig = zSig<<1;
3506 mul64To128( zSig, zSig, &term0, &term1 );
3507 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3508 while ( (int64_t) rem0 < 0 ) {
3509 --zSig;
3510 doubleZSig -= 2;
3511 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3513 zSig |= ( ( rem0 | rem1 ) != 0 );
3515 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3519 /*----------------------------------------------------------------------------
3520 | Returns the binary log of the double-precision floating-point value `a'.
3521 | The operation is performed according to the IEC/IEEE Standard for Binary
3522 | Floating-Point Arithmetic.
3523 *----------------------------------------------------------------------------*/
3524 float64 float64_log2( float64 a STATUS_PARAM )
3526 flag aSign, zSign;
3527 int16 aExp;
3528 uint64_t aSig, aSig0, aSig1, zSig, i;
3529 a = float64_squash_input_denormal(a STATUS_VAR);
3531 aSig = extractFloat64Frac( a );
3532 aExp = extractFloat64Exp( a );
3533 aSign = extractFloat64Sign( a );
3535 if ( aExp == 0 ) {
3536 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3537 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3539 if ( aSign ) {
3540 float_raise( float_flag_invalid STATUS_VAR);
3541 return float64_default_nan;
3543 if ( aExp == 0x7FF ) {
3544 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3545 return a;
3548 aExp -= 0x3FF;
3549 aSig |= LIT64( 0x0010000000000000 );
3550 zSign = aExp < 0;
3551 zSig = (uint64_t)aExp << 52;
3552 for (i = 1LL << 51; i > 0; i >>= 1) {
3553 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3554 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3555 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3556 aSig >>= 1;
3557 zSig |= i;
3561 if ( zSign )
3562 zSig = -zSig;
3563 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3566 /*----------------------------------------------------------------------------
3567 | Returns 1 if the double-precision floating-point value `a' is equal to the
3568 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3569 | if either operand is a NaN. Otherwise, the comparison is performed
3570 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3571 *----------------------------------------------------------------------------*/
3573 int float64_eq( float64 a, float64 b STATUS_PARAM )
3575 uint64_t av, bv;
3576 a = float64_squash_input_denormal(a STATUS_VAR);
3577 b = float64_squash_input_denormal(b STATUS_VAR);
3579 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3580 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3582 float_raise( float_flag_invalid STATUS_VAR);
3583 return 0;
3585 av = float64_val(a);
3586 bv = float64_val(b);
3587 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3591 /*----------------------------------------------------------------------------
3592 | Returns 1 if the double-precision floating-point value `a' is less than or
3593 | equal to the corresponding value `b', and 0 otherwise. The invalid
3594 | exception is raised if either operand is a NaN. The comparison is performed
3595 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3596 *----------------------------------------------------------------------------*/
3598 int float64_le( float64 a, float64 b STATUS_PARAM )
3600 flag aSign, bSign;
3601 uint64_t av, bv;
3602 a = float64_squash_input_denormal(a STATUS_VAR);
3603 b = float64_squash_input_denormal(b STATUS_VAR);
3605 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3606 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3608 float_raise( float_flag_invalid STATUS_VAR);
3609 return 0;
3611 aSign = extractFloat64Sign( a );
3612 bSign = extractFloat64Sign( b );
3613 av = float64_val(a);
3614 bv = float64_val(b);
3615 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3616 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3620 /*----------------------------------------------------------------------------
3621 | Returns 1 if the double-precision floating-point value `a' is less than
3622 | the corresponding value `b', and 0 otherwise. The invalid exception is
3623 | raised if either operand is a NaN. The comparison is performed according
3624 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3625 *----------------------------------------------------------------------------*/
3627 int float64_lt( float64 a, float64 b STATUS_PARAM )
3629 flag aSign, bSign;
3630 uint64_t av, bv;
3632 a = float64_squash_input_denormal(a STATUS_VAR);
3633 b = float64_squash_input_denormal(b STATUS_VAR);
3634 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3635 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3637 float_raise( float_flag_invalid STATUS_VAR);
3638 return 0;
3640 aSign = extractFloat64Sign( a );
3641 bSign = extractFloat64Sign( b );
3642 av = float64_val(a);
3643 bv = float64_val(b);
3644 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3645 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3649 /*----------------------------------------------------------------------------
3650 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3651 | be compared, and 0 otherwise. The invalid exception is raised if either
3652 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3653 | Standard for Binary Floating-Point Arithmetic.
3654 *----------------------------------------------------------------------------*/
3656 int float64_unordered( float64 a, float64 b STATUS_PARAM )
3658 a = float64_squash_input_denormal(a STATUS_VAR);
3659 b = float64_squash_input_denormal(b STATUS_VAR);
3661 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3662 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3664 float_raise( float_flag_invalid STATUS_VAR);
3665 return 1;
3667 return 0;
3670 /*----------------------------------------------------------------------------
3671 | Returns 1 if the double-precision floating-point value `a' is equal to the
3672 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3673 | exception.The comparison is performed according to the IEC/IEEE Standard
3674 | for Binary Floating-Point Arithmetic.
3675 *----------------------------------------------------------------------------*/
3677 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
3679 uint64_t av, bv;
3680 a = float64_squash_input_denormal(a STATUS_VAR);
3681 b = float64_squash_input_denormal(b STATUS_VAR);
3683 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3684 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3686 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3687 float_raise( float_flag_invalid STATUS_VAR);
3689 return 0;
3691 av = float64_val(a);
3692 bv = float64_val(b);
3693 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3697 /*----------------------------------------------------------------------------
3698 | Returns 1 if the double-precision floating-point value `a' is less than or
3699 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3700 | cause an exception. Otherwise, the comparison is performed according to the
3701 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3702 *----------------------------------------------------------------------------*/
3704 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3706 flag aSign, bSign;
3707 uint64_t av, bv;
3708 a = float64_squash_input_denormal(a STATUS_VAR);
3709 b = float64_squash_input_denormal(b STATUS_VAR);
3711 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3712 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3714 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3715 float_raise( float_flag_invalid STATUS_VAR);
3717 return 0;
3719 aSign = extractFloat64Sign( a );
3720 bSign = extractFloat64Sign( b );
3721 av = float64_val(a);
3722 bv = float64_val(b);
3723 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3724 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3728 /*----------------------------------------------------------------------------
3729 | Returns 1 if the double-precision floating-point value `a' is less than
3730 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3731 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3732 | Standard for Binary Floating-Point Arithmetic.
3733 *----------------------------------------------------------------------------*/
3735 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3737 flag aSign, bSign;
3738 uint64_t av, bv;
3739 a = float64_squash_input_denormal(a STATUS_VAR);
3740 b = float64_squash_input_denormal(b STATUS_VAR);
3742 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3743 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3745 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3746 float_raise( float_flag_invalid STATUS_VAR);
3748 return 0;
3750 aSign = extractFloat64Sign( a );
3751 bSign = extractFloat64Sign( b );
3752 av = float64_val(a);
3753 bv = float64_val(b);
3754 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3755 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3759 /*----------------------------------------------------------------------------
3760 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3761 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3762 | comparison is performed according to the IEC/IEEE Standard for Binary
3763 | Floating-Point Arithmetic.
3764 *----------------------------------------------------------------------------*/
3766 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3768 a = float64_squash_input_denormal(a STATUS_VAR);
3769 b = float64_squash_input_denormal(b STATUS_VAR);
3771 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3772 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3774 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3775 float_raise( float_flag_invalid STATUS_VAR);
3777 return 1;
3779 return 0;
3782 /*----------------------------------------------------------------------------
3783 | Returns the result of converting the extended double-precision floating-
3784 | point value `a' to the 32-bit two's complement integer format. The
3785 | conversion is performed according to the IEC/IEEE Standard for Binary
3786 | Floating-Point Arithmetic---which means in particular that the conversion
3787 | is rounded according to the current rounding mode. If `a' is a NaN, the
3788 | largest positive integer is returned. Otherwise, if the conversion
3789 | overflows, the largest integer with the same sign as `a' is returned.
3790 *----------------------------------------------------------------------------*/
3792 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3794 flag aSign;
3795 int32 aExp, shiftCount;
3796 uint64_t aSig;
3798 aSig = extractFloatx80Frac( a );
3799 aExp = extractFloatx80Exp( a );
3800 aSign = extractFloatx80Sign( a );
3801 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3802 shiftCount = 0x4037 - aExp;
3803 if ( shiftCount <= 0 ) shiftCount = 1;
3804 shift64RightJamming( aSig, shiftCount, &aSig );
3805 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3809 /*----------------------------------------------------------------------------
3810 | Returns the result of converting the extended double-precision floating-
3811 | point value `a' to the 32-bit two's complement integer format. The
3812 | conversion is performed according to the IEC/IEEE Standard for Binary
3813 | Floating-Point Arithmetic, except that the conversion is always rounded
3814 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3815 | Otherwise, if the conversion overflows, the largest integer with the same
3816 | sign as `a' is returned.
3817 *----------------------------------------------------------------------------*/
3819 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3821 flag aSign;
3822 int32 aExp, shiftCount;
3823 uint64_t aSig, savedASig;
3824 int32 z;
3826 aSig = extractFloatx80Frac( a );
3827 aExp = extractFloatx80Exp( a );
3828 aSign = extractFloatx80Sign( a );
3829 if ( 0x401E < aExp ) {
3830 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3831 goto invalid;
3833 else if ( aExp < 0x3FFF ) {
3834 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3835 return 0;
3837 shiftCount = 0x403E - aExp;
3838 savedASig = aSig;
3839 aSig >>= shiftCount;
3840 z = aSig;
3841 if ( aSign ) z = - z;
3842 if ( ( z < 0 ) ^ aSign ) {
3843 invalid:
3844 float_raise( float_flag_invalid STATUS_VAR);
3845 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3847 if ( ( aSig<<shiftCount ) != savedASig ) {
3848 STATUS(float_exception_flags) |= float_flag_inexact;
3850 return z;
3854 /*----------------------------------------------------------------------------
3855 | Returns the result of converting the extended double-precision floating-
3856 | point value `a' to the 64-bit two's complement integer format. The
3857 | conversion is performed according to the IEC/IEEE Standard for Binary
3858 | Floating-Point Arithmetic---which means in particular that the conversion
3859 | is rounded according to the current rounding mode. If `a' is a NaN,
3860 | the largest positive integer is returned. Otherwise, if the conversion
3861 | overflows, the largest integer with the same sign as `a' is returned.
3862 *----------------------------------------------------------------------------*/
3864 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3866 flag aSign;
3867 int32 aExp, shiftCount;
3868 uint64_t aSig, aSigExtra;
3870 aSig = extractFloatx80Frac( a );
3871 aExp = extractFloatx80Exp( a );
3872 aSign = extractFloatx80Sign( a );
3873 shiftCount = 0x403E - aExp;
3874 if ( shiftCount <= 0 ) {
3875 if ( shiftCount ) {
3876 float_raise( float_flag_invalid STATUS_VAR);
3877 if ( ! aSign
3878 || ( ( aExp == 0x7FFF )
3879 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3881 return LIT64( 0x7FFFFFFFFFFFFFFF );
3883 return (int64_t) LIT64( 0x8000000000000000 );
3885 aSigExtra = 0;
3887 else {
3888 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3890 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3894 /*----------------------------------------------------------------------------
3895 | Returns the result of converting the extended double-precision floating-
3896 | point value `a' to the 64-bit two's complement integer format. The
3897 | conversion is performed according to the IEC/IEEE Standard for Binary
3898 | Floating-Point Arithmetic, except that the conversion is always rounded
3899 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3900 | Otherwise, if the conversion overflows, the largest integer with the same
3901 | sign as `a' is returned.
3902 *----------------------------------------------------------------------------*/
3904 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3906 flag aSign;
3907 int32 aExp, shiftCount;
3908 uint64_t aSig;
3909 int64 z;
3911 aSig = extractFloatx80Frac( a );
3912 aExp = extractFloatx80Exp( a );
3913 aSign = extractFloatx80Sign( a );
3914 shiftCount = aExp - 0x403E;
3915 if ( 0 <= shiftCount ) {
3916 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3917 if ( ( a.high != 0xC03E ) || aSig ) {
3918 float_raise( float_flag_invalid STATUS_VAR);
3919 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3920 return LIT64( 0x7FFFFFFFFFFFFFFF );
3923 return (int64_t) LIT64( 0x8000000000000000 );
3925 else if ( aExp < 0x3FFF ) {
3926 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3927 return 0;
3929 z = aSig>>( - shiftCount );
3930 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3931 STATUS(float_exception_flags) |= float_flag_inexact;
3933 if ( aSign ) z = - z;
3934 return z;
3938 /*----------------------------------------------------------------------------
3939 | Returns the result of converting the extended double-precision floating-
3940 | point value `a' to the single-precision floating-point format. The
3941 | conversion is performed according to the IEC/IEEE Standard for Binary
3942 | Floating-Point Arithmetic.
3943 *----------------------------------------------------------------------------*/
3945 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3947 flag aSign;
3948 int32 aExp;
3949 uint64_t aSig;
3951 aSig = extractFloatx80Frac( a );
3952 aExp = extractFloatx80Exp( a );
3953 aSign = extractFloatx80Sign( a );
3954 if ( aExp == 0x7FFF ) {
3955 if ( (uint64_t) ( aSig<<1 ) ) {
3956 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3958 return packFloat32( aSign, 0xFF, 0 );
3960 shift64RightJamming( aSig, 33, &aSig );
3961 if ( aExp || aSig ) aExp -= 0x3F81;
3962 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3966 /*----------------------------------------------------------------------------
3967 | Returns the result of converting the extended double-precision floating-
3968 | point value `a' to the double-precision floating-point format. The
3969 | conversion is performed according to the IEC/IEEE Standard for Binary
3970 | Floating-Point Arithmetic.
3971 *----------------------------------------------------------------------------*/
3973 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3975 flag aSign;
3976 int32 aExp;
3977 uint64_t aSig, zSig;
3979 aSig = extractFloatx80Frac( a );
3980 aExp = extractFloatx80Exp( a );
3981 aSign = extractFloatx80Sign( a );
3982 if ( aExp == 0x7FFF ) {
3983 if ( (uint64_t) ( aSig<<1 ) ) {
3984 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3986 return packFloat64( aSign, 0x7FF, 0 );
3988 shift64RightJamming( aSig, 1, &zSig );
3989 if ( aExp || aSig ) aExp -= 0x3C01;
3990 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3994 /*----------------------------------------------------------------------------
3995 | Returns the result of converting the extended double-precision floating-
3996 | point value `a' to the quadruple-precision floating-point format. The
3997 | conversion is performed according to the IEC/IEEE Standard for Binary
3998 | Floating-Point Arithmetic.
3999 *----------------------------------------------------------------------------*/
4001 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4003 flag aSign;
4004 int16 aExp;
4005 uint64_t aSig, zSig0, zSig1;
4007 aSig = extractFloatx80Frac( a );
4008 aExp = extractFloatx80Exp( a );
4009 aSign = extractFloatx80Sign( a );
4010 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4011 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4013 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4014 return packFloat128( aSign, aExp, zSig0, zSig1 );
4018 /*----------------------------------------------------------------------------
4019 | Rounds the extended double-precision floating-point value `a' to an integer,
4020 | and returns the result as an extended quadruple-precision floating-point
4021 | value. The operation is performed according to the IEC/IEEE Standard for
4022 | Binary Floating-Point Arithmetic.
4023 *----------------------------------------------------------------------------*/
4025 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4027 flag aSign;
4028 int32 aExp;
4029 uint64_t lastBitMask, roundBitsMask;
4030 int8 roundingMode;
4031 floatx80 z;
4033 aExp = extractFloatx80Exp( a );
4034 if ( 0x403E <= aExp ) {
4035 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4036 return propagateFloatx80NaN( a, a STATUS_VAR );
4038 return a;
4040 if ( aExp < 0x3FFF ) {
4041 if ( ( aExp == 0 )
4042 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4043 return a;
4045 STATUS(float_exception_flags) |= float_flag_inexact;
4046 aSign = extractFloatx80Sign( a );
4047 switch ( STATUS(float_rounding_mode) ) {
4048 case float_round_nearest_even:
4049 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4051 return
4052 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4054 break;
4055 case float_round_down:
4056 return
4057 aSign ?
4058 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4059 : packFloatx80( 0, 0, 0 );
4060 case float_round_up:
4061 return
4062 aSign ? packFloatx80( 1, 0, 0 )
4063 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4065 return packFloatx80( aSign, 0, 0 );
4067 lastBitMask = 1;
4068 lastBitMask <<= 0x403E - aExp;
4069 roundBitsMask = lastBitMask - 1;
4070 z = a;
4071 roundingMode = STATUS(float_rounding_mode);
4072 if ( roundingMode == float_round_nearest_even ) {
4073 z.low += lastBitMask>>1;
4074 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4076 else if ( roundingMode != float_round_to_zero ) {
4077 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4078 z.low += roundBitsMask;
4081 z.low &= ~ roundBitsMask;
4082 if ( z.low == 0 ) {
4083 ++z.high;
4084 z.low = LIT64( 0x8000000000000000 );
4086 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4087 return z;
4091 /*----------------------------------------------------------------------------
4092 | Returns the result of adding the absolute values of the extended double-
4093 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4094 | negated before being returned. `zSign' is ignored if the result is a NaN.
4095 | The addition is performed according to the IEC/IEEE Standard for Binary
4096 | Floating-Point Arithmetic.
4097 *----------------------------------------------------------------------------*/
4099 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4101 int32 aExp, bExp, zExp;
4102 uint64_t aSig, bSig, zSig0, zSig1;
4103 int32 expDiff;
4105 aSig = extractFloatx80Frac( a );
4106 aExp = extractFloatx80Exp( a );
4107 bSig = extractFloatx80Frac( b );
4108 bExp = extractFloatx80Exp( b );
4109 expDiff = aExp - bExp;
4110 if ( 0 < expDiff ) {
4111 if ( aExp == 0x7FFF ) {
4112 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4113 return a;
4115 if ( bExp == 0 ) --expDiff;
4116 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4117 zExp = aExp;
4119 else if ( expDiff < 0 ) {
4120 if ( bExp == 0x7FFF ) {
4121 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4122 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4124 if ( aExp == 0 ) ++expDiff;
4125 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4126 zExp = bExp;
4128 else {
4129 if ( aExp == 0x7FFF ) {
4130 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4131 return propagateFloatx80NaN( a, b STATUS_VAR );
4133 return a;
4135 zSig1 = 0;
4136 zSig0 = aSig + bSig;
4137 if ( aExp == 0 ) {
4138 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4139 goto roundAndPack;
4141 zExp = aExp;
4142 goto shiftRight1;
4144 zSig0 = aSig + bSig;
4145 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4146 shiftRight1:
4147 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4148 zSig0 |= LIT64( 0x8000000000000000 );
4149 ++zExp;
4150 roundAndPack:
4151 return
4152 roundAndPackFloatx80(
4153 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4157 /*----------------------------------------------------------------------------
4158 | Returns the result of subtracting the absolute values of the extended
4159 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4160 | difference is negated before being returned. `zSign' is ignored if the
4161 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4162 | Standard for Binary Floating-Point Arithmetic.
4163 *----------------------------------------------------------------------------*/
4165 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4167 int32 aExp, bExp, zExp;
4168 uint64_t aSig, bSig, zSig0, zSig1;
4169 int32 expDiff;
4170 floatx80 z;
4172 aSig = extractFloatx80Frac( a );
4173 aExp = extractFloatx80Exp( a );
4174 bSig = extractFloatx80Frac( b );
4175 bExp = extractFloatx80Exp( b );
4176 expDiff = aExp - bExp;
4177 if ( 0 < expDiff ) goto aExpBigger;
4178 if ( expDiff < 0 ) goto bExpBigger;
4179 if ( aExp == 0x7FFF ) {
4180 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4181 return propagateFloatx80NaN( a, b STATUS_VAR );
4183 float_raise( float_flag_invalid STATUS_VAR);
4184 z.low = floatx80_default_nan_low;
4185 z.high = floatx80_default_nan_high;
4186 return z;
4188 if ( aExp == 0 ) {
4189 aExp = 1;
4190 bExp = 1;
4192 zSig1 = 0;
4193 if ( bSig < aSig ) goto aBigger;
4194 if ( aSig < bSig ) goto bBigger;
4195 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4196 bExpBigger:
4197 if ( bExp == 0x7FFF ) {
4198 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4199 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4201 if ( aExp == 0 ) ++expDiff;
4202 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4203 bBigger:
4204 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4205 zExp = bExp;
4206 zSign ^= 1;
4207 goto normalizeRoundAndPack;
4208 aExpBigger:
4209 if ( aExp == 0x7FFF ) {
4210 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4211 return a;
4213 if ( bExp == 0 ) --expDiff;
4214 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4215 aBigger:
4216 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4217 zExp = aExp;
4218 normalizeRoundAndPack:
4219 return
4220 normalizeRoundAndPackFloatx80(
4221 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4225 /*----------------------------------------------------------------------------
4226 | Returns the result of adding the extended double-precision floating-point
4227 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4228 | Standard for Binary Floating-Point Arithmetic.
4229 *----------------------------------------------------------------------------*/
4231 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4233 flag aSign, bSign;
4235 aSign = extractFloatx80Sign( a );
4236 bSign = extractFloatx80Sign( b );
4237 if ( aSign == bSign ) {
4238 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4240 else {
4241 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4246 /*----------------------------------------------------------------------------
4247 | Returns the result of subtracting the extended double-precision floating-
4248 | point values `a' and `b'. The operation is performed according to the
4249 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4250 *----------------------------------------------------------------------------*/
4252 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4254 flag aSign, bSign;
4256 aSign = extractFloatx80Sign( a );
4257 bSign = extractFloatx80Sign( b );
4258 if ( aSign == bSign ) {
4259 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4261 else {
4262 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4267 /*----------------------------------------------------------------------------
4268 | Returns the result of multiplying the extended double-precision floating-
4269 | point values `a' and `b'. The operation is performed according to the
4270 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4271 *----------------------------------------------------------------------------*/
4273 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4275 flag aSign, bSign, zSign;
4276 int32 aExp, bExp, zExp;
4277 uint64_t aSig, bSig, zSig0, zSig1;
4278 floatx80 z;
4280 aSig = extractFloatx80Frac( a );
4281 aExp = extractFloatx80Exp( a );
4282 aSign = extractFloatx80Sign( a );
4283 bSig = extractFloatx80Frac( b );
4284 bExp = extractFloatx80Exp( b );
4285 bSign = extractFloatx80Sign( b );
4286 zSign = aSign ^ bSign;
4287 if ( aExp == 0x7FFF ) {
4288 if ( (uint64_t) ( aSig<<1 )
4289 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4290 return propagateFloatx80NaN( a, b STATUS_VAR );
4292 if ( ( bExp | bSig ) == 0 ) goto invalid;
4293 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4295 if ( bExp == 0x7FFF ) {
4296 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4297 if ( ( aExp | aSig ) == 0 ) {
4298 invalid:
4299 float_raise( float_flag_invalid STATUS_VAR);
4300 z.low = floatx80_default_nan_low;
4301 z.high = floatx80_default_nan_high;
4302 return z;
4304 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4306 if ( aExp == 0 ) {
4307 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4308 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4310 if ( bExp == 0 ) {
4311 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4312 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4314 zExp = aExp + bExp - 0x3FFE;
4315 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4316 if ( 0 < (int64_t) zSig0 ) {
4317 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4318 --zExp;
4320 return
4321 roundAndPackFloatx80(
4322 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4326 /*----------------------------------------------------------------------------
4327 | Returns the result of dividing the extended double-precision floating-point
4328 | value `a' by the corresponding value `b'. The operation is performed
4329 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4330 *----------------------------------------------------------------------------*/
4332 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4334 flag aSign, bSign, zSign;
4335 int32 aExp, bExp, zExp;
4336 uint64_t aSig, bSig, zSig0, zSig1;
4337 uint64_t rem0, rem1, rem2, term0, term1, term2;
4338 floatx80 z;
4340 aSig = extractFloatx80Frac( a );
4341 aExp = extractFloatx80Exp( a );
4342 aSign = extractFloatx80Sign( a );
4343 bSig = extractFloatx80Frac( b );
4344 bExp = extractFloatx80Exp( b );
4345 bSign = extractFloatx80Sign( b );
4346 zSign = aSign ^ bSign;
4347 if ( aExp == 0x7FFF ) {
4348 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4349 if ( bExp == 0x7FFF ) {
4350 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4351 goto invalid;
4353 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4355 if ( bExp == 0x7FFF ) {
4356 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4357 return packFloatx80( zSign, 0, 0 );
4359 if ( bExp == 0 ) {
4360 if ( bSig == 0 ) {
4361 if ( ( aExp | aSig ) == 0 ) {
4362 invalid:
4363 float_raise( float_flag_invalid STATUS_VAR);
4364 z.low = floatx80_default_nan_low;
4365 z.high = floatx80_default_nan_high;
4366 return z;
4368 float_raise( float_flag_divbyzero STATUS_VAR);
4369 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4371 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4373 if ( aExp == 0 ) {
4374 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4375 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4377 zExp = aExp - bExp + 0x3FFE;
4378 rem1 = 0;
4379 if ( bSig <= aSig ) {
4380 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4381 ++zExp;
4383 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4384 mul64To128( bSig, zSig0, &term0, &term1 );
4385 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4386 while ( (int64_t) rem0 < 0 ) {
4387 --zSig0;
4388 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4390 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4391 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4392 mul64To128( bSig, zSig1, &term1, &term2 );
4393 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4394 while ( (int64_t) rem1 < 0 ) {
4395 --zSig1;
4396 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4398 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4400 return
4401 roundAndPackFloatx80(
4402 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4406 /*----------------------------------------------------------------------------
4407 | Returns the remainder of the extended double-precision floating-point value
4408 | `a' with respect to the corresponding value `b'. The operation is performed
4409 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4410 *----------------------------------------------------------------------------*/
4412 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4414 flag aSign, zSign;
4415 int32 aExp, bExp, expDiff;
4416 uint64_t aSig0, aSig1, bSig;
4417 uint64_t q, term0, term1, alternateASig0, alternateASig1;
4418 floatx80 z;
4420 aSig0 = extractFloatx80Frac( a );
4421 aExp = extractFloatx80Exp( a );
4422 aSign = extractFloatx80Sign( a );
4423 bSig = extractFloatx80Frac( b );
4424 bExp = extractFloatx80Exp( b );
4425 if ( aExp == 0x7FFF ) {
4426 if ( (uint64_t) ( aSig0<<1 )
4427 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4428 return propagateFloatx80NaN( a, b STATUS_VAR );
4430 goto invalid;
4432 if ( bExp == 0x7FFF ) {
4433 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4434 return a;
4436 if ( bExp == 0 ) {
4437 if ( bSig == 0 ) {
4438 invalid:
4439 float_raise( float_flag_invalid STATUS_VAR);
4440 z.low = floatx80_default_nan_low;
4441 z.high = floatx80_default_nan_high;
4442 return z;
4444 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4446 if ( aExp == 0 ) {
4447 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4448 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4450 bSig |= LIT64( 0x8000000000000000 );
4451 zSign = aSign;
4452 expDiff = aExp - bExp;
4453 aSig1 = 0;
4454 if ( expDiff < 0 ) {
4455 if ( expDiff < -1 ) return a;
4456 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4457 expDiff = 0;
4459 q = ( bSig <= aSig0 );
4460 if ( q ) aSig0 -= bSig;
4461 expDiff -= 64;
4462 while ( 0 < expDiff ) {
4463 q = estimateDiv128To64( aSig0, aSig1, bSig );
4464 q = ( 2 < q ) ? q - 2 : 0;
4465 mul64To128( bSig, q, &term0, &term1 );
4466 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4467 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4468 expDiff -= 62;
4470 expDiff += 64;
4471 if ( 0 < expDiff ) {
4472 q = estimateDiv128To64( aSig0, aSig1, bSig );
4473 q = ( 2 < q ) ? q - 2 : 0;
4474 q >>= 64 - expDiff;
4475 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4476 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4477 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4478 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4479 ++q;
4480 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4483 else {
4484 term1 = 0;
4485 term0 = bSig;
4487 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4488 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4489 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4490 && ( q & 1 ) )
4492 aSig0 = alternateASig0;
4493 aSig1 = alternateASig1;
4494 zSign = ! zSign;
4496 return
4497 normalizeRoundAndPackFloatx80(
4498 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4502 /*----------------------------------------------------------------------------
4503 | Returns the square root of the extended double-precision floating-point
4504 | value `a'. The operation is performed according to the IEC/IEEE Standard
4505 | for Binary Floating-Point Arithmetic.
4506 *----------------------------------------------------------------------------*/
4508 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4510 flag aSign;
4511 int32 aExp, zExp;
4512 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4513 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4514 floatx80 z;
4516 aSig0 = extractFloatx80Frac( a );
4517 aExp = extractFloatx80Exp( a );
4518 aSign = extractFloatx80Sign( a );
4519 if ( aExp == 0x7FFF ) {
4520 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4521 if ( ! aSign ) return a;
4522 goto invalid;
4524 if ( aSign ) {
4525 if ( ( aExp | aSig0 ) == 0 ) return a;
4526 invalid:
4527 float_raise( float_flag_invalid STATUS_VAR);
4528 z.low = floatx80_default_nan_low;
4529 z.high = floatx80_default_nan_high;
4530 return z;
4532 if ( aExp == 0 ) {
4533 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4534 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4536 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4537 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4538 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4539 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4540 doubleZSig0 = zSig0<<1;
4541 mul64To128( zSig0, zSig0, &term0, &term1 );
4542 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4543 while ( (int64_t) rem0 < 0 ) {
4544 --zSig0;
4545 doubleZSig0 -= 2;
4546 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4548 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4549 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4550 if ( zSig1 == 0 ) zSig1 = 1;
4551 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4552 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4553 mul64To128( zSig1, zSig1, &term2, &term3 );
4554 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4555 while ( (int64_t) rem1 < 0 ) {
4556 --zSig1;
4557 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4558 term3 |= 1;
4559 term2 |= doubleZSig0;
4560 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4562 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4564 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4565 zSig0 |= doubleZSig0;
4566 return
4567 roundAndPackFloatx80(
4568 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4572 /*----------------------------------------------------------------------------
4573 | Returns 1 if the extended double-precision floating-point value `a' is equal
4574 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4575 | raised if either operand is a NaN. Otherwise, the comparison is performed
4576 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4577 *----------------------------------------------------------------------------*/
4579 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4582 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4583 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4584 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4585 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4587 float_raise( float_flag_invalid STATUS_VAR);
4588 return 0;
4590 return
4591 ( a.low == b.low )
4592 && ( ( a.high == b.high )
4593 || ( ( a.low == 0 )
4594 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4599 /*----------------------------------------------------------------------------
4600 | Returns 1 if the extended double-precision floating-point value `a' is
4601 | less than or equal to the corresponding value `b', and 0 otherwise. The
4602 | invalid exception is raised if either operand is a NaN. The comparison is
4603 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4604 | Arithmetic.
4605 *----------------------------------------------------------------------------*/
4607 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4609 flag aSign, bSign;
4611 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4612 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4613 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4614 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4616 float_raise( float_flag_invalid STATUS_VAR);
4617 return 0;
4619 aSign = extractFloatx80Sign( a );
4620 bSign = extractFloatx80Sign( b );
4621 if ( aSign != bSign ) {
4622 return
4623 aSign
4624 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4625 == 0 );
4627 return
4628 aSign ? le128( b.high, b.low, a.high, a.low )
4629 : le128( a.high, a.low, b.high, b.low );
4633 /*----------------------------------------------------------------------------
4634 | Returns 1 if the extended double-precision floating-point value `a' is
4635 | less than the corresponding value `b', and 0 otherwise. The invalid
4636 | exception is raised if either operand is a NaN. The comparison is performed
4637 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4638 *----------------------------------------------------------------------------*/
4640 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4642 flag aSign, bSign;
4644 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4645 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4646 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4647 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4649 float_raise( float_flag_invalid STATUS_VAR);
4650 return 0;
4652 aSign = extractFloatx80Sign( a );
4653 bSign = extractFloatx80Sign( b );
4654 if ( aSign != bSign ) {
4655 return
4656 aSign
4657 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4658 != 0 );
4660 return
4661 aSign ? lt128( b.high, b.low, a.high, a.low )
4662 : lt128( a.high, a.low, b.high, b.low );
4666 /*----------------------------------------------------------------------------
4667 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4668 | cannot be compared, and 0 otherwise. The invalid exception is raised if
4669 | either operand is a NaN. The comparison is performed according to the
4670 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4671 *----------------------------------------------------------------------------*/
4672 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4674 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4675 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4676 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4677 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4679 float_raise( float_flag_invalid STATUS_VAR);
4680 return 1;
4682 return 0;
4685 /*----------------------------------------------------------------------------
4686 | Returns 1 if the extended double-precision floating-point value `a' is
4687 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4688 | cause an exception. The comparison is performed according to the IEC/IEEE
4689 | Standard for Binary Floating-Point Arithmetic.
4690 *----------------------------------------------------------------------------*/
4692 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4695 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4696 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4697 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4698 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4700 if ( floatx80_is_signaling_nan( a )
4701 || floatx80_is_signaling_nan( b ) ) {
4702 float_raise( float_flag_invalid STATUS_VAR);
4704 return 0;
4706 return
4707 ( a.low == b.low )
4708 && ( ( a.high == b.high )
4709 || ( ( a.low == 0 )
4710 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4715 /*----------------------------------------------------------------------------
4716 | Returns 1 if the extended double-precision floating-point value `a' is less
4717 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4718 | do not cause an exception. Otherwise, the comparison is performed according
4719 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4720 *----------------------------------------------------------------------------*/
4722 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4724 flag aSign, bSign;
4726 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4727 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4728 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4729 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4731 if ( floatx80_is_signaling_nan( a )
4732 || floatx80_is_signaling_nan( b ) ) {
4733 float_raise( float_flag_invalid STATUS_VAR);
4735 return 0;
4737 aSign = extractFloatx80Sign( a );
4738 bSign = extractFloatx80Sign( b );
4739 if ( aSign != bSign ) {
4740 return
4741 aSign
4742 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4743 == 0 );
4745 return
4746 aSign ? le128( b.high, b.low, a.high, a.low )
4747 : le128( a.high, a.low, b.high, b.low );
4751 /*----------------------------------------------------------------------------
4752 | Returns 1 if the extended double-precision floating-point value `a' is less
4753 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4754 | an exception. Otherwise, the comparison is performed according to the
4755 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4756 *----------------------------------------------------------------------------*/
4758 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4760 flag aSign, bSign;
4762 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4763 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4764 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4765 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4767 if ( floatx80_is_signaling_nan( a )
4768 || floatx80_is_signaling_nan( b ) ) {
4769 float_raise( float_flag_invalid STATUS_VAR);
4771 return 0;
4773 aSign = extractFloatx80Sign( a );
4774 bSign = extractFloatx80Sign( b );
4775 if ( aSign != bSign ) {
4776 return
4777 aSign
4778 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4779 != 0 );
4781 return
4782 aSign ? lt128( b.high, b.low, a.high, a.low )
4783 : lt128( a.high, a.low, b.high, b.low );
4787 /*----------------------------------------------------------------------------
4788 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4789 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
4790 | The comparison is performed according to the IEC/IEEE Standard for Binary
4791 | Floating-Point Arithmetic.
4792 *----------------------------------------------------------------------------*/
4793 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4795 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4796 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4797 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4798 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4800 if ( floatx80_is_signaling_nan( a )
4801 || floatx80_is_signaling_nan( b ) ) {
4802 float_raise( float_flag_invalid STATUS_VAR);
4804 return 1;
4806 return 0;
4809 /*----------------------------------------------------------------------------
4810 | Returns the result of converting the quadruple-precision floating-point
4811 | value `a' to the 32-bit two's complement integer format. The conversion
4812 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4813 | Arithmetic---which means in particular that the conversion is rounded
4814 | according to the current rounding mode. If `a' is a NaN, the largest
4815 | positive integer is returned. Otherwise, if the conversion overflows, the
4816 | largest integer with the same sign as `a' is returned.
4817 *----------------------------------------------------------------------------*/
4819 int32 float128_to_int32( float128 a STATUS_PARAM )
4821 flag aSign;
4822 int32 aExp, shiftCount;
4823 uint64_t aSig0, aSig1;
4825 aSig1 = extractFloat128Frac1( a );
4826 aSig0 = extractFloat128Frac0( a );
4827 aExp = extractFloat128Exp( a );
4828 aSign = extractFloat128Sign( a );
4829 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4830 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4831 aSig0 |= ( aSig1 != 0 );
4832 shiftCount = 0x4028 - aExp;
4833 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4834 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4838 /*----------------------------------------------------------------------------
4839 | Returns the result of converting the quadruple-precision floating-point
4840 | value `a' to the 32-bit two's complement integer format. The conversion
4841 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4842 | Arithmetic, except that the conversion is always rounded toward zero. If
4843 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4844 | conversion overflows, the largest integer with the same sign as `a' is
4845 | returned.
4846 *----------------------------------------------------------------------------*/
4848 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4850 flag aSign;
4851 int32 aExp, shiftCount;
4852 uint64_t aSig0, aSig1, savedASig;
4853 int32 z;
4855 aSig1 = extractFloat128Frac1( a );
4856 aSig0 = extractFloat128Frac0( a );
4857 aExp = extractFloat128Exp( a );
4858 aSign = extractFloat128Sign( a );
4859 aSig0 |= ( aSig1 != 0 );
4860 if ( 0x401E < aExp ) {
4861 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4862 goto invalid;
4864 else if ( aExp < 0x3FFF ) {
4865 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4866 return 0;
4868 aSig0 |= LIT64( 0x0001000000000000 );
4869 shiftCount = 0x402F - aExp;
4870 savedASig = aSig0;
4871 aSig0 >>= shiftCount;
4872 z = aSig0;
4873 if ( aSign ) z = - z;
4874 if ( ( z < 0 ) ^ aSign ) {
4875 invalid:
4876 float_raise( float_flag_invalid STATUS_VAR);
4877 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4879 if ( ( aSig0<<shiftCount ) != savedASig ) {
4880 STATUS(float_exception_flags) |= float_flag_inexact;
4882 return z;
4886 /*----------------------------------------------------------------------------
4887 | Returns the result of converting the quadruple-precision floating-point
4888 | value `a' to the 64-bit two's complement integer format. The conversion
4889 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4890 | Arithmetic---which means in particular that the conversion is rounded
4891 | according to the current rounding mode. If `a' is a NaN, the largest
4892 | positive integer is returned. Otherwise, if the conversion overflows, the
4893 | largest integer with the same sign as `a' is returned.
4894 *----------------------------------------------------------------------------*/
4896 int64 float128_to_int64( float128 a STATUS_PARAM )
4898 flag aSign;
4899 int32 aExp, shiftCount;
4900 uint64_t aSig0, aSig1;
4902 aSig1 = extractFloat128Frac1( a );
4903 aSig0 = extractFloat128Frac0( a );
4904 aExp = extractFloat128Exp( a );
4905 aSign = extractFloat128Sign( a );
4906 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4907 shiftCount = 0x402F - aExp;
4908 if ( shiftCount <= 0 ) {
4909 if ( 0x403E < aExp ) {
4910 float_raise( float_flag_invalid STATUS_VAR);
4911 if ( ! aSign
4912 || ( ( aExp == 0x7FFF )
4913 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4916 return LIT64( 0x7FFFFFFFFFFFFFFF );
4918 return (int64_t) LIT64( 0x8000000000000000 );
4920 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4922 else {
4923 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4925 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4929 /*----------------------------------------------------------------------------
4930 | Returns the result of converting the quadruple-precision floating-point
4931 | value `a' to the 64-bit two's complement integer format. The conversion
4932 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4933 | Arithmetic, except that the conversion is always rounded toward zero.
4934 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4935 | the conversion overflows, the largest integer with the same sign as `a' is
4936 | returned.
4937 *----------------------------------------------------------------------------*/
4939 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4941 flag aSign;
4942 int32 aExp, shiftCount;
4943 uint64_t aSig0, aSig1;
4944 int64 z;
4946 aSig1 = extractFloat128Frac1( a );
4947 aSig0 = extractFloat128Frac0( a );
4948 aExp = extractFloat128Exp( a );
4949 aSign = extractFloat128Sign( a );
4950 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4951 shiftCount = aExp - 0x402F;
4952 if ( 0 < shiftCount ) {
4953 if ( 0x403E <= aExp ) {
4954 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4955 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4956 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4957 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4959 else {
4960 float_raise( float_flag_invalid STATUS_VAR);
4961 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4962 return LIT64( 0x7FFFFFFFFFFFFFFF );
4965 return (int64_t) LIT64( 0x8000000000000000 );
4967 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4968 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4969 STATUS(float_exception_flags) |= float_flag_inexact;
4972 else {
4973 if ( aExp < 0x3FFF ) {
4974 if ( aExp | aSig0 | aSig1 ) {
4975 STATUS(float_exception_flags) |= float_flag_inexact;
4977 return 0;
4979 z = aSig0>>( - shiftCount );
4980 if ( aSig1
4981 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4982 STATUS(float_exception_flags) |= float_flag_inexact;
4985 if ( aSign ) z = - z;
4986 return z;
4990 /*----------------------------------------------------------------------------
4991 | Returns the result of converting the quadruple-precision floating-point
4992 | value `a' to the single-precision floating-point format. The conversion
4993 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4994 | Arithmetic.
4995 *----------------------------------------------------------------------------*/
4997 float32 float128_to_float32( float128 a STATUS_PARAM )
4999 flag aSign;
5000 int32 aExp;
5001 uint64_t aSig0, aSig1;
5002 uint32_t zSig;
5004 aSig1 = extractFloat128Frac1( a );
5005 aSig0 = extractFloat128Frac0( a );
5006 aExp = extractFloat128Exp( a );
5007 aSign = extractFloat128Sign( a );
5008 if ( aExp == 0x7FFF ) {
5009 if ( aSig0 | aSig1 ) {
5010 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5012 return packFloat32( aSign, 0xFF, 0 );
5014 aSig0 |= ( aSig1 != 0 );
5015 shift64RightJamming( aSig0, 18, &aSig0 );
5016 zSig = aSig0;
5017 if ( aExp || zSig ) {
5018 zSig |= 0x40000000;
5019 aExp -= 0x3F81;
5021 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5025 /*----------------------------------------------------------------------------
5026 | Returns the result of converting the quadruple-precision floating-point
5027 | value `a' to the double-precision floating-point format. The conversion
5028 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5029 | Arithmetic.
5030 *----------------------------------------------------------------------------*/
5032 float64 float128_to_float64( float128 a STATUS_PARAM )
5034 flag aSign;
5035 int32 aExp;
5036 uint64_t aSig0, aSig1;
5038 aSig1 = extractFloat128Frac1( a );
5039 aSig0 = extractFloat128Frac0( a );
5040 aExp = extractFloat128Exp( a );
5041 aSign = extractFloat128Sign( a );
5042 if ( aExp == 0x7FFF ) {
5043 if ( aSig0 | aSig1 ) {
5044 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5046 return packFloat64( aSign, 0x7FF, 0 );
5048 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5049 aSig0 |= ( aSig1 != 0 );
5050 if ( aExp || aSig0 ) {
5051 aSig0 |= LIT64( 0x4000000000000000 );
5052 aExp -= 0x3C01;
5054 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5058 /*----------------------------------------------------------------------------
5059 | Returns the result of converting the quadruple-precision floating-point
5060 | value `a' to the extended double-precision floating-point format. The
5061 | conversion is performed according to the IEC/IEEE Standard for Binary
5062 | Floating-Point Arithmetic.
5063 *----------------------------------------------------------------------------*/
5065 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5067 flag aSign;
5068 int32 aExp;
5069 uint64_t aSig0, aSig1;
5071 aSig1 = extractFloat128Frac1( a );
5072 aSig0 = extractFloat128Frac0( a );
5073 aExp = extractFloat128Exp( a );
5074 aSign = extractFloat128Sign( a );
5075 if ( aExp == 0x7FFF ) {
5076 if ( aSig0 | aSig1 ) {
5077 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5079 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5081 if ( aExp == 0 ) {
5082 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5083 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5085 else {
5086 aSig0 |= LIT64( 0x0001000000000000 );
5088 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5089 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5093 /*----------------------------------------------------------------------------
5094 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5095 | returns the result as a quadruple-precision floating-point value. The
5096 | operation is performed according to the IEC/IEEE Standard for Binary
5097 | Floating-Point Arithmetic.
5098 *----------------------------------------------------------------------------*/
5100 float128 float128_round_to_int( float128 a STATUS_PARAM )
5102 flag aSign;
5103 int32 aExp;
5104 uint64_t lastBitMask, roundBitsMask;
5105 int8 roundingMode;
5106 float128 z;
5108 aExp = extractFloat128Exp( a );
5109 if ( 0x402F <= aExp ) {
5110 if ( 0x406F <= aExp ) {
5111 if ( ( aExp == 0x7FFF )
5112 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5114 return propagateFloat128NaN( a, a STATUS_VAR );
5116 return a;
5118 lastBitMask = 1;
5119 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5120 roundBitsMask = lastBitMask - 1;
5121 z = a;
5122 roundingMode = STATUS(float_rounding_mode);
5123 if ( roundingMode == float_round_nearest_even ) {
5124 if ( lastBitMask ) {
5125 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5126 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5128 else {
5129 if ( (int64_t) z.low < 0 ) {
5130 ++z.high;
5131 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5135 else if ( roundingMode != float_round_to_zero ) {
5136 if ( extractFloat128Sign( z )
5137 ^ ( roundingMode == float_round_up ) ) {
5138 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5141 z.low &= ~ roundBitsMask;
5143 else {
5144 if ( aExp < 0x3FFF ) {
5145 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5146 STATUS(float_exception_flags) |= float_flag_inexact;
5147 aSign = extractFloat128Sign( a );
5148 switch ( STATUS(float_rounding_mode) ) {
5149 case float_round_nearest_even:
5150 if ( ( aExp == 0x3FFE )
5151 && ( extractFloat128Frac0( a )
5152 | extractFloat128Frac1( a ) )
5154 return packFloat128( aSign, 0x3FFF, 0, 0 );
5156 break;
5157 case float_round_down:
5158 return
5159 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5160 : packFloat128( 0, 0, 0, 0 );
5161 case float_round_up:
5162 return
5163 aSign ? packFloat128( 1, 0, 0, 0 )
5164 : packFloat128( 0, 0x3FFF, 0, 0 );
5166 return packFloat128( aSign, 0, 0, 0 );
5168 lastBitMask = 1;
5169 lastBitMask <<= 0x402F - aExp;
5170 roundBitsMask = lastBitMask - 1;
5171 z.low = 0;
5172 z.high = a.high;
5173 roundingMode = STATUS(float_rounding_mode);
5174 if ( roundingMode == float_round_nearest_even ) {
5175 z.high += lastBitMask>>1;
5176 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5177 z.high &= ~ lastBitMask;
5180 else if ( roundingMode != float_round_to_zero ) {
5181 if ( extractFloat128Sign( z )
5182 ^ ( roundingMode == float_round_up ) ) {
5183 z.high |= ( a.low != 0 );
5184 z.high += roundBitsMask;
5187 z.high &= ~ roundBitsMask;
5189 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5190 STATUS(float_exception_flags) |= float_flag_inexact;
5192 return z;
5196 /*----------------------------------------------------------------------------
5197 | Returns the result of adding the absolute values of the quadruple-precision
5198 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5199 | before being returned. `zSign' is ignored if the result is a NaN.
5200 | The addition is performed according to the IEC/IEEE Standard for Binary
5201 | Floating-Point Arithmetic.
5202 *----------------------------------------------------------------------------*/
5204 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5206 int32 aExp, bExp, zExp;
5207 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5208 int32 expDiff;
5210 aSig1 = extractFloat128Frac1( a );
5211 aSig0 = extractFloat128Frac0( a );
5212 aExp = extractFloat128Exp( a );
5213 bSig1 = extractFloat128Frac1( b );
5214 bSig0 = extractFloat128Frac0( b );
5215 bExp = extractFloat128Exp( b );
5216 expDiff = aExp - bExp;
5217 if ( 0 < expDiff ) {
5218 if ( aExp == 0x7FFF ) {
5219 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5220 return a;
5222 if ( bExp == 0 ) {
5223 --expDiff;
5225 else {
5226 bSig0 |= LIT64( 0x0001000000000000 );
5228 shift128ExtraRightJamming(
5229 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5230 zExp = aExp;
5232 else if ( expDiff < 0 ) {
5233 if ( bExp == 0x7FFF ) {
5234 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5235 return packFloat128( zSign, 0x7FFF, 0, 0 );
5237 if ( aExp == 0 ) {
5238 ++expDiff;
5240 else {
5241 aSig0 |= LIT64( 0x0001000000000000 );
5243 shift128ExtraRightJamming(
5244 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5245 zExp = bExp;
5247 else {
5248 if ( aExp == 0x7FFF ) {
5249 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5250 return propagateFloat128NaN( a, b STATUS_VAR );
5252 return a;
5254 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5255 if ( aExp == 0 ) {
5256 if (STATUS(flush_to_zero)) {
5257 if (zSig0 | zSig1) {
5258 float_raise(float_flag_output_denormal STATUS_VAR);
5260 return packFloat128(zSign, 0, 0, 0);
5262 return packFloat128( zSign, 0, zSig0, zSig1 );
5264 zSig2 = 0;
5265 zSig0 |= LIT64( 0x0002000000000000 );
5266 zExp = aExp;
5267 goto shiftRight1;
5269 aSig0 |= LIT64( 0x0001000000000000 );
5270 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5271 --zExp;
5272 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5273 ++zExp;
5274 shiftRight1:
5275 shift128ExtraRightJamming(
5276 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5277 roundAndPack:
5278 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5282 /*----------------------------------------------------------------------------
5283 | Returns the result of subtracting the absolute values of the quadruple-
5284 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5285 | difference is negated before being returned. `zSign' is ignored if the
5286 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5287 | Standard for Binary Floating-Point Arithmetic.
5288 *----------------------------------------------------------------------------*/
5290 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5292 int32 aExp, bExp, zExp;
5293 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5294 int32 expDiff;
5295 float128 z;
5297 aSig1 = extractFloat128Frac1( a );
5298 aSig0 = extractFloat128Frac0( a );
5299 aExp = extractFloat128Exp( a );
5300 bSig1 = extractFloat128Frac1( b );
5301 bSig0 = extractFloat128Frac0( b );
5302 bExp = extractFloat128Exp( b );
5303 expDiff = aExp - bExp;
5304 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5305 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5306 if ( 0 < expDiff ) goto aExpBigger;
5307 if ( expDiff < 0 ) goto bExpBigger;
5308 if ( aExp == 0x7FFF ) {
5309 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5310 return propagateFloat128NaN( a, b STATUS_VAR );
5312 float_raise( float_flag_invalid STATUS_VAR);
5313 z.low = float128_default_nan_low;
5314 z.high = float128_default_nan_high;
5315 return z;
5317 if ( aExp == 0 ) {
5318 aExp = 1;
5319 bExp = 1;
5321 if ( bSig0 < aSig0 ) goto aBigger;
5322 if ( aSig0 < bSig0 ) goto bBigger;
5323 if ( bSig1 < aSig1 ) goto aBigger;
5324 if ( aSig1 < bSig1 ) goto bBigger;
5325 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5326 bExpBigger:
5327 if ( bExp == 0x7FFF ) {
5328 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5329 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5331 if ( aExp == 0 ) {
5332 ++expDiff;
5334 else {
5335 aSig0 |= LIT64( 0x4000000000000000 );
5337 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5338 bSig0 |= LIT64( 0x4000000000000000 );
5339 bBigger:
5340 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5341 zExp = bExp;
5342 zSign ^= 1;
5343 goto normalizeRoundAndPack;
5344 aExpBigger:
5345 if ( aExp == 0x7FFF ) {
5346 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5347 return a;
5349 if ( bExp == 0 ) {
5350 --expDiff;
5352 else {
5353 bSig0 |= LIT64( 0x4000000000000000 );
5355 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5356 aSig0 |= LIT64( 0x4000000000000000 );
5357 aBigger:
5358 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5359 zExp = aExp;
5360 normalizeRoundAndPack:
5361 --zExp;
5362 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5366 /*----------------------------------------------------------------------------
5367 | Returns the result of adding the quadruple-precision floating-point values
5368 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5369 | for Binary Floating-Point Arithmetic.
5370 *----------------------------------------------------------------------------*/
5372 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5374 flag aSign, bSign;
5376 aSign = extractFloat128Sign( a );
5377 bSign = extractFloat128Sign( b );
5378 if ( aSign == bSign ) {
5379 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5381 else {
5382 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5387 /*----------------------------------------------------------------------------
5388 | Returns the result of subtracting the quadruple-precision floating-point
5389 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5390 | Standard for Binary Floating-Point Arithmetic.
5391 *----------------------------------------------------------------------------*/
5393 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5395 flag aSign, bSign;
5397 aSign = extractFloat128Sign( a );
5398 bSign = extractFloat128Sign( b );
5399 if ( aSign == bSign ) {
5400 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5402 else {
5403 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5408 /*----------------------------------------------------------------------------
5409 | Returns the result of multiplying the quadruple-precision floating-point
5410 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5411 | Standard for Binary Floating-Point Arithmetic.
5412 *----------------------------------------------------------------------------*/
5414 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5416 flag aSign, bSign, zSign;
5417 int32 aExp, bExp, zExp;
5418 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5419 float128 z;
5421 aSig1 = extractFloat128Frac1( a );
5422 aSig0 = extractFloat128Frac0( a );
5423 aExp = extractFloat128Exp( a );
5424 aSign = extractFloat128Sign( a );
5425 bSig1 = extractFloat128Frac1( b );
5426 bSig0 = extractFloat128Frac0( b );
5427 bExp = extractFloat128Exp( b );
5428 bSign = extractFloat128Sign( b );
5429 zSign = aSign ^ bSign;
5430 if ( aExp == 0x7FFF ) {
5431 if ( ( aSig0 | aSig1 )
5432 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5433 return propagateFloat128NaN( a, b STATUS_VAR );
5435 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5436 return packFloat128( zSign, 0x7FFF, 0, 0 );
5438 if ( bExp == 0x7FFF ) {
5439 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5440 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5441 invalid:
5442 float_raise( float_flag_invalid STATUS_VAR);
5443 z.low = float128_default_nan_low;
5444 z.high = float128_default_nan_high;
5445 return z;
5447 return packFloat128( zSign, 0x7FFF, 0, 0 );
5449 if ( aExp == 0 ) {
5450 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5451 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5453 if ( bExp == 0 ) {
5454 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5455 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5457 zExp = aExp + bExp - 0x4000;
5458 aSig0 |= LIT64( 0x0001000000000000 );
5459 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5460 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5461 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5462 zSig2 |= ( zSig3 != 0 );
5463 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5464 shift128ExtraRightJamming(
5465 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5466 ++zExp;
5468 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5472 /*----------------------------------------------------------------------------
5473 | Returns the result of dividing the quadruple-precision floating-point value
5474 | `a' by the corresponding value `b'. The operation is performed according to
5475 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5476 *----------------------------------------------------------------------------*/
5478 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5480 flag aSign, bSign, zSign;
5481 int32 aExp, bExp, zExp;
5482 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5483 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5484 float128 z;
5486 aSig1 = extractFloat128Frac1( a );
5487 aSig0 = extractFloat128Frac0( a );
5488 aExp = extractFloat128Exp( a );
5489 aSign = extractFloat128Sign( a );
5490 bSig1 = extractFloat128Frac1( b );
5491 bSig0 = extractFloat128Frac0( b );
5492 bExp = extractFloat128Exp( b );
5493 bSign = extractFloat128Sign( b );
5494 zSign = aSign ^ bSign;
5495 if ( aExp == 0x7FFF ) {
5496 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5497 if ( bExp == 0x7FFF ) {
5498 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5499 goto invalid;
5501 return packFloat128( zSign, 0x7FFF, 0, 0 );
5503 if ( bExp == 0x7FFF ) {
5504 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5505 return packFloat128( zSign, 0, 0, 0 );
5507 if ( bExp == 0 ) {
5508 if ( ( bSig0 | bSig1 ) == 0 ) {
5509 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5510 invalid:
5511 float_raise( float_flag_invalid STATUS_VAR);
5512 z.low = float128_default_nan_low;
5513 z.high = float128_default_nan_high;
5514 return z;
5516 float_raise( float_flag_divbyzero STATUS_VAR);
5517 return packFloat128( zSign, 0x7FFF, 0, 0 );
5519 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5521 if ( aExp == 0 ) {
5522 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5523 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5525 zExp = aExp - bExp + 0x3FFD;
5526 shortShift128Left(
5527 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5528 shortShift128Left(
5529 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5530 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5531 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5532 ++zExp;
5534 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5535 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5536 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5537 while ( (int64_t) rem0 < 0 ) {
5538 --zSig0;
5539 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5541 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5542 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5543 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5544 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5545 while ( (int64_t) rem1 < 0 ) {
5546 --zSig1;
5547 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5549 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5551 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5552 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5556 /*----------------------------------------------------------------------------
5557 | Returns the remainder of the quadruple-precision floating-point value `a'
5558 | with respect to the corresponding value `b'. The operation is performed
5559 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5560 *----------------------------------------------------------------------------*/
5562 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5564 flag aSign, zSign;
5565 int32 aExp, bExp, expDiff;
5566 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5567 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5568 int64_t sigMean0;
5569 float128 z;
5571 aSig1 = extractFloat128Frac1( a );
5572 aSig0 = extractFloat128Frac0( a );
5573 aExp = extractFloat128Exp( a );
5574 aSign = extractFloat128Sign( a );
5575 bSig1 = extractFloat128Frac1( b );
5576 bSig0 = extractFloat128Frac0( b );
5577 bExp = extractFloat128Exp( b );
5578 if ( aExp == 0x7FFF ) {
5579 if ( ( aSig0 | aSig1 )
5580 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5581 return propagateFloat128NaN( a, b STATUS_VAR );
5583 goto invalid;
5585 if ( bExp == 0x7FFF ) {
5586 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5587 return a;
5589 if ( bExp == 0 ) {
5590 if ( ( bSig0 | bSig1 ) == 0 ) {
5591 invalid:
5592 float_raise( float_flag_invalid STATUS_VAR);
5593 z.low = float128_default_nan_low;
5594 z.high = float128_default_nan_high;
5595 return z;
5597 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5599 if ( aExp == 0 ) {
5600 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5601 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5603 expDiff = aExp - bExp;
5604 if ( expDiff < -1 ) return a;
5605 shortShift128Left(
5606 aSig0 | LIT64( 0x0001000000000000 ),
5607 aSig1,
5608 15 - ( expDiff < 0 ),
5609 &aSig0,
5610 &aSig1
5612 shortShift128Left(
5613 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5614 q = le128( bSig0, bSig1, aSig0, aSig1 );
5615 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5616 expDiff -= 64;
5617 while ( 0 < expDiff ) {
5618 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5619 q = ( 4 < q ) ? q - 4 : 0;
5620 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5621 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5622 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5623 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5624 expDiff -= 61;
5626 if ( -64 < expDiff ) {
5627 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5628 q = ( 4 < q ) ? q - 4 : 0;
5629 q >>= - expDiff;
5630 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5631 expDiff += 52;
5632 if ( expDiff < 0 ) {
5633 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5635 else {
5636 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5638 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5639 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5641 else {
5642 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5643 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5645 do {
5646 alternateASig0 = aSig0;
5647 alternateASig1 = aSig1;
5648 ++q;
5649 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5650 } while ( 0 <= (int64_t) aSig0 );
5651 add128(
5652 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5653 if ( ( sigMean0 < 0 )
5654 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5655 aSig0 = alternateASig0;
5656 aSig1 = alternateASig1;
5658 zSign = ( (int64_t) aSig0 < 0 );
5659 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5660 return
5661 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5665 /*----------------------------------------------------------------------------
5666 | Returns the square root of the quadruple-precision floating-point value `a'.
5667 | The operation is performed according to the IEC/IEEE Standard for Binary
5668 | Floating-Point Arithmetic.
5669 *----------------------------------------------------------------------------*/
5671 float128 float128_sqrt( float128 a STATUS_PARAM )
5673 flag aSign;
5674 int32 aExp, zExp;
5675 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5676 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5677 float128 z;
5679 aSig1 = extractFloat128Frac1( a );
5680 aSig0 = extractFloat128Frac0( a );
5681 aExp = extractFloat128Exp( a );
5682 aSign = extractFloat128Sign( a );
5683 if ( aExp == 0x7FFF ) {
5684 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5685 if ( ! aSign ) return a;
5686 goto invalid;
5688 if ( aSign ) {
5689 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5690 invalid:
5691 float_raise( float_flag_invalid STATUS_VAR);
5692 z.low = float128_default_nan_low;
5693 z.high = float128_default_nan_high;
5694 return z;
5696 if ( aExp == 0 ) {
5697 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5698 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5700 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5701 aSig0 |= LIT64( 0x0001000000000000 );
5702 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5703 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5704 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5705 doubleZSig0 = zSig0<<1;
5706 mul64To128( zSig0, zSig0, &term0, &term1 );
5707 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5708 while ( (int64_t) rem0 < 0 ) {
5709 --zSig0;
5710 doubleZSig0 -= 2;
5711 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5713 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5714 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5715 if ( zSig1 == 0 ) zSig1 = 1;
5716 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5717 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5718 mul64To128( zSig1, zSig1, &term2, &term3 );
5719 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5720 while ( (int64_t) rem1 < 0 ) {
5721 --zSig1;
5722 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5723 term3 |= 1;
5724 term2 |= doubleZSig0;
5725 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5727 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5729 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5730 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5734 /*----------------------------------------------------------------------------
5735 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5736 | the corresponding value `b', and 0 otherwise. The invalid exception is
5737 | raised if either operand is a NaN. Otherwise, the comparison is performed
5738 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5739 *----------------------------------------------------------------------------*/
5741 int float128_eq( float128 a, float128 b STATUS_PARAM )
5744 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5745 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5746 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5747 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5749 float_raise( float_flag_invalid STATUS_VAR);
5750 return 0;
5752 return
5753 ( a.low == b.low )
5754 && ( ( a.high == b.high )
5755 || ( ( a.low == 0 )
5756 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5761 /*----------------------------------------------------------------------------
5762 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5763 | or equal to the corresponding value `b', and 0 otherwise. The invalid
5764 | exception is raised if either operand is a NaN. The comparison is performed
5765 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5766 *----------------------------------------------------------------------------*/
5768 int float128_le( float128 a, float128 b STATUS_PARAM )
5770 flag aSign, bSign;
5772 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5773 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5774 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5775 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5777 float_raise( float_flag_invalid STATUS_VAR);
5778 return 0;
5780 aSign = extractFloat128Sign( a );
5781 bSign = extractFloat128Sign( b );
5782 if ( aSign != bSign ) {
5783 return
5784 aSign
5785 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5786 == 0 );
5788 return
5789 aSign ? le128( b.high, b.low, a.high, a.low )
5790 : le128( a.high, a.low, b.high, b.low );
5794 /*----------------------------------------------------------------------------
5795 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5796 | the corresponding value `b', and 0 otherwise. The invalid exception is
5797 | raised if either operand is a NaN. The comparison is performed according
5798 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5799 *----------------------------------------------------------------------------*/
5801 int float128_lt( float128 a, float128 b STATUS_PARAM )
5803 flag aSign, bSign;
5805 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5806 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5807 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5808 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5810 float_raise( float_flag_invalid STATUS_VAR);
5811 return 0;
5813 aSign = extractFloat128Sign( a );
5814 bSign = extractFloat128Sign( b );
5815 if ( aSign != bSign ) {
5816 return
5817 aSign
5818 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5819 != 0 );
5821 return
5822 aSign ? lt128( b.high, b.low, a.high, a.low )
5823 : lt128( a.high, a.low, b.high, b.low );
5827 /*----------------------------------------------------------------------------
5828 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5829 | be compared, and 0 otherwise. The invalid exception is raised if either
5830 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5831 | Standard for Binary Floating-Point Arithmetic.
5832 *----------------------------------------------------------------------------*/
5834 int float128_unordered( float128 a, float128 b STATUS_PARAM )
5836 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5837 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5838 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5839 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5841 float_raise( float_flag_invalid STATUS_VAR);
5842 return 1;
5844 return 0;
5847 /*----------------------------------------------------------------------------
5848 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5849 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5850 | exception. The comparison is performed according to the IEC/IEEE Standard
5851 | for Binary Floating-Point Arithmetic.
5852 *----------------------------------------------------------------------------*/
5854 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
5857 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5858 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5859 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5860 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5862 if ( float128_is_signaling_nan( a )
5863 || float128_is_signaling_nan( b ) ) {
5864 float_raise( float_flag_invalid STATUS_VAR);
5866 return 0;
5868 return
5869 ( a.low == b.low )
5870 && ( ( a.high == b.high )
5871 || ( ( a.low == 0 )
5872 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5877 /*----------------------------------------------------------------------------
5878 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5879 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5880 | cause an exception. Otherwise, the comparison is performed according to the
5881 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5882 *----------------------------------------------------------------------------*/
5884 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5886 flag aSign, bSign;
5888 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5889 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5890 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5891 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5893 if ( float128_is_signaling_nan( a )
5894 || float128_is_signaling_nan( b ) ) {
5895 float_raise( float_flag_invalid STATUS_VAR);
5897 return 0;
5899 aSign = extractFloat128Sign( a );
5900 bSign = extractFloat128Sign( b );
5901 if ( aSign != bSign ) {
5902 return
5903 aSign
5904 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5905 == 0 );
5907 return
5908 aSign ? le128( b.high, b.low, a.high, a.low )
5909 : le128( a.high, a.low, b.high, b.low );
5913 /*----------------------------------------------------------------------------
5914 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5915 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5916 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5917 | Standard for Binary Floating-Point Arithmetic.
5918 *----------------------------------------------------------------------------*/
5920 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5922 flag aSign, bSign;
5924 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5925 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5926 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5927 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5929 if ( float128_is_signaling_nan( a )
5930 || float128_is_signaling_nan( b ) ) {
5931 float_raise( float_flag_invalid STATUS_VAR);
5933 return 0;
5935 aSign = extractFloat128Sign( a );
5936 bSign = extractFloat128Sign( b );
5937 if ( aSign != bSign ) {
5938 return
5939 aSign
5940 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5941 != 0 );
5943 return
5944 aSign ? lt128( b.high, b.low, a.high, a.low )
5945 : lt128( a.high, a.low, b.high, b.low );
5949 /*----------------------------------------------------------------------------
5950 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5951 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5952 | comparison is performed according to the IEC/IEEE Standard for Binary
5953 | Floating-Point Arithmetic.
5954 *----------------------------------------------------------------------------*/
5956 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
5958 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5959 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5960 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5961 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5963 if ( float128_is_signaling_nan( a )
5964 || float128_is_signaling_nan( b ) ) {
5965 float_raise( float_flag_invalid STATUS_VAR);
5967 return 1;
5969 return 0;
5972 /* misc functions */
5973 float32 uint32_to_float32( uint32 a STATUS_PARAM )
5975 return int64_to_float32(a STATUS_VAR);
5978 float64 uint32_to_float64( uint32 a STATUS_PARAM )
5980 return int64_to_float64(a STATUS_VAR);
5983 uint32 float32_to_uint32( float32 a STATUS_PARAM )
5985 int64_t v;
5986 uint32 res;
5988 v = float32_to_int64(a STATUS_VAR);
5989 if (v < 0) {
5990 res = 0;
5991 float_raise( float_flag_invalid STATUS_VAR);
5992 } else if (v > 0xffffffff) {
5993 res = 0xffffffff;
5994 float_raise( float_flag_invalid STATUS_VAR);
5995 } else {
5996 res = v;
5998 return res;
6001 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6003 int64_t v;
6004 uint32 res;
6006 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6007 if (v < 0) {
6008 res = 0;
6009 float_raise( float_flag_invalid STATUS_VAR);
6010 } else if (v > 0xffffffff) {
6011 res = 0xffffffff;
6012 float_raise( float_flag_invalid STATUS_VAR);
6013 } else {
6014 res = v;
6016 return res;
6019 uint16 float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6021 int64_t v;
6022 uint16 res;
6024 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6025 if (v < 0) {
6026 res = 0;
6027 float_raise( float_flag_invalid STATUS_VAR);
6028 } else if (v > 0xffff) {
6029 res = 0xffff;
6030 float_raise( float_flag_invalid STATUS_VAR);
6031 } else {
6032 res = v;
6034 return res;
6037 uint32 float64_to_uint32( float64 a STATUS_PARAM )
6039 int64_t v;
6040 uint32 res;
6042 v = float64_to_int64(a STATUS_VAR);
6043 if (v < 0) {
6044 res = 0;
6045 float_raise( float_flag_invalid STATUS_VAR);
6046 } else if (v > 0xffffffff) {
6047 res = 0xffffffff;
6048 float_raise( float_flag_invalid STATUS_VAR);
6049 } else {
6050 res = v;
6052 return res;
6055 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6057 int64_t v;
6058 uint32 res;
6060 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6061 if (v < 0) {
6062 res = 0;
6063 float_raise( float_flag_invalid STATUS_VAR);
6064 } else if (v > 0xffffffff) {
6065 res = 0xffffffff;
6066 float_raise( float_flag_invalid STATUS_VAR);
6067 } else {
6068 res = v;
6070 return res;
6073 uint16 float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6075 int64_t v;
6076 uint16 res;
6078 v = float64_to_int64_round_to_zero(a STATUS_VAR);
6079 if (v < 0) {
6080 res = 0;
6081 float_raise( float_flag_invalid STATUS_VAR);
6082 } else if (v > 0xffff) {
6083 res = 0xffff;
6084 float_raise( float_flag_invalid STATUS_VAR);
6085 } else {
6086 res = v;
6088 return res;
6091 /* FIXME: This looks broken. */
6092 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6094 int64_t v;
6096 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6097 v += float64_val(a);
6098 v = float64_to_int64(make_float64(v) STATUS_VAR);
6100 return v - INT64_MIN;
6103 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6105 int64_t v;
6107 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6108 v += float64_val(a);
6109 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6111 return v - INT64_MIN;
6114 #define COMPARE(s, nan_exp) \
6115 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6116 int is_quiet STATUS_PARAM ) \
6118 flag aSign, bSign; \
6119 uint ## s ## _t av, bv; \
6120 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6121 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6123 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6124 extractFloat ## s ## Frac( a ) ) || \
6125 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6126 extractFloat ## s ## Frac( b ) )) { \
6127 if (!is_quiet || \
6128 float ## s ## _is_signaling_nan( a ) || \
6129 float ## s ## _is_signaling_nan( b ) ) { \
6130 float_raise( float_flag_invalid STATUS_VAR); \
6132 return float_relation_unordered; \
6134 aSign = extractFloat ## s ## Sign( a ); \
6135 bSign = extractFloat ## s ## Sign( b ); \
6136 av = float ## s ## _val(a); \
6137 bv = float ## s ## _val(b); \
6138 if ( aSign != bSign ) { \
6139 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6140 /* zero case */ \
6141 return float_relation_equal; \
6142 } else { \
6143 return 1 - (2 * aSign); \
6145 } else { \
6146 if (av == bv) { \
6147 return float_relation_equal; \
6148 } else { \
6149 return 1 - 2 * (aSign ^ ( av < bv )); \
6154 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6156 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6159 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6161 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6164 COMPARE(32, 0xff)
6165 COMPARE(64, 0x7ff)
6167 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6168 int is_quiet STATUS_PARAM )
6170 flag aSign, bSign;
6172 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6173 ( extractFloatx80Frac( a )<<1 ) ) ||
6174 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6175 ( extractFloatx80Frac( b )<<1 ) )) {
6176 if (!is_quiet ||
6177 floatx80_is_signaling_nan( a ) ||
6178 floatx80_is_signaling_nan( b ) ) {
6179 float_raise( float_flag_invalid STATUS_VAR);
6181 return float_relation_unordered;
6183 aSign = extractFloatx80Sign( a );
6184 bSign = extractFloatx80Sign( b );
6185 if ( aSign != bSign ) {
6187 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6188 ( ( a.low | b.low ) == 0 ) ) {
6189 /* zero case */
6190 return float_relation_equal;
6191 } else {
6192 return 1 - (2 * aSign);
6194 } else {
6195 if (a.low == b.low && a.high == b.high) {
6196 return float_relation_equal;
6197 } else {
6198 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6203 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6205 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6208 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6210 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6213 INLINE int float128_compare_internal( float128 a, float128 b,
6214 int is_quiet STATUS_PARAM )
6216 flag aSign, bSign;
6218 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6219 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6220 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6221 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6222 if (!is_quiet ||
6223 float128_is_signaling_nan( a ) ||
6224 float128_is_signaling_nan( b ) ) {
6225 float_raise( float_flag_invalid STATUS_VAR);
6227 return float_relation_unordered;
6229 aSign = extractFloat128Sign( a );
6230 bSign = extractFloat128Sign( b );
6231 if ( aSign != bSign ) {
6232 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6233 /* zero case */
6234 return float_relation_equal;
6235 } else {
6236 return 1 - (2 * aSign);
6238 } else {
6239 if (a.low == b.low && a.high == b.high) {
6240 return float_relation_equal;
6241 } else {
6242 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6247 int float128_compare( float128 a, float128 b STATUS_PARAM )
6249 return float128_compare_internal(a, b, 0 STATUS_VAR);
6252 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6254 return float128_compare_internal(a, b, 1 STATUS_VAR);
6257 /* min() and max() functions. These can't be implemented as
6258 * 'compare and pick one input' because that would mishandle
6259 * NaNs and +0 vs -0.
6261 #define MINMAX(s, nan_exp) \
6262 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6263 int ismin STATUS_PARAM ) \
6265 flag aSign, bSign; \
6266 uint ## s ## _t av, bv; \
6267 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6268 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6269 if (float ## s ## _is_any_nan(a) || \
6270 float ## s ## _is_any_nan(b)) { \
6271 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6273 aSign = extractFloat ## s ## Sign(a); \
6274 bSign = extractFloat ## s ## Sign(b); \
6275 av = float ## s ## _val(a); \
6276 bv = float ## s ## _val(b); \
6277 if (aSign != bSign) { \
6278 if (ismin) { \
6279 return aSign ? a : b; \
6280 } else { \
6281 return aSign ? b : a; \
6283 } else { \
6284 if (ismin) { \
6285 return (aSign ^ (av < bv)) ? a : b; \
6286 } else { \
6287 return (aSign ^ (av < bv)) ? b : a; \
6292 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6294 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \
6297 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6299 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \
6302 MINMAX(32, 0xff)
6303 MINMAX(64, 0x7ff)
6306 /* Multiply A by 2 raised to the power N. */
6307 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6309 flag aSign;
6310 int16_t aExp;
6311 uint32_t aSig;
6313 a = float32_squash_input_denormal(a STATUS_VAR);
6314 aSig = extractFloat32Frac( a );
6315 aExp = extractFloat32Exp( a );
6316 aSign = extractFloat32Sign( a );
6318 if ( aExp == 0xFF ) {
6319 if ( aSig ) {
6320 return propagateFloat32NaN( a, a STATUS_VAR );
6322 return a;
6324 if ( aExp != 0 )
6325 aSig |= 0x00800000;
6326 else if ( aSig == 0 )
6327 return a;
6329 if (n > 0x200) {
6330 n = 0x200;
6331 } else if (n < -0x200) {
6332 n = -0x200;
6335 aExp += n - 1;
6336 aSig <<= 7;
6337 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6340 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6342 flag aSign;
6343 int16_t aExp;
6344 uint64_t aSig;
6346 a = float64_squash_input_denormal(a STATUS_VAR);
6347 aSig = extractFloat64Frac( a );
6348 aExp = extractFloat64Exp( a );
6349 aSign = extractFloat64Sign( a );
6351 if ( aExp == 0x7FF ) {
6352 if ( aSig ) {
6353 return propagateFloat64NaN( a, a STATUS_VAR );
6355 return a;
6357 if ( aExp != 0 )
6358 aSig |= LIT64( 0x0010000000000000 );
6359 else if ( aSig == 0 )
6360 return a;
6362 if (n > 0x1000) {
6363 n = 0x1000;
6364 } else if (n < -0x1000) {
6365 n = -0x1000;
6368 aExp += n - 1;
6369 aSig <<= 10;
6370 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6373 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6375 flag aSign;
6376 int32_t aExp;
6377 uint64_t aSig;
6379 aSig = extractFloatx80Frac( a );
6380 aExp = extractFloatx80Exp( a );
6381 aSign = extractFloatx80Sign( a );
6383 if ( aExp == 0x7FFF ) {
6384 if ( aSig<<1 ) {
6385 return propagateFloatx80NaN( a, a STATUS_VAR );
6387 return a;
6390 if (aExp == 0 && aSig == 0)
6391 return a;
6393 if (n > 0x10000) {
6394 n = 0x10000;
6395 } else if (n < -0x10000) {
6396 n = -0x10000;
6399 aExp += n;
6400 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6401 aSign, aExp, aSig, 0 STATUS_VAR );
6404 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6406 flag aSign;
6407 int32_t aExp;
6408 uint64_t aSig0, aSig1;
6410 aSig1 = extractFloat128Frac1( a );
6411 aSig0 = extractFloat128Frac0( a );
6412 aExp = extractFloat128Exp( a );
6413 aSign = extractFloat128Sign( a );
6414 if ( aExp == 0x7FFF ) {
6415 if ( aSig0 | aSig1 ) {
6416 return propagateFloat128NaN( a, a STATUS_VAR );
6418 return a;
6420 if ( aExp != 0 )
6421 aSig0 |= LIT64( 0x0001000000000000 );
6422 else if ( aSig0 == 0 && aSig1 == 0 )
6423 return a;
6425 if (n > 0x10000) {
6426 n = 0x10000;
6427 } else if (n < -0x10000) {
6428 n = -0x10000;
6431 aExp += n - 1;
6432 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6433 STATUS_VAR );