qcow2: Return right error code in write_refcount_block_entries
[qemu.git] / fpu / softfloat.c
blobe6065b4bc7a992308d20cfeb833e146b89af32fb
2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 /* FIXME: Flush-To-Zero only effects results. Denormal inputs should also
34 be flushed to zero. */
35 #include "softfloat.h"
37 /*----------------------------------------------------------------------------
38 | Primitive arithmetic functions, including multi-word arithmetic, and
39 | division and square root approximations. (Can be specialized to target if
40 | desired.)
41 *----------------------------------------------------------------------------*/
42 #include "softfloat-macros.h"
44 /*----------------------------------------------------------------------------
45 | Functions and definitions to determine: (1) whether tininess for underflow
46 | is detected before or after rounding by default, (2) what (if anything)
47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
49 | are propagated from function inputs to output. These details are target-
50 | specific.
51 *----------------------------------------------------------------------------*/
52 #include "softfloat-specialize.h"
54 void set_float_rounding_mode(int val STATUS_PARAM)
56 STATUS(float_rounding_mode) = val;
59 void set_float_exception_flags(int val STATUS_PARAM)
61 STATUS(float_exception_flags) = val;
64 #ifdef FLOATX80
65 void set_floatx80_rounding_precision(int val STATUS_PARAM)
67 STATUS(floatx80_rounding_precision) = val;
69 #endif
71 /*----------------------------------------------------------------------------
72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
73 | and 7, and returns the properly rounded 32-bit integer corresponding to the
74 | input. If `zSign' is 1, the input is negated before being converted to an
75 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
76 | is simply rounded to an integer, with the inexact exception raised if the
77 | input cannot be represented exactly as an integer. However, if the fixed-
78 | point input is too large, the invalid exception is raised and the largest
79 | positive or negative integer is returned.
80 *----------------------------------------------------------------------------*/
82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
84 int8 roundingMode;
85 flag roundNearestEven;
86 int8 roundIncrement, roundBits;
87 int32 z;
89 roundingMode = STATUS(float_rounding_mode);
90 roundNearestEven = ( roundingMode == float_round_nearest_even );
91 roundIncrement = 0x40;
92 if ( ! roundNearestEven ) {
93 if ( roundingMode == float_round_to_zero ) {
94 roundIncrement = 0;
96 else {
97 roundIncrement = 0x7F;
98 if ( zSign ) {
99 if ( roundingMode == float_round_up ) roundIncrement = 0;
101 else {
102 if ( roundingMode == float_round_down ) roundIncrement = 0;
106 roundBits = absZ & 0x7F;
107 absZ = ( absZ + roundIncrement )>>7;
108 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
109 z = absZ;
110 if ( zSign ) z = - z;
111 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
112 float_raise( float_flag_invalid STATUS_VAR);
113 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
115 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
116 return z;
120 /*----------------------------------------------------------------------------
121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
122 | `absZ1', with binary point between bits 63 and 64 (between the input words),
123 | and returns the properly rounded 64-bit integer corresponding to the input.
124 | If `zSign' is 1, the input is negated before being converted to an integer.
125 | Ordinarily, the fixed-point input is simply rounded to an integer, with
126 | the inexact exception raised if the input cannot be represented exactly as
127 | an integer. However, if the fixed-point input is too large, the invalid
128 | exception is raised and the largest positive or negative integer is
129 | returned.
130 *----------------------------------------------------------------------------*/
132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
134 int8 roundingMode;
135 flag roundNearestEven, increment;
136 int64 z;
138 roundingMode = STATUS(float_rounding_mode);
139 roundNearestEven = ( roundingMode == float_round_nearest_even );
140 increment = ( (sbits64) absZ1 < 0 );
141 if ( ! roundNearestEven ) {
142 if ( roundingMode == float_round_to_zero ) {
143 increment = 0;
145 else {
146 if ( zSign ) {
147 increment = ( roundingMode == float_round_down ) && absZ1;
149 else {
150 increment = ( roundingMode == float_round_up ) && absZ1;
154 if ( increment ) {
155 ++absZ0;
156 if ( absZ0 == 0 ) goto overflow;
157 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
159 z = absZ0;
160 if ( zSign ) z = - z;
161 if ( z && ( ( z < 0 ) ^ zSign ) ) {
162 overflow:
163 float_raise( float_flag_invalid STATUS_VAR);
164 return
165 zSign ? (sbits64) LIT64( 0x8000000000000000 )
166 : LIT64( 0x7FFFFFFFFFFFFFFF );
168 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
169 return z;
173 /*----------------------------------------------------------------------------
174 | Returns the fraction bits of the single-precision floating-point value `a'.
175 *----------------------------------------------------------------------------*/
177 INLINE bits32 extractFloat32Frac( float32 a )
180 return float32_val(a) & 0x007FFFFF;
184 /*----------------------------------------------------------------------------
185 | Returns the exponent bits of the single-precision floating-point value `a'.
186 *----------------------------------------------------------------------------*/
188 INLINE int16 extractFloat32Exp( float32 a )
191 return ( float32_val(a)>>23 ) & 0xFF;
195 /*----------------------------------------------------------------------------
196 | Returns the sign bit of the single-precision floating-point value `a'.
197 *----------------------------------------------------------------------------*/
199 INLINE flag extractFloat32Sign( float32 a )
202 return float32_val(a)>>31;
206 /*----------------------------------------------------------------------------
207 | Normalizes the subnormal single-precision floating-point value represented
208 | by the denormalized significand `aSig'. The normalized exponent and
209 | significand are stored at the locations pointed to by `zExpPtr' and
210 | `zSigPtr', respectively.
211 *----------------------------------------------------------------------------*/
213 static void
214 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
216 int8 shiftCount;
218 shiftCount = countLeadingZeros32( aSig ) - 8;
219 *zSigPtr = aSig<<shiftCount;
220 *zExpPtr = 1 - shiftCount;
224 /*----------------------------------------------------------------------------
225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
226 | single-precision floating-point value, returning the result. After being
227 | shifted into the proper positions, the three fields are simply added
228 | together to form the result. This means that any integer portion of `zSig'
229 | will be added into the exponent. Since a properly normalized significand
230 | will have an integer portion equal to 1, the `zExp' input should be 1 less
231 | than the desired result exponent whenever `zSig' is a complete, normalized
232 | significand.
233 *----------------------------------------------------------------------------*/
235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
238 return make_float32(
239 ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
243 /*----------------------------------------------------------------------------
244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
245 | and significand `zSig', and returns the proper single-precision floating-
246 | point value corresponding to the abstract input. Ordinarily, the abstract
247 | value is simply rounded and packed into the single-precision format, with
248 | the inexact exception raised if the abstract input cannot be represented
249 | exactly. However, if the abstract value is too large, the overflow and
250 | inexact exceptions are raised and an infinity or maximal finite value is
251 | returned. If the abstract value is too small, the input value is rounded to
252 | a subnormal number, and the underflow and inexact exceptions are raised if
253 | the abstract input cannot be represented exactly as a subnormal single-
254 | precision floating-point number.
255 | The input significand `zSig' has its binary point between bits 30
256 | and 29, which is 7 bits to the left of the usual location. This shifted
257 | significand must be normalized or smaller. If `zSig' is not normalized,
258 | `zExp' must be 0; in that case, the result returned is a subnormal number,
259 | and it must not require rounding. In the usual case that `zSig' is
260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
261 | The handling of underflow and overflow follows the IEC/IEEE Standard for
262 | Binary Floating-Point Arithmetic.
263 *----------------------------------------------------------------------------*/
265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
267 int8 roundingMode;
268 flag roundNearestEven;
269 int8 roundIncrement, roundBits;
270 flag isTiny;
272 roundingMode = STATUS(float_rounding_mode);
273 roundNearestEven = ( roundingMode == float_round_nearest_even );
274 roundIncrement = 0x40;
275 if ( ! roundNearestEven ) {
276 if ( roundingMode == float_round_to_zero ) {
277 roundIncrement = 0;
279 else {
280 roundIncrement = 0x7F;
281 if ( zSign ) {
282 if ( roundingMode == float_round_up ) roundIncrement = 0;
284 else {
285 if ( roundingMode == float_round_down ) roundIncrement = 0;
289 roundBits = zSig & 0x7F;
290 if ( 0xFD <= (bits16) zExp ) {
291 if ( ( 0xFD < zExp )
292 || ( ( zExp == 0xFD )
293 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
295 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
296 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
298 if ( zExp < 0 ) {
299 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
300 isTiny =
301 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
302 || ( zExp < -1 )
303 || ( zSig + roundIncrement < 0x80000000 );
304 shift32RightJamming( zSig, - zExp, &zSig );
305 zExp = 0;
306 roundBits = zSig & 0x7F;
307 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
310 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
311 zSig = ( zSig + roundIncrement )>>7;
312 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
313 if ( zSig == 0 ) zExp = 0;
314 return packFloat32( zSign, zExp, zSig );
318 /*----------------------------------------------------------------------------
319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
320 | and significand `zSig', and returns the proper single-precision floating-
321 | point value corresponding to the abstract input. This routine is just like
322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
324 | floating-point exponent.
325 *----------------------------------------------------------------------------*/
327 static float32
328 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
330 int8 shiftCount;
332 shiftCount = countLeadingZeros32( zSig ) - 1;
333 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
337 /*----------------------------------------------------------------------------
338 | Returns the fraction bits of the double-precision floating-point value `a'.
339 *----------------------------------------------------------------------------*/
341 INLINE bits64 extractFloat64Frac( float64 a )
344 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
348 /*----------------------------------------------------------------------------
349 | Returns the exponent bits of the double-precision floating-point value `a'.
350 *----------------------------------------------------------------------------*/
352 INLINE int16 extractFloat64Exp( float64 a )
355 return ( float64_val(a)>>52 ) & 0x7FF;
359 /*----------------------------------------------------------------------------
360 | Returns the sign bit of the double-precision floating-point value `a'.
361 *----------------------------------------------------------------------------*/
363 INLINE flag extractFloat64Sign( float64 a )
366 return float64_val(a)>>63;
370 /*----------------------------------------------------------------------------
371 | Normalizes the subnormal double-precision floating-point value represented
372 | by the denormalized significand `aSig'. The normalized exponent and
373 | significand are stored at the locations pointed to by `zExpPtr' and
374 | `zSigPtr', respectively.
375 *----------------------------------------------------------------------------*/
377 static void
378 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
380 int8 shiftCount;
382 shiftCount = countLeadingZeros64( aSig ) - 11;
383 *zSigPtr = aSig<<shiftCount;
384 *zExpPtr = 1 - shiftCount;
388 /*----------------------------------------------------------------------------
389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
390 | double-precision floating-point value, returning the result. After being
391 | shifted into the proper positions, the three fields are simply added
392 | together to form the result. This means that any integer portion of `zSig'
393 | will be added into the exponent. Since a properly normalized significand
394 | will have an integer portion equal to 1, the `zExp' input should be 1 less
395 | than the desired result exponent whenever `zSig' is a complete, normalized
396 | significand.
397 *----------------------------------------------------------------------------*/
399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
402 return make_float64(
403 ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
407 /*----------------------------------------------------------------------------
408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
409 | and significand `zSig', and returns the proper double-precision floating-
410 | point value corresponding to the abstract input. Ordinarily, the abstract
411 | value is simply rounded and packed into the double-precision format, with
412 | the inexact exception raised if the abstract input cannot be represented
413 | exactly. However, if the abstract value is too large, the overflow and
414 | inexact exceptions are raised and an infinity or maximal finite value is
415 | returned. If the abstract value is too small, the input value is rounded
416 | to a subnormal number, and the underflow and inexact exceptions are raised
417 | if the abstract input cannot be represented exactly as a subnormal double-
418 | precision floating-point number.
419 | The input significand `zSig' has its binary point between bits 62
420 | and 61, which is 10 bits to the left of the usual location. This shifted
421 | significand must be normalized or smaller. If `zSig' is not normalized,
422 | `zExp' must be 0; in that case, the result returned is a subnormal number,
423 | and it must not require rounding. In the usual case that `zSig' is
424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
425 | The handling of underflow and overflow follows the IEC/IEEE Standard for
426 | Binary Floating-Point Arithmetic.
427 *----------------------------------------------------------------------------*/
429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
431 int8 roundingMode;
432 flag roundNearestEven;
433 int16 roundIncrement, roundBits;
434 flag isTiny;
436 roundingMode = STATUS(float_rounding_mode);
437 roundNearestEven = ( roundingMode == float_round_nearest_even );
438 roundIncrement = 0x200;
439 if ( ! roundNearestEven ) {
440 if ( roundingMode == float_round_to_zero ) {
441 roundIncrement = 0;
443 else {
444 roundIncrement = 0x3FF;
445 if ( zSign ) {
446 if ( roundingMode == float_round_up ) roundIncrement = 0;
448 else {
449 if ( roundingMode == float_round_down ) roundIncrement = 0;
453 roundBits = zSig & 0x3FF;
454 if ( 0x7FD <= (bits16) zExp ) {
455 if ( ( 0x7FD < zExp )
456 || ( ( zExp == 0x7FD )
457 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
459 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
460 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
462 if ( zExp < 0 ) {
463 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
464 isTiny =
465 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
466 || ( zExp < -1 )
467 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
468 shift64RightJamming( zSig, - zExp, &zSig );
469 zExp = 0;
470 roundBits = zSig & 0x3FF;
471 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
474 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
475 zSig = ( zSig + roundIncrement )>>10;
476 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
477 if ( zSig == 0 ) zExp = 0;
478 return packFloat64( zSign, zExp, zSig );
482 /*----------------------------------------------------------------------------
483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
484 | and significand `zSig', and returns the proper double-precision floating-
485 | point value corresponding to the abstract input. This routine is just like
486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
488 | floating-point exponent.
489 *----------------------------------------------------------------------------*/
491 static float64
492 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
494 int8 shiftCount;
496 shiftCount = countLeadingZeros64( zSig ) - 1;
497 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
501 #ifdef FLOATX80
503 /*----------------------------------------------------------------------------
504 | Returns the fraction bits of the extended double-precision floating-point
505 | value `a'.
506 *----------------------------------------------------------------------------*/
508 INLINE bits64 extractFloatx80Frac( floatx80 a )
511 return a.low;
515 /*----------------------------------------------------------------------------
516 | Returns the exponent bits of the extended double-precision floating-point
517 | value `a'.
518 *----------------------------------------------------------------------------*/
520 INLINE int32 extractFloatx80Exp( floatx80 a )
523 return a.high & 0x7FFF;
527 /*----------------------------------------------------------------------------
528 | Returns the sign bit of the extended double-precision floating-point value
529 | `a'.
530 *----------------------------------------------------------------------------*/
532 INLINE flag extractFloatx80Sign( floatx80 a )
535 return a.high>>15;
539 /*----------------------------------------------------------------------------
540 | Normalizes the subnormal extended double-precision floating-point value
541 | represented by the denormalized significand `aSig'. The normalized exponent
542 | and significand are stored at the locations pointed to by `zExpPtr' and
543 | `zSigPtr', respectively.
544 *----------------------------------------------------------------------------*/
546 static void
547 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
549 int8 shiftCount;
551 shiftCount = countLeadingZeros64( aSig );
552 *zSigPtr = aSig<<shiftCount;
553 *zExpPtr = 1 - shiftCount;
557 /*----------------------------------------------------------------------------
558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
559 | extended double-precision floating-point value, returning the result.
560 *----------------------------------------------------------------------------*/
562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
564 floatx80 z;
566 z.low = zSig;
567 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
568 return z;
572 /*----------------------------------------------------------------------------
573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
574 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
575 | and returns the proper extended double-precision floating-point value
576 | corresponding to the abstract input. Ordinarily, the abstract value is
577 | rounded and packed into the extended double-precision format, with the
578 | inexact exception raised if the abstract input cannot be represented
579 | exactly. However, if the abstract value is too large, the overflow and
580 | inexact exceptions are raised and an infinity or maximal finite value is
581 | returned. If the abstract value is too small, the input value is rounded to
582 | a subnormal number, and the underflow and inexact exceptions are raised if
583 | the abstract input cannot be represented exactly as a subnormal extended
584 | double-precision floating-point number.
585 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
586 | number of bits as single or double precision, respectively. Otherwise, the
587 | result is rounded to the full precision of the extended double-precision
588 | format.
589 | The input significand must be normalized or smaller. If the input
590 | significand is not normalized, `zExp' must be 0; in that case, the result
591 | returned is a subnormal number, and it must not require rounding. The
592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
593 | Floating-Point Arithmetic.
594 *----------------------------------------------------------------------------*/
596 static floatx80
597 roundAndPackFloatx80(
598 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
599 STATUS_PARAM)
601 int8 roundingMode;
602 flag roundNearestEven, increment, isTiny;
603 int64 roundIncrement, roundMask, roundBits;
605 roundingMode = STATUS(float_rounding_mode);
606 roundNearestEven = ( roundingMode == float_round_nearest_even );
607 if ( roundingPrecision == 80 ) goto precision80;
608 if ( roundingPrecision == 64 ) {
609 roundIncrement = LIT64( 0x0000000000000400 );
610 roundMask = LIT64( 0x00000000000007FF );
612 else if ( roundingPrecision == 32 ) {
613 roundIncrement = LIT64( 0x0000008000000000 );
614 roundMask = LIT64( 0x000000FFFFFFFFFF );
616 else {
617 goto precision80;
619 zSig0 |= ( zSig1 != 0 );
620 if ( ! roundNearestEven ) {
621 if ( roundingMode == float_round_to_zero ) {
622 roundIncrement = 0;
624 else {
625 roundIncrement = roundMask;
626 if ( zSign ) {
627 if ( roundingMode == float_round_up ) roundIncrement = 0;
629 else {
630 if ( roundingMode == float_round_down ) roundIncrement = 0;
634 roundBits = zSig0 & roundMask;
635 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
636 if ( ( 0x7FFE < zExp )
637 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
639 goto overflow;
641 if ( zExp <= 0 ) {
642 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
643 isTiny =
644 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
645 || ( zExp < 0 )
646 || ( zSig0 <= zSig0 + roundIncrement );
647 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
648 zExp = 0;
649 roundBits = zSig0 & roundMask;
650 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
651 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
652 zSig0 += roundIncrement;
653 if ( (sbits64) zSig0 < 0 ) zExp = 1;
654 roundIncrement = roundMask + 1;
655 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
656 roundMask |= roundIncrement;
658 zSig0 &= ~ roundMask;
659 return packFloatx80( zSign, zExp, zSig0 );
662 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
663 zSig0 += roundIncrement;
664 if ( zSig0 < roundIncrement ) {
665 ++zExp;
666 zSig0 = LIT64( 0x8000000000000000 );
668 roundIncrement = roundMask + 1;
669 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
670 roundMask |= roundIncrement;
672 zSig0 &= ~ roundMask;
673 if ( zSig0 == 0 ) zExp = 0;
674 return packFloatx80( zSign, zExp, zSig0 );
675 precision80:
676 increment = ( (sbits64) zSig1 < 0 );
677 if ( ! roundNearestEven ) {
678 if ( roundingMode == float_round_to_zero ) {
679 increment = 0;
681 else {
682 if ( zSign ) {
683 increment = ( roundingMode == float_round_down ) && zSig1;
685 else {
686 increment = ( roundingMode == float_round_up ) && zSig1;
690 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691 if ( ( 0x7FFE < zExp )
692 || ( ( zExp == 0x7FFE )
693 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
694 && increment
697 roundMask = 0;
698 overflow:
699 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
700 if ( ( roundingMode == float_round_to_zero )
701 || ( zSign && ( roundingMode == float_round_up ) )
702 || ( ! zSign && ( roundingMode == float_round_down ) )
704 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
706 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
708 if ( zExp <= 0 ) {
709 isTiny =
710 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
711 || ( zExp < 0 )
712 || ! increment
713 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
714 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
715 zExp = 0;
716 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
717 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
718 if ( roundNearestEven ) {
719 increment = ( (sbits64) zSig1 < 0 );
721 else {
722 if ( zSign ) {
723 increment = ( roundingMode == float_round_down ) && zSig1;
725 else {
726 increment = ( roundingMode == float_round_up ) && zSig1;
729 if ( increment ) {
730 ++zSig0;
731 zSig0 &=
732 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
733 if ( (sbits64) zSig0 < 0 ) zExp = 1;
735 return packFloatx80( zSign, zExp, zSig0 );
738 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
739 if ( increment ) {
740 ++zSig0;
741 if ( zSig0 == 0 ) {
742 ++zExp;
743 zSig0 = LIT64( 0x8000000000000000 );
745 else {
746 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
749 else {
750 if ( zSig0 == 0 ) zExp = 0;
752 return packFloatx80( zSign, zExp, zSig0 );
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent
758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input. This routine is just like
761 | `roundAndPackFloatx80' except that the input significand does not have to be
762 | normalized.
763 *----------------------------------------------------------------------------*/
765 static floatx80
766 normalizeRoundAndPackFloatx80(
767 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
768 STATUS_PARAM)
770 int8 shiftCount;
772 if ( zSig0 == 0 ) {
773 zSig0 = zSig1;
774 zSig1 = 0;
775 zExp -= 64;
777 shiftCount = countLeadingZeros64( zSig0 );
778 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
779 zExp -= shiftCount;
780 return
781 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
785 #endif
787 #ifdef FLOAT128
789 /*----------------------------------------------------------------------------
790 | Returns the least-significant 64 fraction bits of the quadruple-precision
791 | floating-point value `a'.
792 *----------------------------------------------------------------------------*/
794 INLINE bits64 extractFloat128Frac1( float128 a )
797 return a.low;
801 /*----------------------------------------------------------------------------
802 | Returns the most-significant 48 fraction bits of the quadruple-precision
803 | floating-point value `a'.
804 *----------------------------------------------------------------------------*/
806 INLINE bits64 extractFloat128Frac0( float128 a )
809 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
813 /*----------------------------------------------------------------------------
814 | Returns the exponent bits of the quadruple-precision floating-point value
815 | `a'.
816 *----------------------------------------------------------------------------*/
818 INLINE int32 extractFloat128Exp( float128 a )
821 return ( a.high>>48 ) & 0x7FFF;
825 /*----------------------------------------------------------------------------
826 | Returns the sign bit of the quadruple-precision floating-point value `a'.
827 *----------------------------------------------------------------------------*/
829 INLINE flag extractFloat128Sign( float128 a )
832 return a.high>>63;
836 /*----------------------------------------------------------------------------
837 | Normalizes the subnormal quadruple-precision floating-point value
838 | represented by the denormalized significand formed by the concatenation of
839 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
840 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
841 | significand are stored at the location pointed to by `zSig0Ptr', and the
842 | least significant 64 bits of the normalized significand are stored at the
843 | location pointed to by `zSig1Ptr'.
844 *----------------------------------------------------------------------------*/
846 static void
847 normalizeFloat128Subnormal(
848 bits64 aSig0,
849 bits64 aSig1,
850 int32 *zExpPtr,
851 bits64 *zSig0Ptr,
852 bits64 *zSig1Ptr
855 int8 shiftCount;
857 if ( aSig0 == 0 ) {
858 shiftCount = countLeadingZeros64( aSig1 ) - 15;
859 if ( shiftCount < 0 ) {
860 *zSig0Ptr = aSig1>>( - shiftCount );
861 *zSig1Ptr = aSig1<<( shiftCount & 63 );
863 else {
864 *zSig0Ptr = aSig1<<shiftCount;
865 *zSig1Ptr = 0;
867 *zExpPtr = - shiftCount - 63;
869 else {
870 shiftCount = countLeadingZeros64( aSig0 ) - 15;
871 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
872 *zExpPtr = 1 - shiftCount;
877 /*----------------------------------------------------------------------------
878 | Packs the sign `zSign', the exponent `zExp', and the significand formed
879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
880 | floating-point value, returning the result. After being shifted into the
881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
882 | added together to form the most significant 32 bits of the result. This
883 | means that any integer portion of `zSig0' will be added into the exponent.
884 | Since a properly normalized significand will have an integer portion equal
885 | to 1, the `zExp' input should be 1 less than the desired result exponent
886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
887 | significand.
888 *----------------------------------------------------------------------------*/
890 INLINE float128
891 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
893 float128 z;
895 z.low = zSig1;
896 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
897 return z;
901 /*----------------------------------------------------------------------------
902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
903 | and extended significand formed by the concatenation of `zSig0', `zSig1',
904 | and `zSig2', and returns the proper quadruple-precision floating-point value
905 | corresponding to the abstract input. Ordinarily, the abstract value is
906 | simply rounded and packed into the quadruple-precision format, with the
907 | inexact exception raised if the abstract input cannot be represented
908 | exactly. However, if the abstract value is too large, the overflow and
909 | inexact exceptions are raised and an infinity or maximal finite value is
910 | returned. If the abstract value is too small, the input value is rounded to
911 | a subnormal number, and the underflow and inexact exceptions are raised if
912 | the abstract input cannot be represented exactly as a subnormal quadruple-
913 | precision floating-point number.
914 | The input significand must be normalized or smaller. If the input
915 | significand is not normalized, `zExp' must be 0; in that case, the result
916 | returned is a subnormal number, and it must not require rounding. In the
917 | usual case that the input significand is normalized, `zExp' must be 1 less
918 | than the ``true'' floating-point exponent. The handling of underflow and
919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
920 *----------------------------------------------------------------------------*/
922 static float128
923 roundAndPackFloat128(
924 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
926 int8 roundingMode;
927 flag roundNearestEven, increment, isTiny;
929 roundingMode = STATUS(float_rounding_mode);
930 roundNearestEven = ( roundingMode == float_round_nearest_even );
931 increment = ( (sbits64) zSig2 < 0 );
932 if ( ! roundNearestEven ) {
933 if ( roundingMode == float_round_to_zero ) {
934 increment = 0;
936 else {
937 if ( zSign ) {
938 increment = ( roundingMode == float_round_down ) && zSig2;
940 else {
941 increment = ( roundingMode == float_round_up ) && zSig2;
945 if ( 0x7FFD <= (bits32) zExp ) {
946 if ( ( 0x7FFD < zExp )
947 || ( ( zExp == 0x7FFD )
948 && eq128(
949 LIT64( 0x0001FFFFFFFFFFFF ),
950 LIT64( 0xFFFFFFFFFFFFFFFF ),
951 zSig0,
952 zSig1
954 && increment
957 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
958 if ( ( roundingMode == float_round_to_zero )
959 || ( zSign && ( roundingMode == float_round_up ) )
960 || ( ! zSign && ( roundingMode == float_round_down ) )
962 return
963 packFloat128(
964 zSign,
965 0x7FFE,
966 LIT64( 0x0000FFFFFFFFFFFF ),
967 LIT64( 0xFFFFFFFFFFFFFFFF )
970 return packFloat128( zSign, 0x7FFF, 0, 0 );
972 if ( zExp < 0 ) {
973 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
974 isTiny =
975 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
976 || ( zExp < -1 )
977 || ! increment
978 || lt128(
979 zSig0,
980 zSig1,
981 LIT64( 0x0001FFFFFFFFFFFF ),
982 LIT64( 0xFFFFFFFFFFFFFFFF )
984 shift128ExtraRightJamming(
985 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
986 zExp = 0;
987 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
988 if ( roundNearestEven ) {
989 increment = ( (sbits64) zSig2 < 0 );
991 else {
992 if ( zSign ) {
993 increment = ( roundingMode == float_round_down ) && zSig2;
995 else {
996 increment = ( roundingMode == float_round_up ) && zSig2;
1001 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1002 if ( increment ) {
1003 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1006 else {
1007 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1009 return packFloat128( zSign, zExp, zSig0, zSig1 );
1013 /*----------------------------------------------------------------------------
1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1016 | returns the proper quadruple-precision floating-point value corresponding
1017 | to the abstract input. This routine is just like `roundAndPackFloat128'
1018 | except that the input significand has fewer bits and does not have to be
1019 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1020 | point exponent.
1021 *----------------------------------------------------------------------------*/
1023 static float128
1024 normalizeRoundAndPackFloat128(
1025 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1027 int8 shiftCount;
1028 bits64 zSig2;
1030 if ( zSig0 == 0 ) {
1031 zSig0 = zSig1;
1032 zSig1 = 0;
1033 zExp -= 64;
1035 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036 if ( 0 <= shiftCount ) {
1037 zSig2 = 0;
1038 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1040 else {
1041 shift128ExtraRightJamming(
1042 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1044 zExp -= shiftCount;
1045 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1049 #endif
1051 /*----------------------------------------------------------------------------
1052 | Returns the result of converting the 32-bit two's complement integer `a'
1053 | to the single-precision floating-point format. The conversion is performed
1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055 *----------------------------------------------------------------------------*/
1057 float32 int32_to_float32( int32 a STATUS_PARAM )
1059 flag zSign;
1061 if ( a == 0 ) return float32_zero;
1062 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063 zSign = ( a < 0 );
1064 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1068 /*----------------------------------------------------------------------------
1069 | Returns the result of converting the 32-bit two's complement integer `a'
1070 | to the double-precision floating-point format. The conversion is performed
1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072 *----------------------------------------------------------------------------*/
1074 float64 int32_to_float64( int32 a STATUS_PARAM )
1076 flag zSign;
1077 uint32 absA;
1078 int8 shiftCount;
1079 bits64 zSig;
1081 if ( a == 0 ) return float64_zero;
1082 zSign = ( a < 0 );
1083 absA = zSign ? - a : a;
1084 shiftCount = countLeadingZeros32( absA ) + 21;
1085 zSig = absA;
1086 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1090 #ifdef FLOATX80
1092 /*----------------------------------------------------------------------------
1093 | Returns the result of converting the 32-bit two's complement integer `a'
1094 | to the extended double-precision floating-point format. The conversion
1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096 | Arithmetic.
1097 *----------------------------------------------------------------------------*/
1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1101 flag zSign;
1102 uint32 absA;
1103 int8 shiftCount;
1104 bits64 zSig;
1106 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107 zSign = ( a < 0 );
1108 absA = zSign ? - a : a;
1109 shiftCount = countLeadingZeros32( absA ) + 32;
1110 zSig = absA;
1111 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1115 #endif
1117 #ifdef FLOAT128
1119 /*----------------------------------------------------------------------------
1120 | Returns the result of converting the 32-bit two's complement integer `a' to
1121 | the quadruple-precision floating-point format. The conversion is performed
1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123 *----------------------------------------------------------------------------*/
1125 float128 int32_to_float128( int32 a STATUS_PARAM )
1127 flag zSign;
1128 uint32 absA;
1129 int8 shiftCount;
1130 bits64 zSig0;
1132 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133 zSign = ( a < 0 );
1134 absA = zSign ? - a : a;
1135 shiftCount = countLeadingZeros32( absA ) + 17;
1136 zSig0 = absA;
1137 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1141 #endif
1143 /*----------------------------------------------------------------------------
1144 | Returns the result of converting the 64-bit two's complement integer `a'
1145 | to the single-precision floating-point format. The conversion is performed
1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147 *----------------------------------------------------------------------------*/
1149 float32 int64_to_float32( int64 a STATUS_PARAM )
1151 flag zSign;
1152 uint64 absA;
1153 int8 shiftCount;
1155 if ( a == 0 ) return float32_zero;
1156 zSign = ( a < 0 );
1157 absA = zSign ? - a : a;
1158 shiftCount = countLeadingZeros64( absA ) - 40;
1159 if ( 0 <= shiftCount ) {
1160 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1162 else {
1163 shiftCount += 7;
1164 if ( shiftCount < 0 ) {
1165 shift64RightJamming( absA, - shiftCount, &absA );
1167 else {
1168 absA <<= shiftCount;
1170 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1175 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1177 int8 shiftCount;
1179 if ( a == 0 ) return float32_zero;
1180 shiftCount = countLeadingZeros64( a ) - 40;
1181 if ( 0 <= shiftCount ) {
1182 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1184 else {
1185 shiftCount += 7;
1186 if ( shiftCount < 0 ) {
1187 shift64RightJamming( a, - shiftCount, &a );
1189 else {
1190 a <<= shiftCount;
1192 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1196 /*----------------------------------------------------------------------------
1197 | Returns the result of converting the 64-bit two's complement integer `a'
1198 | to the double-precision floating-point format. The conversion is performed
1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200 *----------------------------------------------------------------------------*/
1202 float64 int64_to_float64( int64 a STATUS_PARAM )
1204 flag zSign;
1206 if ( a == 0 ) return float64_zero;
1207 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208 return packFloat64( 1, 0x43E, 0 );
1210 zSign = ( a < 0 );
1211 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1215 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1217 if ( a == 0 ) return float64_zero;
1218 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1222 #ifdef FLOATX80
1224 /*----------------------------------------------------------------------------
1225 | Returns the result of converting the 64-bit two's complement integer `a'
1226 | to the extended double-precision floating-point format. The conversion
1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228 | Arithmetic.
1229 *----------------------------------------------------------------------------*/
1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1233 flag zSign;
1234 uint64 absA;
1235 int8 shiftCount;
1237 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238 zSign = ( a < 0 );
1239 absA = zSign ? - a : a;
1240 shiftCount = countLeadingZeros64( absA );
1241 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1245 #endif
1247 #ifdef FLOAT128
1249 /*----------------------------------------------------------------------------
1250 | Returns the result of converting the 64-bit two's complement integer `a' to
1251 | the quadruple-precision floating-point format. The conversion is performed
1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253 *----------------------------------------------------------------------------*/
1255 float128 int64_to_float128( int64 a STATUS_PARAM )
1257 flag zSign;
1258 uint64 absA;
1259 int8 shiftCount;
1260 int32 zExp;
1261 bits64 zSig0, zSig1;
1263 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264 zSign = ( a < 0 );
1265 absA = zSign ? - a : a;
1266 shiftCount = countLeadingZeros64( absA ) + 49;
1267 zExp = 0x406E - shiftCount;
1268 if ( 64 <= shiftCount ) {
1269 zSig1 = 0;
1270 zSig0 = absA;
1271 shiftCount -= 64;
1273 else {
1274 zSig1 = absA;
1275 zSig0 = 0;
1277 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278 return packFloat128( zSign, zExp, zSig0, zSig1 );
1282 #endif
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the single-precision floating-point value
1286 | `a' to the 32-bit two's complement integer format. The conversion is
1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic---which means in particular that the conversion is rounded
1289 | according to the current rounding mode. If `a' is a NaN, the largest
1290 | positive integer is returned. Otherwise, if the conversion overflows, the
1291 | largest integer with the same sign as `a' is returned.
1292 *----------------------------------------------------------------------------*/
1294 int32 float32_to_int32( float32 a STATUS_PARAM )
1296 flag aSign;
1297 int16 aExp, shiftCount;
1298 bits32 aSig;
1299 bits64 aSig64;
1301 aSig = extractFloat32Frac( a );
1302 aExp = extractFloat32Exp( a );
1303 aSign = extractFloat32Sign( a );
1304 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305 if ( aExp ) aSig |= 0x00800000;
1306 shiftCount = 0xAF - aExp;
1307 aSig64 = aSig;
1308 aSig64 <<= 32;
1309 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1314 /*----------------------------------------------------------------------------
1315 | Returns the result of converting the single-precision floating-point value
1316 | `a' to the 32-bit two's complement integer format. The conversion is
1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 | Arithmetic, except that the conversion is always rounded toward zero.
1319 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1320 | the conversion overflows, the largest integer with the same sign as `a' is
1321 | returned.
1322 *----------------------------------------------------------------------------*/
1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1326 flag aSign;
1327 int16 aExp, shiftCount;
1328 bits32 aSig;
1329 int32 z;
1331 aSig = extractFloat32Frac( a );
1332 aExp = extractFloat32Exp( a );
1333 aSign = extractFloat32Sign( a );
1334 shiftCount = aExp - 0x9E;
1335 if ( 0 <= shiftCount ) {
1336 if ( float32_val(a) != 0xCF000000 ) {
1337 float_raise( float_flag_invalid STATUS_VAR);
1338 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1340 return (sbits32) 0x80000000;
1342 else if ( aExp <= 0x7E ) {
1343 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1344 return 0;
1346 aSig = ( aSig | 0x00800000 )<<8;
1347 z = aSig>>( - shiftCount );
1348 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349 STATUS(float_exception_flags) |= float_flag_inexact;
1351 if ( aSign ) z = - z;
1352 return z;
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 64-bit two's complement integer format. The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic---which means in particular that the conversion is rounded
1361 | according to the current rounding mode. If `a' is a NaN, the largest
1362 | positive integer is returned. Otherwise, if the conversion overflows, the
1363 | largest integer with the same sign as `a' is returned.
1364 *----------------------------------------------------------------------------*/
1366 int64 float32_to_int64( float32 a STATUS_PARAM )
1368 flag aSign;
1369 int16 aExp, shiftCount;
1370 bits32 aSig;
1371 bits64 aSig64, aSigExtra;
1373 aSig = extractFloat32Frac( a );
1374 aExp = extractFloat32Exp( a );
1375 aSign = extractFloat32Sign( a );
1376 shiftCount = 0xBE - aExp;
1377 if ( shiftCount < 0 ) {
1378 float_raise( float_flag_invalid STATUS_VAR);
1379 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1380 return LIT64( 0x7FFFFFFFFFFFFFFF );
1382 return (sbits64) LIT64( 0x8000000000000000 );
1384 if ( aExp ) aSig |= 0x00800000;
1385 aSig64 = aSig;
1386 aSig64 <<= 40;
1387 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1388 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1392 /*----------------------------------------------------------------------------
1393 | Returns the result of converting the single-precision floating-point value
1394 | `a' to the 64-bit two's complement integer format. The conversion is
1395 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1396 | Arithmetic, except that the conversion is always rounded toward zero. If
1397 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1398 | conversion overflows, the largest integer with the same sign as `a' is
1399 | returned.
1400 *----------------------------------------------------------------------------*/
1402 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1404 flag aSign;
1405 int16 aExp, shiftCount;
1406 bits32 aSig;
1407 bits64 aSig64;
1408 int64 z;
1410 aSig = extractFloat32Frac( a );
1411 aExp = extractFloat32Exp( a );
1412 aSign = extractFloat32Sign( a );
1413 shiftCount = aExp - 0xBE;
1414 if ( 0 <= shiftCount ) {
1415 if ( float32_val(a) != 0xDF000000 ) {
1416 float_raise( float_flag_invalid STATUS_VAR);
1417 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1418 return LIT64( 0x7FFFFFFFFFFFFFFF );
1421 return (sbits64) LIT64( 0x8000000000000000 );
1423 else if ( aExp <= 0x7E ) {
1424 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1425 return 0;
1427 aSig64 = aSig | 0x00800000;
1428 aSig64 <<= 40;
1429 z = aSig64>>( - shiftCount );
1430 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1431 STATUS(float_exception_flags) |= float_flag_inexact;
1433 if ( aSign ) z = - z;
1434 return z;
1438 /*----------------------------------------------------------------------------
1439 | Returns the result of converting the single-precision floating-point value
1440 | `a' to the double-precision floating-point format. The conversion is
1441 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1442 | Arithmetic.
1443 *----------------------------------------------------------------------------*/
1445 float64 float32_to_float64( float32 a STATUS_PARAM )
1447 flag aSign;
1448 int16 aExp;
1449 bits32 aSig;
1451 aSig = extractFloat32Frac( a );
1452 aExp = extractFloat32Exp( a );
1453 aSign = extractFloat32Sign( a );
1454 if ( aExp == 0xFF ) {
1455 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1456 return packFloat64( aSign, 0x7FF, 0 );
1458 if ( aExp == 0 ) {
1459 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1460 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1461 --aExp;
1463 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1467 #ifdef FLOATX80
1469 /*----------------------------------------------------------------------------
1470 | Returns the result of converting the single-precision floating-point value
1471 | `a' to the extended double-precision floating-point format. The conversion
1472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1473 | Arithmetic.
1474 *----------------------------------------------------------------------------*/
1476 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1478 flag aSign;
1479 int16 aExp;
1480 bits32 aSig;
1482 aSig = extractFloat32Frac( a );
1483 aExp = extractFloat32Exp( a );
1484 aSign = extractFloat32Sign( a );
1485 if ( aExp == 0xFF ) {
1486 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1487 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1489 if ( aExp == 0 ) {
1490 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1491 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1493 aSig |= 0x00800000;
1494 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1498 #endif
1500 #ifdef FLOAT128
1502 /*----------------------------------------------------------------------------
1503 | Returns the result of converting the single-precision floating-point value
1504 | `a' to the double-precision floating-point format. The conversion is
1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1506 | Arithmetic.
1507 *----------------------------------------------------------------------------*/
1509 float128 float32_to_float128( float32 a STATUS_PARAM )
1511 flag aSign;
1512 int16 aExp;
1513 bits32 aSig;
1515 aSig = extractFloat32Frac( a );
1516 aExp = extractFloat32Exp( a );
1517 aSign = extractFloat32Sign( a );
1518 if ( aExp == 0xFF ) {
1519 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1520 return packFloat128( aSign, 0x7FFF, 0, 0 );
1522 if ( aExp == 0 ) {
1523 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1524 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525 --aExp;
1527 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1531 #endif
1533 /*----------------------------------------------------------------------------
1534 | Rounds the single-precision floating-point value `a' to an integer, and
1535 | returns the result as a single-precision floating-point value. The
1536 | operation is performed according to the IEC/IEEE Standard for Binary
1537 | Floating-Point Arithmetic.
1538 *----------------------------------------------------------------------------*/
1540 float32 float32_round_to_int( float32 a STATUS_PARAM)
1542 flag aSign;
1543 int16 aExp;
1544 bits32 lastBitMask, roundBitsMask;
1545 int8 roundingMode;
1546 bits32 z;
1548 aExp = extractFloat32Exp( a );
1549 if ( 0x96 <= aExp ) {
1550 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1551 return propagateFloat32NaN( a, a STATUS_VAR );
1553 return a;
1555 if ( aExp <= 0x7E ) {
1556 if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1557 STATUS(float_exception_flags) |= float_flag_inexact;
1558 aSign = extractFloat32Sign( a );
1559 switch ( STATUS(float_rounding_mode) ) {
1560 case float_round_nearest_even:
1561 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1562 return packFloat32( aSign, 0x7F, 0 );
1564 break;
1565 case float_round_down:
1566 return make_float32(aSign ? 0xBF800000 : 0);
1567 case float_round_up:
1568 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1570 return packFloat32( aSign, 0, 0 );
1572 lastBitMask = 1;
1573 lastBitMask <<= 0x96 - aExp;
1574 roundBitsMask = lastBitMask - 1;
1575 z = float32_val(a);
1576 roundingMode = STATUS(float_rounding_mode);
1577 if ( roundingMode == float_round_nearest_even ) {
1578 z += lastBitMask>>1;
1579 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1581 else if ( roundingMode != float_round_to_zero ) {
1582 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1583 z += roundBitsMask;
1586 z &= ~ roundBitsMask;
1587 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1588 return make_float32(z);
1592 /*----------------------------------------------------------------------------
1593 | Returns the result of adding the absolute values of the single-precision
1594 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1595 | before being returned. `zSign' is ignored if the result is a NaN.
1596 | The addition is performed according to the IEC/IEEE Standard for Binary
1597 | Floating-Point Arithmetic.
1598 *----------------------------------------------------------------------------*/
1600 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1602 int16 aExp, bExp, zExp;
1603 bits32 aSig, bSig, zSig;
1604 int16 expDiff;
1606 aSig = extractFloat32Frac( a );
1607 aExp = extractFloat32Exp( a );
1608 bSig = extractFloat32Frac( b );
1609 bExp = extractFloat32Exp( b );
1610 expDiff = aExp - bExp;
1611 aSig <<= 6;
1612 bSig <<= 6;
1613 if ( 0 < expDiff ) {
1614 if ( aExp == 0xFF ) {
1615 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1616 return a;
1618 if ( bExp == 0 ) {
1619 --expDiff;
1621 else {
1622 bSig |= 0x20000000;
1624 shift32RightJamming( bSig, expDiff, &bSig );
1625 zExp = aExp;
1627 else if ( expDiff < 0 ) {
1628 if ( bExp == 0xFF ) {
1629 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1630 return packFloat32( zSign, 0xFF, 0 );
1632 if ( aExp == 0 ) {
1633 ++expDiff;
1635 else {
1636 aSig |= 0x20000000;
1638 shift32RightJamming( aSig, - expDiff, &aSig );
1639 zExp = bExp;
1641 else {
1642 if ( aExp == 0xFF ) {
1643 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1644 return a;
1646 if ( aExp == 0 ) {
1647 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1648 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1650 zSig = 0x40000000 + aSig + bSig;
1651 zExp = aExp;
1652 goto roundAndPack;
1654 aSig |= 0x20000000;
1655 zSig = ( aSig + bSig )<<1;
1656 --zExp;
1657 if ( (sbits32) zSig < 0 ) {
1658 zSig = aSig + bSig;
1659 ++zExp;
1661 roundAndPack:
1662 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1666 /*----------------------------------------------------------------------------
1667 | Returns the result of subtracting the absolute values of the single-
1668 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1669 | difference is negated before being returned. `zSign' is ignored if the
1670 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1671 | Standard for Binary Floating-Point Arithmetic.
1672 *----------------------------------------------------------------------------*/
1674 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1676 int16 aExp, bExp, zExp;
1677 bits32 aSig, bSig, zSig;
1678 int16 expDiff;
1680 aSig = extractFloat32Frac( a );
1681 aExp = extractFloat32Exp( a );
1682 bSig = extractFloat32Frac( b );
1683 bExp = extractFloat32Exp( b );
1684 expDiff = aExp - bExp;
1685 aSig <<= 7;
1686 bSig <<= 7;
1687 if ( 0 < expDiff ) goto aExpBigger;
1688 if ( expDiff < 0 ) goto bExpBigger;
1689 if ( aExp == 0xFF ) {
1690 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1691 float_raise( float_flag_invalid STATUS_VAR);
1692 return float32_default_nan;
1694 if ( aExp == 0 ) {
1695 aExp = 1;
1696 bExp = 1;
1698 if ( bSig < aSig ) goto aBigger;
1699 if ( aSig < bSig ) goto bBigger;
1700 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1701 bExpBigger:
1702 if ( bExp == 0xFF ) {
1703 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1704 return packFloat32( zSign ^ 1, 0xFF, 0 );
1706 if ( aExp == 0 ) {
1707 ++expDiff;
1709 else {
1710 aSig |= 0x40000000;
1712 shift32RightJamming( aSig, - expDiff, &aSig );
1713 bSig |= 0x40000000;
1714 bBigger:
1715 zSig = bSig - aSig;
1716 zExp = bExp;
1717 zSign ^= 1;
1718 goto normalizeRoundAndPack;
1719 aExpBigger:
1720 if ( aExp == 0xFF ) {
1721 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1722 return a;
1724 if ( bExp == 0 ) {
1725 --expDiff;
1727 else {
1728 bSig |= 0x40000000;
1730 shift32RightJamming( bSig, expDiff, &bSig );
1731 aSig |= 0x40000000;
1732 aBigger:
1733 zSig = aSig - bSig;
1734 zExp = aExp;
1735 normalizeRoundAndPack:
1736 --zExp;
1737 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1741 /*----------------------------------------------------------------------------
1742 | Returns the result of adding the single-precision floating-point values `a'
1743 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1744 | Binary Floating-Point Arithmetic.
1745 *----------------------------------------------------------------------------*/
1747 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1749 flag aSign, bSign;
1751 aSign = extractFloat32Sign( a );
1752 bSign = extractFloat32Sign( b );
1753 if ( aSign == bSign ) {
1754 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1756 else {
1757 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1762 /*----------------------------------------------------------------------------
1763 | Returns the result of subtracting the single-precision floating-point values
1764 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1765 | for Binary Floating-Point Arithmetic.
1766 *----------------------------------------------------------------------------*/
1768 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1770 flag aSign, bSign;
1772 aSign = extractFloat32Sign( a );
1773 bSign = extractFloat32Sign( b );
1774 if ( aSign == bSign ) {
1775 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1777 else {
1778 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of multiplying the single-precision floating-point values
1785 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1786 | for Binary Floating-Point Arithmetic.
1787 *----------------------------------------------------------------------------*/
1789 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1791 flag aSign, bSign, zSign;
1792 int16 aExp, bExp, zExp;
1793 bits32 aSig, bSig;
1794 bits64 zSig64;
1795 bits32 zSig;
1797 aSig = extractFloat32Frac( a );
1798 aExp = extractFloat32Exp( a );
1799 aSign = extractFloat32Sign( a );
1800 bSig = extractFloat32Frac( b );
1801 bExp = extractFloat32Exp( b );
1802 bSign = extractFloat32Sign( b );
1803 zSign = aSign ^ bSign;
1804 if ( aExp == 0xFF ) {
1805 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1806 return propagateFloat32NaN( a, b STATUS_VAR );
1808 if ( ( bExp | bSig ) == 0 ) {
1809 float_raise( float_flag_invalid STATUS_VAR);
1810 return float32_default_nan;
1812 return packFloat32( zSign, 0xFF, 0 );
1814 if ( bExp == 0xFF ) {
1815 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816 if ( ( aExp | aSig ) == 0 ) {
1817 float_raise( float_flag_invalid STATUS_VAR);
1818 return float32_default_nan;
1820 return packFloat32( zSign, 0xFF, 0 );
1822 if ( aExp == 0 ) {
1823 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1824 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1826 if ( bExp == 0 ) {
1827 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1828 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1830 zExp = aExp + bExp - 0x7F;
1831 aSig = ( aSig | 0x00800000 )<<7;
1832 bSig = ( bSig | 0x00800000 )<<8;
1833 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1834 zSig = zSig64;
1835 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1836 zSig <<= 1;
1837 --zExp;
1839 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1843 /*----------------------------------------------------------------------------
1844 | Returns the result of dividing the single-precision floating-point value `a'
1845 | by the corresponding value `b'. The operation is performed according to the
1846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847 *----------------------------------------------------------------------------*/
1849 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1851 flag aSign, bSign, zSign;
1852 int16 aExp, bExp, zExp;
1853 bits32 aSig, bSig, zSig;
1855 aSig = extractFloat32Frac( a );
1856 aExp = extractFloat32Exp( a );
1857 aSign = extractFloat32Sign( a );
1858 bSig = extractFloat32Frac( b );
1859 bExp = extractFloat32Exp( b );
1860 bSign = extractFloat32Sign( b );
1861 zSign = aSign ^ bSign;
1862 if ( aExp == 0xFF ) {
1863 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864 if ( bExp == 0xFF ) {
1865 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1866 float_raise( float_flag_invalid STATUS_VAR);
1867 return float32_default_nan;
1869 return packFloat32( zSign, 0xFF, 0 );
1871 if ( bExp == 0xFF ) {
1872 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1873 return packFloat32( zSign, 0, 0 );
1875 if ( bExp == 0 ) {
1876 if ( bSig == 0 ) {
1877 if ( ( aExp | aSig ) == 0 ) {
1878 float_raise( float_flag_invalid STATUS_VAR);
1879 return float32_default_nan;
1881 float_raise( float_flag_divbyzero STATUS_VAR);
1882 return packFloat32( zSign, 0xFF, 0 );
1884 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1886 if ( aExp == 0 ) {
1887 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1890 zExp = aExp - bExp + 0x7D;
1891 aSig = ( aSig | 0x00800000 )<<7;
1892 bSig = ( bSig | 0x00800000 )<<8;
1893 if ( bSig <= ( aSig + aSig ) ) {
1894 aSig >>= 1;
1895 ++zExp;
1897 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1898 if ( ( zSig & 0x3F ) == 0 ) {
1899 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1901 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1905 /*----------------------------------------------------------------------------
1906 | Returns the remainder of the single-precision floating-point value `a'
1907 | with respect to the corresponding value `b'. The operation is performed
1908 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909 *----------------------------------------------------------------------------*/
1911 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1913 flag aSign, zSign;
1914 int16 aExp, bExp, expDiff;
1915 bits32 aSig, bSig;
1916 bits32 q;
1917 bits64 aSig64, bSig64, q64;
1918 bits32 alternateASig;
1919 sbits32 sigMean;
1921 aSig = extractFloat32Frac( a );
1922 aExp = extractFloat32Exp( a );
1923 aSign = extractFloat32Sign( a );
1924 bSig = extractFloat32Frac( b );
1925 bExp = extractFloat32Exp( b );
1926 if ( aExp == 0xFF ) {
1927 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1928 return propagateFloat32NaN( a, b STATUS_VAR );
1930 float_raise( float_flag_invalid STATUS_VAR);
1931 return float32_default_nan;
1933 if ( bExp == 0xFF ) {
1934 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1935 return a;
1937 if ( bExp == 0 ) {
1938 if ( bSig == 0 ) {
1939 float_raise( float_flag_invalid STATUS_VAR);
1940 return float32_default_nan;
1942 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1944 if ( aExp == 0 ) {
1945 if ( aSig == 0 ) return a;
1946 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1948 expDiff = aExp - bExp;
1949 aSig |= 0x00800000;
1950 bSig |= 0x00800000;
1951 if ( expDiff < 32 ) {
1952 aSig <<= 8;
1953 bSig <<= 8;
1954 if ( expDiff < 0 ) {
1955 if ( expDiff < -1 ) return a;
1956 aSig >>= 1;
1958 q = ( bSig <= aSig );
1959 if ( q ) aSig -= bSig;
1960 if ( 0 < expDiff ) {
1961 q = ( ( (bits64) aSig )<<32 ) / bSig;
1962 q >>= 32 - expDiff;
1963 bSig >>= 2;
1964 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1966 else {
1967 aSig >>= 2;
1968 bSig >>= 2;
1971 else {
1972 if ( bSig <= aSig ) aSig -= bSig;
1973 aSig64 = ( (bits64) aSig )<<40;
1974 bSig64 = ( (bits64) bSig )<<40;
1975 expDiff -= 64;
1976 while ( 0 < expDiff ) {
1977 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1978 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1979 aSig64 = - ( ( bSig * q64 )<<38 );
1980 expDiff -= 62;
1982 expDiff += 64;
1983 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1984 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1985 q = q64>>( 64 - expDiff );
1986 bSig <<= 6;
1987 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1989 do {
1990 alternateASig = aSig;
1991 ++q;
1992 aSig -= bSig;
1993 } while ( 0 <= (sbits32) aSig );
1994 sigMean = aSig + alternateASig;
1995 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1996 aSig = alternateASig;
1998 zSign = ( (sbits32) aSig < 0 );
1999 if ( zSign ) aSig = - aSig;
2000 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2004 /*----------------------------------------------------------------------------
2005 | Returns the square root of the single-precision floating-point value `a'.
2006 | The operation is performed according to the IEC/IEEE Standard for Binary
2007 | Floating-Point Arithmetic.
2008 *----------------------------------------------------------------------------*/
2010 float32 float32_sqrt( float32 a STATUS_PARAM )
2012 flag aSign;
2013 int16 aExp, zExp;
2014 bits32 aSig, zSig;
2015 bits64 rem, term;
2017 aSig = extractFloat32Frac( a );
2018 aExp = extractFloat32Exp( a );
2019 aSign = extractFloat32Sign( a );
2020 if ( aExp == 0xFF ) {
2021 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2022 if ( ! aSign ) return a;
2023 float_raise( float_flag_invalid STATUS_VAR);
2024 return float32_default_nan;
2026 if ( aSign ) {
2027 if ( ( aExp | aSig ) == 0 ) return a;
2028 float_raise( float_flag_invalid STATUS_VAR);
2029 return float32_default_nan;
2031 if ( aExp == 0 ) {
2032 if ( aSig == 0 ) return float32_zero;
2033 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2035 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2036 aSig = ( aSig | 0x00800000 )<<8;
2037 zSig = estimateSqrt32( aExp, aSig ) + 2;
2038 if ( ( zSig & 0x7F ) <= 5 ) {
2039 if ( zSig < 2 ) {
2040 zSig = 0x7FFFFFFF;
2041 goto roundAndPack;
2043 aSig >>= aExp & 1;
2044 term = ( (bits64) zSig ) * zSig;
2045 rem = ( ( (bits64) aSig )<<32 ) - term;
2046 while ( (sbits64) rem < 0 ) {
2047 --zSig;
2048 rem += ( ( (bits64) zSig )<<1 ) | 1;
2050 zSig |= ( rem != 0 );
2052 shift32RightJamming( zSig, 1, &zSig );
2053 roundAndPack:
2054 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2058 /*----------------------------------------------------------------------------
2059 | Returns the binary log of the single-precision floating-point value `a'.
2060 | The operation is performed according to the IEC/IEEE Standard for Binary
2061 | Floating-Point Arithmetic.
2062 *----------------------------------------------------------------------------*/
2063 float32 float32_log2( float32 a STATUS_PARAM )
2065 flag aSign, zSign;
2066 int16 aExp;
2067 bits32 aSig, zSig, i;
2069 aSig = extractFloat32Frac( a );
2070 aExp = extractFloat32Exp( a );
2071 aSign = extractFloat32Sign( a );
2073 if ( aExp == 0 ) {
2074 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2075 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2077 if ( aSign ) {
2078 float_raise( float_flag_invalid STATUS_VAR);
2079 return float32_default_nan;
2081 if ( aExp == 0xFF ) {
2082 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2083 return a;
2086 aExp -= 0x7F;
2087 aSig |= 0x00800000;
2088 zSign = aExp < 0;
2089 zSig = aExp << 23;
2091 for (i = 1 << 22; i > 0; i >>= 1) {
2092 aSig = ( (bits64)aSig * aSig ) >> 23;
2093 if ( aSig & 0x01000000 ) {
2094 aSig >>= 1;
2095 zSig |= i;
2099 if ( zSign )
2100 zSig = -zSig;
2102 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2105 /*----------------------------------------------------------------------------
2106 | Returns 1 if the single-precision floating-point value `a' is equal to
2107 | the corresponding value `b', and 0 otherwise. The comparison is performed
2108 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2109 *----------------------------------------------------------------------------*/
2111 int float32_eq( float32 a, float32 b STATUS_PARAM )
2114 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2115 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2117 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2118 float_raise( float_flag_invalid STATUS_VAR);
2120 return 0;
2122 return ( float32_val(a) == float32_val(b) ) ||
2123 ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2127 /*----------------------------------------------------------------------------
2128 | Returns 1 if the single-precision floating-point value `a' is less than
2129 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2130 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2131 | Arithmetic.
2132 *----------------------------------------------------------------------------*/
2134 int float32_le( float32 a, float32 b STATUS_PARAM )
2136 flag aSign, bSign;
2137 bits32 av, bv;
2139 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2140 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2142 float_raise( float_flag_invalid STATUS_VAR);
2143 return 0;
2145 aSign = extractFloat32Sign( a );
2146 bSign = extractFloat32Sign( b );
2147 av = float32_val(a);
2148 bv = float32_val(b);
2149 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2150 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2154 /*----------------------------------------------------------------------------
2155 | Returns 1 if the single-precision floating-point value `a' is less than
2156 | the corresponding value `b', and 0 otherwise. The comparison is performed
2157 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2158 *----------------------------------------------------------------------------*/
2160 int float32_lt( float32 a, float32 b STATUS_PARAM )
2162 flag aSign, bSign;
2163 bits32 av, bv;
2165 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2166 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2168 float_raise( float_flag_invalid STATUS_VAR);
2169 return 0;
2171 aSign = extractFloat32Sign( a );
2172 bSign = extractFloat32Sign( b );
2173 av = float32_val(a);
2174 bv = float32_val(b);
2175 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2176 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2180 /*----------------------------------------------------------------------------
2181 | Returns 1 if the single-precision floating-point value `a' is equal to
2182 | the corresponding value `b', and 0 otherwise. The invalid exception is
2183 | raised if either operand is a NaN. Otherwise, the comparison is performed
2184 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2185 *----------------------------------------------------------------------------*/
2187 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2189 bits32 av, bv;
2191 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2192 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2194 float_raise( float_flag_invalid STATUS_VAR);
2195 return 0;
2197 av = float32_val(a);
2198 bv = float32_val(b);
2199 return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2203 /*----------------------------------------------------------------------------
2204 | Returns 1 if the single-precision floating-point value `a' is less than or
2205 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2206 | cause an exception. Otherwise, the comparison is performed according to the
2207 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2208 *----------------------------------------------------------------------------*/
2210 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2212 flag aSign, bSign;
2213 bits32 av, bv;
2215 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2216 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2218 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2219 float_raise( float_flag_invalid STATUS_VAR);
2221 return 0;
2223 aSign = extractFloat32Sign( a );
2224 bSign = extractFloat32Sign( b );
2225 av = float32_val(a);
2226 bv = float32_val(b);
2227 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2228 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2232 /*----------------------------------------------------------------------------
2233 | Returns 1 if the single-precision floating-point value `a' is less than
2234 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2235 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2236 | Standard for Binary Floating-Point Arithmetic.
2237 *----------------------------------------------------------------------------*/
2239 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2241 flag aSign, bSign;
2242 bits32 av, bv;
2244 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2245 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2247 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2248 float_raise( float_flag_invalid STATUS_VAR);
2250 return 0;
2252 aSign = extractFloat32Sign( a );
2253 bSign = extractFloat32Sign( b );
2254 av = float32_val(a);
2255 bv = float32_val(b);
2256 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2257 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2261 /*----------------------------------------------------------------------------
2262 | Returns the result of converting the double-precision floating-point value
2263 | `a' to the 32-bit two's complement integer format. The conversion is
2264 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2265 | Arithmetic---which means in particular that the conversion is rounded
2266 | according to the current rounding mode. If `a' is a NaN, the largest
2267 | positive integer is returned. Otherwise, if the conversion overflows, the
2268 | largest integer with the same sign as `a' is returned.
2269 *----------------------------------------------------------------------------*/
2271 int32 float64_to_int32( float64 a STATUS_PARAM )
2273 flag aSign;
2274 int16 aExp, shiftCount;
2275 bits64 aSig;
2277 aSig = extractFloat64Frac( a );
2278 aExp = extractFloat64Exp( a );
2279 aSign = extractFloat64Sign( a );
2280 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2281 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2282 shiftCount = 0x42C - aExp;
2283 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2284 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2288 /*----------------------------------------------------------------------------
2289 | Returns the result of converting the double-precision floating-point value
2290 | `a' to the 32-bit two's complement integer format. The conversion is
2291 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2292 | Arithmetic, except that the conversion is always rounded toward zero.
2293 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2294 | the conversion overflows, the largest integer with the same sign as `a' is
2295 | returned.
2296 *----------------------------------------------------------------------------*/
2298 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2300 flag aSign;
2301 int16 aExp, shiftCount;
2302 bits64 aSig, savedASig;
2303 int32 z;
2305 aSig = extractFloat64Frac( a );
2306 aExp = extractFloat64Exp( a );
2307 aSign = extractFloat64Sign( a );
2308 if ( 0x41E < aExp ) {
2309 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2310 goto invalid;
2312 else if ( aExp < 0x3FF ) {
2313 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2314 return 0;
2316 aSig |= LIT64( 0x0010000000000000 );
2317 shiftCount = 0x433 - aExp;
2318 savedASig = aSig;
2319 aSig >>= shiftCount;
2320 z = aSig;
2321 if ( aSign ) z = - z;
2322 if ( ( z < 0 ) ^ aSign ) {
2323 invalid:
2324 float_raise( float_flag_invalid STATUS_VAR);
2325 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2327 if ( ( aSig<<shiftCount ) != savedASig ) {
2328 STATUS(float_exception_flags) |= float_flag_inexact;
2330 return z;
2334 /*----------------------------------------------------------------------------
2335 | Returns the result of converting the double-precision floating-point value
2336 | `a' to the 64-bit two's complement integer format. The conversion is
2337 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2338 | Arithmetic---which means in particular that the conversion is rounded
2339 | according to the current rounding mode. If `a' is a NaN, the largest
2340 | positive integer is returned. Otherwise, if the conversion overflows, the
2341 | largest integer with the same sign as `a' is returned.
2342 *----------------------------------------------------------------------------*/
2344 int64 float64_to_int64( float64 a STATUS_PARAM )
2346 flag aSign;
2347 int16 aExp, shiftCount;
2348 bits64 aSig, aSigExtra;
2350 aSig = extractFloat64Frac( a );
2351 aExp = extractFloat64Exp( a );
2352 aSign = extractFloat64Sign( a );
2353 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2354 shiftCount = 0x433 - aExp;
2355 if ( shiftCount <= 0 ) {
2356 if ( 0x43E < aExp ) {
2357 float_raise( float_flag_invalid STATUS_VAR);
2358 if ( ! aSign
2359 || ( ( aExp == 0x7FF )
2360 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2362 return LIT64( 0x7FFFFFFFFFFFFFFF );
2364 return (sbits64) LIT64( 0x8000000000000000 );
2366 aSigExtra = 0;
2367 aSig <<= - shiftCount;
2369 else {
2370 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2372 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2376 /*----------------------------------------------------------------------------
2377 | Returns the result of converting the double-precision floating-point value
2378 | `a' to the 64-bit two's complement integer format. The conversion is
2379 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2380 | Arithmetic, except that the conversion is always rounded toward zero.
2381 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2382 | the conversion overflows, the largest integer with the same sign as `a' is
2383 | returned.
2384 *----------------------------------------------------------------------------*/
2386 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2388 flag aSign;
2389 int16 aExp, shiftCount;
2390 bits64 aSig;
2391 int64 z;
2393 aSig = extractFloat64Frac( a );
2394 aExp = extractFloat64Exp( a );
2395 aSign = extractFloat64Sign( a );
2396 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2397 shiftCount = aExp - 0x433;
2398 if ( 0 <= shiftCount ) {
2399 if ( 0x43E <= aExp ) {
2400 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2401 float_raise( float_flag_invalid STATUS_VAR);
2402 if ( ! aSign
2403 || ( ( aExp == 0x7FF )
2404 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2406 return LIT64( 0x7FFFFFFFFFFFFFFF );
2409 return (sbits64) LIT64( 0x8000000000000000 );
2411 z = aSig<<shiftCount;
2413 else {
2414 if ( aExp < 0x3FE ) {
2415 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2416 return 0;
2418 z = aSig>>( - shiftCount );
2419 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2420 STATUS(float_exception_flags) |= float_flag_inexact;
2423 if ( aSign ) z = - z;
2424 return z;
2428 /*----------------------------------------------------------------------------
2429 | Returns the result of converting the double-precision floating-point value
2430 | `a' to the single-precision floating-point format. The conversion is
2431 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2432 | Arithmetic.
2433 *----------------------------------------------------------------------------*/
2435 float32 float64_to_float32( float64 a STATUS_PARAM )
2437 flag aSign;
2438 int16 aExp;
2439 bits64 aSig;
2440 bits32 zSig;
2442 aSig = extractFloat64Frac( a );
2443 aExp = extractFloat64Exp( a );
2444 aSign = extractFloat64Sign( a );
2445 if ( aExp == 0x7FF ) {
2446 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2447 return packFloat32( aSign, 0xFF, 0 );
2449 shift64RightJamming( aSig, 22, &aSig );
2450 zSig = aSig;
2451 if ( aExp || zSig ) {
2452 zSig |= 0x40000000;
2453 aExp -= 0x381;
2455 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2460 /*----------------------------------------------------------------------------
2461 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2462 | half-precision floating-point value, returning the result. After being
2463 | shifted into the proper positions, the three fields are simply added
2464 | together to form the result. This means that any integer portion of `zSig'
2465 | will be added into the exponent. Since a properly normalized significand
2466 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2467 | than the desired result exponent whenever `zSig' is a complete, normalized
2468 | significand.
2469 *----------------------------------------------------------------------------*/
2470 static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2472 return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2475 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2476 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2478 float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2480 flag aSign;
2481 int16 aExp;
2482 bits32 aSig;
2484 aSign = a >> 15;
2485 aExp = (a >> 10) & 0x1f;
2486 aSig = a & 0x3ff;
2488 if (aExp == 0x1f && ieee) {
2489 if (aSig) {
2490 /* Make sure correct exceptions are raised. */
2491 float32ToCommonNaN(a STATUS_VAR);
2492 aSig |= 0x200;
2494 return packFloat32(aSign, 0xff, aSig << 13);
2496 if (aExp == 0) {
2497 int8 shiftCount;
2499 if (aSig == 0) {
2500 return packFloat32(aSign, 0, 0);
2503 shiftCount = countLeadingZeros32( aSig ) - 21;
2504 aSig = aSig << shiftCount;
2505 aExp = -shiftCount;
2507 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2510 bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
2512 flag aSign;
2513 int16 aExp;
2514 bits32 aSig;
2515 bits32 mask;
2516 bits32 increment;
2517 int8 roundingMode;
2519 aSig = extractFloat32Frac( a );
2520 aExp = extractFloat32Exp( a );
2521 aSign = extractFloat32Sign( a );
2522 if ( aExp == 0xFF ) {
2523 if (aSig) {
2524 /* Make sure correct exceptions are raised. */
2525 float32ToCommonNaN(a STATUS_VAR);
2526 aSig |= 0x00400000;
2528 return packFloat16(aSign, 0x1f, aSig >> 13);
2530 if (aExp == 0 && aSign == 0) {
2531 return packFloat16(aSign, 0, 0);
2533 /* Decimal point between bits 22 and 23. */
2534 aSig |= 0x00800000;
2535 aExp -= 0x7f;
2536 if (aExp < -14) {
2537 mask = 0x007fffff;
2538 if (aExp < -24) {
2539 aExp = -25;
2540 } else {
2541 mask >>= 24 + aExp;
2543 } else {
2544 mask = 0x00001fff;
2546 if (aSig & mask) {
2547 float_raise( float_flag_underflow STATUS_VAR );
2548 roundingMode = STATUS(float_rounding_mode);
2549 switch (roundingMode) {
2550 case float_round_nearest_even:
2551 increment = (mask + 1) >> 1;
2552 if ((aSig & mask) == increment) {
2553 increment = aSig & (increment << 1);
2555 break;
2556 case float_round_up:
2557 increment = aSign ? 0 : mask;
2558 break;
2559 case float_round_down:
2560 increment = aSign ? mask : 0;
2561 break;
2562 default: /* round_to_zero */
2563 increment = 0;
2564 break;
2566 aSig += increment;
2567 if (aSig >= 0x01000000) {
2568 aSig >>= 1;
2569 aExp++;
2571 } else if (aExp < -14
2572 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2573 float_raise( float_flag_underflow STATUS_VAR);
2576 if (ieee) {
2577 if (aExp > 15) {
2578 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2579 return packFloat16(aSign, 0x1f, 0);
2581 } else {
2582 if (aExp > 16) {
2583 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2584 return packFloat16(aSign, 0x1f, 0x3ff);
2587 if (aExp < -24) {
2588 return packFloat16(aSign, 0, 0);
2590 if (aExp < -14) {
2591 aSig >>= -14 - aExp;
2592 aExp = -14;
2594 return packFloat16(aSign, aExp + 14, aSig >> 13);
2597 #ifdef FLOATX80
2599 /*----------------------------------------------------------------------------
2600 | Returns the result of converting the double-precision floating-point value
2601 | `a' to the extended double-precision floating-point format. The conversion
2602 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2603 | Arithmetic.
2604 *----------------------------------------------------------------------------*/
2606 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2608 flag aSign;
2609 int16 aExp;
2610 bits64 aSig;
2612 aSig = extractFloat64Frac( a );
2613 aExp = extractFloat64Exp( a );
2614 aSign = extractFloat64Sign( a );
2615 if ( aExp == 0x7FF ) {
2616 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2617 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2619 if ( aExp == 0 ) {
2620 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2621 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2623 return
2624 packFloatx80(
2625 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2629 #endif
2631 #ifdef FLOAT128
2633 /*----------------------------------------------------------------------------
2634 | Returns the result of converting the double-precision floating-point value
2635 | `a' to the quadruple-precision floating-point format. The conversion is
2636 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2637 | Arithmetic.
2638 *----------------------------------------------------------------------------*/
2640 float128 float64_to_float128( float64 a STATUS_PARAM )
2642 flag aSign;
2643 int16 aExp;
2644 bits64 aSig, zSig0, zSig1;
2646 aSig = extractFloat64Frac( a );
2647 aExp = extractFloat64Exp( a );
2648 aSign = extractFloat64Sign( a );
2649 if ( aExp == 0x7FF ) {
2650 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2651 return packFloat128( aSign, 0x7FFF, 0, 0 );
2653 if ( aExp == 0 ) {
2654 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2655 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2656 --aExp;
2658 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2659 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2663 #endif
2665 /*----------------------------------------------------------------------------
2666 | Rounds the double-precision floating-point value `a' to an integer, and
2667 | returns the result as a double-precision floating-point value. The
2668 | operation is performed according to the IEC/IEEE Standard for Binary
2669 | Floating-Point Arithmetic.
2670 *----------------------------------------------------------------------------*/
2672 float64 float64_round_to_int( float64 a STATUS_PARAM )
2674 flag aSign;
2675 int16 aExp;
2676 bits64 lastBitMask, roundBitsMask;
2677 int8 roundingMode;
2678 bits64 z;
2680 aExp = extractFloat64Exp( a );
2681 if ( 0x433 <= aExp ) {
2682 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2683 return propagateFloat64NaN( a, a STATUS_VAR );
2685 return a;
2687 if ( aExp < 0x3FF ) {
2688 if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2689 STATUS(float_exception_flags) |= float_flag_inexact;
2690 aSign = extractFloat64Sign( a );
2691 switch ( STATUS(float_rounding_mode) ) {
2692 case float_round_nearest_even:
2693 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2694 return packFloat64( aSign, 0x3FF, 0 );
2696 break;
2697 case float_round_down:
2698 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2699 case float_round_up:
2700 return make_float64(
2701 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2703 return packFloat64( aSign, 0, 0 );
2705 lastBitMask = 1;
2706 lastBitMask <<= 0x433 - aExp;
2707 roundBitsMask = lastBitMask - 1;
2708 z = float64_val(a);
2709 roundingMode = STATUS(float_rounding_mode);
2710 if ( roundingMode == float_round_nearest_even ) {
2711 z += lastBitMask>>1;
2712 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2714 else if ( roundingMode != float_round_to_zero ) {
2715 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2716 z += roundBitsMask;
2719 z &= ~ roundBitsMask;
2720 if ( z != float64_val(a) )
2721 STATUS(float_exception_flags) |= float_flag_inexact;
2722 return make_float64(z);
2726 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2728 int oldmode;
2729 float64 res;
2730 oldmode = STATUS(float_rounding_mode);
2731 STATUS(float_rounding_mode) = float_round_to_zero;
2732 res = float64_round_to_int(a STATUS_VAR);
2733 STATUS(float_rounding_mode) = oldmode;
2734 return res;
2737 /*----------------------------------------------------------------------------
2738 | Returns the result of adding the absolute values of the double-precision
2739 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2740 | before being returned. `zSign' is ignored if the result is a NaN.
2741 | The addition is performed according to the IEC/IEEE Standard for Binary
2742 | Floating-Point Arithmetic.
2743 *----------------------------------------------------------------------------*/
2745 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2747 int16 aExp, bExp, zExp;
2748 bits64 aSig, bSig, zSig;
2749 int16 expDiff;
2751 aSig = extractFloat64Frac( a );
2752 aExp = extractFloat64Exp( a );
2753 bSig = extractFloat64Frac( b );
2754 bExp = extractFloat64Exp( b );
2755 expDiff = aExp - bExp;
2756 aSig <<= 9;
2757 bSig <<= 9;
2758 if ( 0 < expDiff ) {
2759 if ( aExp == 0x7FF ) {
2760 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2761 return a;
2763 if ( bExp == 0 ) {
2764 --expDiff;
2766 else {
2767 bSig |= LIT64( 0x2000000000000000 );
2769 shift64RightJamming( bSig, expDiff, &bSig );
2770 zExp = aExp;
2772 else if ( expDiff < 0 ) {
2773 if ( bExp == 0x7FF ) {
2774 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2775 return packFloat64( zSign, 0x7FF, 0 );
2777 if ( aExp == 0 ) {
2778 ++expDiff;
2780 else {
2781 aSig |= LIT64( 0x2000000000000000 );
2783 shift64RightJamming( aSig, - expDiff, &aSig );
2784 zExp = bExp;
2786 else {
2787 if ( aExp == 0x7FF ) {
2788 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2789 return a;
2791 if ( aExp == 0 ) {
2792 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2793 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2795 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2796 zExp = aExp;
2797 goto roundAndPack;
2799 aSig |= LIT64( 0x2000000000000000 );
2800 zSig = ( aSig + bSig )<<1;
2801 --zExp;
2802 if ( (sbits64) zSig < 0 ) {
2803 zSig = aSig + bSig;
2804 ++zExp;
2806 roundAndPack:
2807 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2811 /*----------------------------------------------------------------------------
2812 | Returns the result of subtracting the absolute values of the double-
2813 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2814 | difference is negated before being returned. `zSign' is ignored if the
2815 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2816 | Standard for Binary Floating-Point Arithmetic.
2817 *----------------------------------------------------------------------------*/
2819 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2821 int16 aExp, bExp, zExp;
2822 bits64 aSig, bSig, zSig;
2823 int16 expDiff;
2825 aSig = extractFloat64Frac( a );
2826 aExp = extractFloat64Exp( a );
2827 bSig = extractFloat64Frac( b );
2828 bExp = extractFloat64Exp( b );
2829 expDiff = aExp - bExp;
2830 aSig <<= 10;
2831 bSig <<= 10;
2832 if ( 0 < expDiff ) goto aExpBigger;
2833 if ( expDiff < 0 ) goto bExpBigger;
2834 if ( aExp == 0x7FF ) {
2835 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2836 float_raise( float_flag_invalid STATUS_VAR);
2837 return float64_default_nan;
2839 if ( aExp == 0 ) {
2840 aExp = 1;
2841 bExp = 1;
2843 if ( bSig < aSig ) goto aBigger;
2844 if ( aSig < bSig ) goto bBigger;
2845 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2846 bExpBigger:
2847 if ( bExp == 0x7FF ) {
2848 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2849 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2851 if ( aExp == 0 ) {
2852 ++expDiff;
2854 else {
2855 aSig |= LIT64( 0x4000000000000000 );
2857 shift64RightJamming( aSig, - expDiff, &aSig );
2858 bSig |= LIT64( 0x4000000000000000 );
2859 bBigger:
2860 zSig = bSig - aSig;
2861 zExp = bExp;
2862 zSign ^= 1;
2863 goto normalizeRoundAndPack;
2864 aExpBigger:
2865 if ( aExp == 0x7FF ) {
2866 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2867 return a;
2869 if ( bExp == 0 ) {
2870 --expDiff;
2872 else {
2873 bSig |= LIT64( 0x4000000000000000 );
2875 shift64RightJamming( bSig, expDiff, &bSig );
2876 aSig |= LIT64( 0x4000000000000000 );
2877 aBigger:
2878 zSig = aSig - bSig;
2879 zExp = aExp;
2880 normalizeRoundAndPack:
2881 --zExp;
2882 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2886 /*----------------------------------------------------------------------------
2887 | Returns the result of adding the double-precision floating-point values `a'
2888 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2889 | Binary Floating-Point Arithmetic.
2890 *----------------------------------------------------------------------------*/
2892 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2894 flag aSign, bSign;
2896 aSign = extractFloat64Sign( a );
2897 bSign = extractFloat64Sign( b );
2898 if ( aSign == bSign ) {
2899 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2901 else {
2902 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2907 /*----------------------------------------------------------------------------
2908 | Returns the result of subtracting the double-precision floating-point values
2909 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2910 | for Binary Floating-Point Arithmetic.
2911 *----------------------------------------------------------------------------*/
2913 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2915 flag aSign, bSign;
2917 aSign = extractFloat64Sign( a );
2918 bSign = extractFloat64Sign( b );
2919 if ( aSign == bSign ) {
2920 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2922 else {
2923 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2928 /*----------------------------------------------------------------------------
2929 | Returns the result of multiplying the double-precision floating-point values
2930 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2931 | for Binary Floating-Point Arithmetic.
2932 *----------------------------------------------------------------------------*/
2934 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2936 flag aSign, bSign, zSign;
2937 int16 aExp, bExp, zExp;
2938 bits64 aSig, bSig, zSig0, zSig1;
2940 aSig = extractFloat64Frac( a );
2941 aExp = extractFloat64Exp( a );
2942 aSign = extractFloat64Sign( a );
2943 bSig = extractFloat64Frac( b );
2944 bExp = extractFloat64Exp( b );
2945 bSign = extractFloat64Sign( b );
2946 zSign = aSign ^ bSign;
2947 if ( aExp == 0x7FF ) {
2948 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2949 return propagateFloat64NaN( a, b STATUS_VAR );
2951 if ( ( bExp | bSig ) == 0 ) {
2952 float_raise( float_flag_invalid STATUS_VAR);
2953 return float64_default_nan;
2955 return packFloat64( zSign, 0x7FF, 0 );
2957 if ( bExp == 0x7FF ) {
2958 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2959 if ( ( aExp | aSig ) == 0 ) {
2960 float_raise( float_flag_invalid STATUS_VAR);
2961 return float64_default_nan;
2963 return packFloat64( zSign, 0x7FF, 0 );
2965 if ( aExp == 0 ) {
2966 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2967 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2969 if ( bExp == 0 ) {
2970 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2971 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2973 zExp = aExp + bExp - 0x3FF;
2974 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2975 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2976 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2977 zSig0 |= ( zSig1 != 0 );
2978 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2979 zSig0 <<= 1;
2980 --zExp;
2982 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2986 /*----------------------------------------------------------------------------
2987 | Returns the result of dividing the double-precision floating-point value `a'
2988 | by the corresponding value `b'. The operation is performed according to
2989 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2990 *----------------------------------------------------------------------------*/
2992 float64 float64_div( float64 a, float64 b STATUS_PARAM )
2994 flag aSign, bSign, zSign;
2995 int16 aExp, bExp, zExp;
2996 bits64 aSig, bSig, zSig;
2997 bits64 rem0, rem1;
2998 bits64 term0, term1;
3000 aSig = extractFloat64Frac( a );
3001 aExp = extractFloat64Exp( a );
3002 aSign = extractFloat64Sign( a );
3003 bSig = extractFloat64Frac( b );
3004 bExp = extractFloat64Exp( b );
3005 bSign = extractFloat64Sign( b );
3006 zSign = aSign ^ bSign;
3007 if ( aExp == 0x7FF ) {
3008 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3009 if ( bExp == 0x7FF ) {
3010 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3011 float_raise( float_flag_invalid STATUS_VAR);
3012 return float64_default_nan;
3014 return packFloat64( zSign, 0x7FF, 0 );
3016 if ( bExp == 0x7FF ) {
3017 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3018 return packFloat64( zSign, 0, 0 );
3020 if ( bExp == 0 ) {
3021 if ( bSig == 0 ) {
3022 if ( ( aExp | aSig ) == 0 ) {
3023 float_raise( float_flag_invalid STATUS_VAR);
3024 return float64_default_nan;
3026 float_raise( float_flag_divbyzero STATUS_VAR);
3027 return packFloat64( zSign, 0x7FF, 0 );
3029 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3031 if ( aExp == 0 ) {
3032 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3033 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3035 zExp = aExp - bExp + 0x3FD;
3036 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3037 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3038 if ( bSig <= ( aSig + aSig ) ) {
3039 aSig >>= 1;
3040 ++zExp;
3042 zSig = estimateDiv128To64( aSig, 0, bSig );
3043 if ( ( zSig & 0x1FF ) <= 2 ) {
3044 mul64To128( bSig, zSig, &term0, &term1 );
3045 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3046 while ( (sbits64) rem0 < 0 ) {
3047 --zSig;
3048 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3050 zSig |= ( rem1 != 0 );
3052 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3056 /*----------------------------------------------------------------------------
3057 | Returns the remainder of the double-precision floating-point value `a'
3058 | with respect to the corresponding value `b'. The operation is performed
3059 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3060 *----------------------------------------------------------------------------*/
3062 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3064 flag aSign, zSign;
3065 int16 aExp, bExp, expDiff;
3066 bits64 aSig, bSig;
3067 bits64 q, alternateASig;
3068 sbits64 sigMean;
3070 aSig = extractFloat64Frac( a );
3071 aExp = extractFloat64Exp( a );
3072 aSign = extractFloat64Sign( a );
3073 bSig = extractFloat64Frac( b );
3074 bExp = extractFloat64Exp( b );
3075 if ( aExp == 0x7FF ) {
3076 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3077 return propagateFloat64NaN( a, b STATUS_VAR );
3079 float_raise( float_flag_invalid STATUS_VAR);
3080 return float64_default_nan;
3082 if ( bExp == 0x7FF ) {
3083 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3084 return a;
3086 if ( bExp == 0 ) {
3087 if ( bSig == 0 ) {
3088 float_raise( float_flag_invalid STATUS_VAR);
3089 return float64_default_nan;
3091 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3093 if ( aExp == 0 ) {
3094 if ( aSig == 0 ) return a;
3095 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3097 expDiff = aExp - bExp;
3098 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3099 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3100 if ( expDiff < 0 ) {
3101 if ( expDiff < -1 ) return a;
3102 aSig >>= 1;
3104 q = ( bSig <= aSig );
3105 if ( q ) aSig -= bSig;
3106 expDiff -= 64;
3107 while ( 0 < expDiff ) {
3108 q = estimateDiv128To64( aSig, 0, bSig );
3109 q = ( 2 < q ) ? q - 2 : 0;
3110 aSig = - ( ( bSig>>2 ) * q );
3111 expDiff -= 62;
3113 expDiff += 64;
3114 if ( 0 < expDiff ) {
3115 q = estimateDiv128To64( aSig, 0, bSig );
3116 q = ( 2 < q ) ? q - 2 : 0;
3117 q >>= 64 - expDiff;
3118 bSig >>= 2;
3119 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3121 else {
3122 aSig >>= 2;
3123 bSig >>= 2;
3125 do {
3126 alternateASig = aSig;
3127 ++q;
3128 aSig -= bSig;
3129 } while ( 0 <= (sbits64) aSig );
3130 sigMean = aSig + alternateASig;
3131 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3132 aSig = alternateASig;
3134 zSign = ( (sbits64) aSig < 0 );
3135 if ( zSign ) aSig = - aSig;
3136 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3140 /*----------------------------------------------------------------------------
3141 | Returns the square root of the double-precision floating-point value `a'.
3142 | The operation is performed according to the IEC/IEEE Standard for Binary
3143 | Floating-Point Arithmetic.
3144 *----------------------------------------------------------------------------*/
3146 float64 float64_sqrt( float64 a STATUS_PARAM )
3148 flag aSign;
3149 int16 aExp, zExp;
3150 bits64 aSig, zSig, doubleZSig;
3151 bits64 rem0, rem1, term0, term1;
3153 aSig = extractFloat64Frac( a );
3154 aExp = extractFloat64Exp( a );
3155 aSign = extractFloat64Sign( a );
3156 if ( aExp == 0x7FF ) {
3157 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3158 if ( ! aSign ) return a;
3159 float_raise( float_flag_invalid STATUS_VAR);
3160 return float64_default_nan;
3162 if ( aSign ) {
3163 if ( ( aExp | aSig ) == 0 ) return a;
3164 float_raise( float_flag_invalid STATUS_VAR);
3165 return float64_default_nan;
3167 if ( aExp == 0 ) {
3168 if ( aSig == 0 ) return float64_zero;
3169 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3171 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3172 aSig |= LIT64( 0x0010000000000000 );
3173 zSig = estimateSqrt32( aExp, aSig>>21 );
3174 aSig <<= 9 - ( aExp & 1 );
3175 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3176 if ( ( zSig & 0x1FF ) <= 5 ) {
3177 doubleZSig = zSig<<1;
3178 mul64To128( zSig, zSig, &term0, &term1 );
3179 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3180 while ( (sbits64) rem0 < 0 ) {
3181 --zSig;
3182 doubleZSig -= 2;
3183 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3185 zSig |= ( ( rem0 | rem1 ) != 0 );
3187 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3191 /*----------------------------------------------------------------------------
3192 | Returns the binary log of the double-precision floating-point value `a'.
3193 | The operation is performed according to the IEC/IEEE Standard for Binary
3194 | Floating-Point Arithmetic.
3195 *----------------------------------------------------------------------------*/
3196 float64 float64_log2( float64 a STATUS_PARAM )
3198 flag aSign, zSign;
3199 int16 aExp;
3200 bits64 aSig, aSig0, aSig1, zSig, i;
3202 aSig = extractFloat64Frac( a );
3203 aExp = extractFloat64Exp( a );
3204 aSign = extractFloat64Sign( a );
3206 if ( aExp == 0 ) {
3207 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3208 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3210 if ( aSign ) {
3211 float_raise( float_flag_invalid STATUS_VAR);
3212 return float64_default_nan;
3214 if ( aExp == 0x7FF ) {
3215 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3216 return a;
3219 aExp -= 0x3FF;
3220 aSig |= LIT64( 0x0010000000000000 );
3221 zSign = aExp < 0;
3222 zSig = (bits64)aExp << 52;
3223 for (i = 1LL << 51; i > 0; i >>= 1) {
3224 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3225 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3226 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3227 aSig >>= 1;
3228 zSig |= i;
3232 if ( zSign )
3233 zSig = -zSig;
3234 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3237 /*----------------------------------------------------------------------------
3238 | Returns 1 if the double-precision floating-point value `a' is equal to the
3239 | corresponding value `b', and 0 otherwise. The comparison is performed
3240 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3241 *----------------------------------------------------------------------------*/
3243 int float64_eq( float64 a, float64 b STATUS_PARAM )
3245 bits64 av, bv;
3247 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3248 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3250 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3251 float_raise( float_flag_invalid STATUS_VAR);
3253 return 0;
3255 av = float64_val(a);
3256 bv = float64_val(b);
3257 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3261 /*----------------------------------------------------------------------------
3262 | Returns 1 if the double-precision floating-point value `a' is less than or
3263 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3264 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3265 | Arithmetic.
3266 *----------------------------------------------------------------------------*/
3268 int float64_le( float64 a, float64 b STATUS_PARAM )
3270 flag aSign, bSign;
3271 bits64 av, bv;
3273 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3274 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3276 float_raise( float_flag_invalid STATUS_VAR);
3277 return 0;
3279 aSign = extractFloat64Sign( a );
3280 bSign = extractFloat64Sign( b );
3281 av = float64_val(a);
3282 bv = float64_val(b);
3283 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3284 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3288 /*----------------------------------------------------------------------------
3289 | Returns 1 if the double-precision floating-point value `a' is less than
3290 | the corresponding value `b', and 0 otherwise. The comparison is performed
3291 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3292 *----------------------------------------------------------------------------*/
3294 int float64_lt( float64 a, float64 b STATUS_PARAM )
3296 flag aSign, bSign;
3297 bits64 av, bv;
3299 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3300 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3302 float_raise( float_flag_invalid STATUS_VAR);
3303 return 0;
3305 aSign = extractFloat64Sign( a );
3306 bSign = extractFloat64Sign( b );
3307 av = float64_val(a);
3308 bv = float64_val(b);
3309 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3310 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3314 /*----------------------------------------------------------------------------
3315 | Returns 1 if the double-precision floating-point value `a' is equal to the
3316 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3317 | if either operand is a NaN. Otherwise, the comparison is performed
3318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3319 *----------------------------------------------------------------------------*/
3321 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3323 bits64 av, bv;
3325 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3326 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3328 float_raise( float_flag_invalid STATUS_VAR);
3329 return 0;
3331 av = float64_val(a);
3332 bv = float64_val(b);
3333 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3337 /*----------------------------------------------------------------------------
3338 | Returns 1 if the double-precision floating-point value `a' is less than or
3339 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3340 | cause an exception. Otherwise, the comparison is performed according to the
3341 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3342 *----------------------------------------------------------------------------*/
3344 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3346 flag aSign, bSign;
3347 bits64 av, bv;
3349 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3350 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3352 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3353 float_raise( float_flag_invalid STATUS_VAR);
3355 return 0;
3357 aSign = extractFloat64Sign( a );
3358 bSign = extractFloat64Sign( b );
3359 av = float64_val(a);
3360 bv = float64_val(b);
3361 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3362 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3366 /*----------------------------------------------------------------------------
3367 | Returns 1 if the double-precision floating-point value `a' is less than
3368 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3369 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3370 | Standard for Binary Floating-Point Arithmetic.
3371 *----------------------------------------------------------------------------*/
3373 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3375 flag aSign, bSign;
3376 bits64 av, bv;
3378 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3379 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3381 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3382 float_raise( float_flag_invalid STATUS_VAR);
3384 return 0;
3386 aSign = extractFloat64Sign( a );
3387 bSign = extractFloat64Sign( b );
3388 av = float64_val(a);
3389 bv = float64_val(b);
3390 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3391 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3395 #ifdef FLOATX80
3397 /*----------------------------------------------------------------------------
3398 | Returns the result of converting the extended double-precision floating-
3399 | point value `a' to the 32-bit two's complement integer format. The
3400 | conversion is performed according to the IEC/IEEE Standard for Binary
3401 | Floating-Point Arithmetic---which means in particular that the conversion
3402 | is rounded according to the current rounding mode. If `a' is a NaN, the
3403 | largest positive integer is returned. Otherwise, if the conversion
3404 | overflows, the largest integer with the same sign as `a' is returned.
3405 *----------------------------------------------------------------------------*/
3407 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3409 flag aSign;
3410 int32 aExp, shiftCount;
3411 bits64 aSig;
3413 aSig = extractFloatx80Frac( a );
3414 aExp = extractFloatx80Exp( a );
3415 aSign = extractFloatx80Sign( a );
3416 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3417 shiftCount = 0x4037 - aExp;
3418 if ( shiftCount <= 0 ) shiftCount = 1;
3419 shift64RightJamming( aSig, shiftCount, &aSig );
3420 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3424 /*----------------------------------------------------------------------------
3425 | Returns the result of converting the extended double-precision floating-
3426 | point value `a' to the 32-bit two's complement integer format. The
3427 | conversion is performed according to the IEC/IEEE Standard for Binary
3428 | Floating-Point Arithmetic, except that the conversion is always rounded
3429 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3430 | Otherwise, if the conversion overflows, the largest integer with the same
3431 | sign as `a' is returned.
3432 *----------------------------------------------------------------------------*/
3434 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3436 flag aSign;
3437 int32 aExp, shiftCount;
3438 bits64 aSig, savedASig;
3439 int32 z;
3441 aSig = extractFloatx80Frac( a );
3442 aExp = extractFloatx80Exp( a );
3443 aSign = extractFloatx80Sign( a );
3444 if ( 0x401E < aExp ) {
3445 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3446 goto invalid;
3448 else if ( aExp < 0x3FFF ) {
3449 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3450 return 0;
3452 shiftCount = 0x403E - aExp;
3453 savedASig = aSig;
3454 aSig >>= shiftCount;
3455 z = aSig;
3456 if ( aSign ) z = - z;
3457 if ( ( z < 0 ) ^ aSign ) {
3458 invalid:
3459 float_raise( float_flag_invalid STATUS_VAR);
3460 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3462 if ( ( aSig<<shiftCount ) != savedASig ) {
3463 STATUS(float_exception_flags) |= float_flag_inexact;
3465 return z;
3469 /*----------------------------------------------------------------------------
3470 | Returns the result of converting the extended double-precision floating-
3471 | point value `a' to the 64-bit two's complement integer format. The
3472 | conversion is performed according to the IEC/IEEE Standard for Binary
3473 | Floating-Point Arithmetic---which means in particular that the conversion
3474 | is rounded according to the current rounding mode. If `a' is a NaN,
3475 | the largest positive integer is returned. Otherwise, if the conversion
3476 | overflows, the largest integer with the same sign as `a' is returned.
3477 *----------------------------------------------------------------------------*/
3479 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3481 flag aSign;
3482 int32 aExp, shiftCount;
3483 bits64 aSig, aSigExtra;
3485 aSig = extractFloatx80Frac( a );
3486 aExp = extractFloatx80Exp( a );
3487 aSign = extractFloatx80Sign( a );
3488 shiftCount = 0x403E - aExp;
3489 if ( shiftCount <= 0 ) {
3490 if ( shiftCount ) {
3491 float_raise( float_flag_invalid STATUS_VAR);
3492 if ( ! aSign
3493 || ( ( aExp == 0x7FFF )
3494 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3496 return LIT64( 0x7FFFFFFFFFFFFFFF );
3498 return (sbits64) LIT64( 0x8000000000000000 );
3500 aSigExtra = 0;
3502 else {
3503 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3505 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3509 /*----------------------------------------------------------------------------
3510 | Returns the result of converting the extended double-precision floating-
3511 | point value `a' to the 64-bit two's complement integer format. The
3512 | conversion is performed according to the IEC/IEEE Standard for Binary
3513 | Floating-Point Arithmetic, except that the conversion is always rounded
3514 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3515 | Otherwise, if the conversion overflows, the largest integer with the same
3516 | sign as `a' is returned.
3517 *----------------------------------------------------------------------------*/
3519 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3521 flag aSign;
3522 int32 aExp, shiftCount;
3523 bits64 aSig;
3524 int64 z;
3526 aSig = extractFloatx80Frac( a );
3527 aExp = extractFloatx80Exp( a );
3528 aSign = extractFloatx80Sign( a );
3529 shiftCount = aExp - 0x403E;
3530 if ( 0 <= shiftCount ) {
3531 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3532 if ( ( a.high != 0xC03E ) || aSig ) {
3533 float_raise( float_flag_invalid STATUS_VAR);
3534 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3535 return LIT64( 0x7FFFFFFFFFFFFFFF );
3538 return (sbits64) LIT64( 0x8000000000000000 );
3540 else if ( aExp < 0x3FFF ) {
3541 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3542 return 0;
3544 z = aSig>>( - shiftCount );
3545 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3546 STATUS(float_exception_flags) |= float_flag_inexact;
3548 if ( aSign ) z = - z;
3549 return z;
3553 /*----------------------------------------------------------------------------
3554 | Returns the result of converting the extended double-precision floating-
3555 | point value `a' to the single-precision floating-point format. The
3556 | conversion is performed according to the IEC/IEEE Standard for Binary
3557 | Floating-Point Arithmetic.
3558 *----------------------------------------------------------------------------*/
3560 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3562 flag aSign;
3563 int32 aExp;
3564 bits64 aSig;
3566 aSig = extractFloatx80Frac( a );
3567 aExp = extractFloatx80Exp( a );
3568 aSign = extractFloatx80Sign( a );
3569 if ( aExp == 0x7FFF ) {
3570 if ( (bits64) ( aSig<<1 ) ) {
3571 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3573 return packFloat32( aSign, 0xFF, 0 );
3575 shift64RightJamming( aSig, 33, &aSig );
3576 if ( aExp || aSig ) aExp -= 0x3F81;
3577 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3581 /*----------------------------------------------------------------------------
3582 | Returns the result of converting the extended double-precision floating-
3583 | point value `a' to the double-precision floating-point format. The
3584 | conversion is performed according to the IEC/IEEE Standard for Binary
3585 | Floating-Point Arithmetic.
3586 *----------------------------------------------------------------------------*/
3588 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3590 flag aSign;
3591 int32 aExp;
3592 bits64 aSig, zSig;
3594 aSig = extractFloatx80Frac( a );
3595 aExp = extractFloatx80Exp( a );
3596 aSign = extractFloatx80Sign( a );
3597 if ( aExp == 0x7FFF ) {
3598 if ( (bits64) ( aSig<<1 ) ) {
3599 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3601 return packFloat64( aSign, 0x7FF, 0 );
3603 shift64RightJamming( aSig, 1, &zSig );
3604 if ( aExp || aSig ) aExp -= 0x3C01;
3605 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3609 #ifdef FLOAT128
3611 /*----------------------------------------------------------------------------
3612 | Returns the result of converting the extended double-precision floating-
3613 | point value `a' to the quadruple-precision floating-point format. The
3614 | conversion is performed according to the IEC/IEEE Standard for Binary
3615 | Floating-Point Arithmetic.
3616 *----------------------------------------------------------------------------*/
3618 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3620 flag aSign;
3621 int16 aExp;
3622 bits64 aSig, zSig0, zSig1;
3624 aSig = extractFloatx80Frac( a );
3625 aExp = extractFloatx80Exp( a );
3626 aSign = extractFloatx80Sign( a );
3627 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3628 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3630 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3631 return packFloat128( aSign, aExp, zSig0, zSig1 );
3635 #endif
3637 /*----------------------------------------------------------------------------
3638 | Rounds the extended double-precision floating-point value `a' to an integer,
3639 | and returns the result as an extended quadruple-precision floating-point
3640 | value. The operation is performed according to the IEC/IEEE Standard for
3641 | Binary Floating-Point Arithmetic.
3642 *----------------------------------------------------------------------------*/
3644 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3646 flag aSign;
3647 int32 aExp;
3648 bits64 lastBitMask, roundBitsMask;
3649 int8 roundingMode;
3650 floatx80 z;
3652 aExp = extractFloatx80Exp( a );
3653 if ( 0x403E <= aExp ) {
3654 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3655 return propagateFloatx80NaN( a, a STATUS_VAR );
3657 return a;
3659 if ( aExp < 0x3FFF ) {
3660 if ( ( aExp == 0 )
3661 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3662 return a;
3664 STATUS(float_exception_flags) |= float_flag_inexact;
3665 aSign = extractFloatx80Sign( a );
3666 switch ( STATUS(float_rounding_mode) ) {
3667 case float_round_nearest_even:
3668 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3670 return
3671 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3673 break;
3674 case float_round_down:
3675 return
3676 aSign ?
3677 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3678 : packFloatx80( 0, 0, 0 );
3679 case float_round_up:
3680 return
3681 aSign ? packFloatx80( 1, 0, 0 )
3682 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3684 return packFloatx80( aSign, 0, 0 );
3686 lastBitMask = 1;
3687 lastBitMask <<= 0x403E - aExp;
3688 roundBitsMask = lastBitMask - 1;
3689 z = a;
3690 roundingMode = STATUS(float_rounding_mode);
3691 if ( roundingMode == float_round_nearest_even ) {
3692 z.low += lastBitMask>>1;
3693 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3695 else if ( roundingMode != float_round_to_zero ) {
3696 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3697 z.low += roundBitsMask;
3700 z.low &= ~ roundBitsMask;
3701 if ( z.low == 0 ) {
3702 ++z.high;
3703 z.low = LIT64( 0x8000000000000000 );
3705 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3706 return z;
3710 /*----------------------------------------------------------------------------
3711 | Returns the result of adding the absolute values of the extended double-
3712 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3713 | negated before being returned. `zSign' is ignored if the result is a NaN.
3714 | The addition is performed according to the IEC/IEEE Standard for Binary
3715 | Floating-Point Arithmetic.
3716 *----------------------------------------------------------------------------*/
3718 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3720 int32 aExp, bExp, zExp;
3721 bits64 aSig, bSig, zSig0, zSig1;
3722 int32 expDiff;
3724 aSig = extractFloatx80Frac( a );
3725 aExp = extractFloatx80Exp( a );
3726 bSig = extractFloatx80Frac( b );
3727 bExp = extractFloatx80Exp( b );
3728 expDiff = aExp - bExp;
3729 if ( 0 < expDiff ) {
3730 if ( aExp == 0x7FFF ) {
3731 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3732 return a;
3734 if ( bExp == 0 ) --expDiff;
3735 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3736 zExp = aExp;
3738 else if ( expDiff < 0 ) {
3739 if ( bExp == 0x7FFF ) {
3740 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3741 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3743 if ( aExp == 0 ) ++expDiff;
3744 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3745 zExp = bExp;
3747 else {
3748 if ( aExp == 0x7FFF ) {
3749 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3750 return propagateFloatx80NaN( a, b STATUS_VAR );
3752 return a;
3754 zSig1 = 0;
3755 zSig0 = aSig + bSig;
3756 if ( aExp == 0 ) {
3757 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3758 goto roundAndPack;
3760 zExp = aExp;
3761 goto shiftRight1;
3763 zSig0 = aSig + bSig;
3764 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3765 shiftRight1:
3766 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3767 zSig0 |= LIT64( 0x8000000000000000 );
3768 ++zExp;
3769 roundAndPack:
3770 return
3771 roundAndPackFloatx80(
3772 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3776 /*----------------------------------------------------------------------------
3777 | Returns the result of subtracting the absolute values of the extended
3778 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3779 | difference is negated before being returned. `zSign' is ignored if the
3780 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3781 | Standard for Binary Floating-Point Arithmetic.
3782 *----------------------------------------------------------------------------*/
3784 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3786 int32 aExp, bExp, zExp;
3787 bits64 aSig, bSig, zSig0, zSig1;
3788 int32 expDiff;
3789 floatx80 z;
3791 aSig = extractFloatx80Frac( a );
3792 aExp = extractFloatx80Exp( a );
3793 bSig = extractFloatx80Frac( b );
3794 bExp = extractFloatx80Exp( b );
3795 expDiff = aExp - bExp;
3796 if ( 0 < expDiff ) goto aExpBigger;
3797 if ( expDiff < 0 ) goto bExpBigger;
3798 if ( aExp == 0x7FFF ) {
3799 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3800 return propagateFloatx80NaN( a, b STATUS_VAR );
3802 float_raise( float_flag_invalid STATUS_VAR);
3803 z.low = floatx80_default_nan_low;
3804 z.high = floatx80_default_nan_high;
3805 return z;
3807 if ( aExp == 0 ) {
3808 aExp = 1;
3809 bExp = 1;
3811 zSig1 = 0;
3812 if ( bSig < aSig ) goto aBigger;
3813 if ( aSig < bSig ) goto bBigger;
3814 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3815 bExpBigger:
3816 if ( bExp == 0x7FFF ) {
3817 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3818 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3820 if ( aExp == 0 ) ++expDiff;
3821 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3822 bBigger:
3823 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3824 zExp = bExp;
3825 zSign ^= 1;
3826 goto normalizeRoundAndPack;
3827 aExpBigger:
3828 if ( aExp == 0x7FFF ) {
3829 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3830 return a;
3832 if ( bExp == 0 ) --expDiff;
3833 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3834 aBigger:
3835 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3836 zExp = aExp;
3837 normalizeRoundAndPack:
3838 return
3839 normalizeRoundAndPackFloatx80(
3840 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3844 /*----------------------------------------------------------------------------
3845 | Returns the result of adding the extended double-precision floating-point
3846 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3847 | Standard for Binary Floating-Point Arithmetic.
3848 *----------------------------------------------------------------------------*/
3850 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3852 flag aSign, bSign;
3854 aSign = extractFloatx80Sign( a );
3855 bSign = extractFloatx80Sign( b );
3856 if ( aSign == bSign ) {
3857 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3859 else {
3860 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3865 /*----------------------------------------------------------------------------
3866 | Returns the result of subtracting the extended double-precision floating-
3867 | point values `a' and `b'. The operation is performed according to the
3868 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3869 *----------------------------------------------------------------------------*/
3871 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3873 flag aSign, bSign;
3875 aSign = extractFloatx80Sign( a );
3876 bSign = extractFloatx80Sign( b );
3877 if ( aSign == bSign ) {
3878 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3880 else {
3881 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3886 /*----------------------------------------------------------------------------
3887 | Returns the result of multiplying the extended double-precision floating-
3888 | point values `a' and `b'. The operation is performed according to the
3889 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3890 *----------------------------------------------------------------------------*/
3892 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3894 flag aSign, bSign, zSign;
3895 int32 aExp, bExp, zExp;
3896 bits64 aSig, bSig, zSig0, zSig1;
3897 floatx80 z;
3899 aSig = extractFloatx80Frac( a );
3900 aExp = extractFloatx80Exp( a );
3901 aSign = extractFloatx80Sign( a );
3902 bSig = extractFloatx80Frac( b );
3903 bExp = extractFloatx80Exp( b );
3904 bSign = extractFloatx80Sign( b );
3905 zSign = aSign ^ bSign;
3906 if ( aExp == 0x7FFF ) {
3907 if ( (bits64) ( aSig<<1 )
3908 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3909 return propagateFloatx80NaN( a, b STATUS_VAR );
3911 if ( ( bExp | bSig ) == 0 ) goto invalid;
3912 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3914 if ( bExp == 0x7FFF ) {
3915 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3916 if ( ( aExp | aSig ) == 0 ) {
3917 invalid:
3918 float_raise( float_flag_invalid STATUS_VAR);
3919 z.low = floatx80_default_nan_low;
3920 z.high = floatx80_default_nan_high;
3921 return z;
3923 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3925 if ( aExp == 0 ) {
3926 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3927 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3929 if ( bExp == 0 ) {
3930 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3931 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3933 zExp = aExp + bExp - 0x3FFE;
3934 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3935 if ( 0 < (sbits64) zSig0 ) {
3936 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3937 --zExp;
3939 return
3940 roundAndPackFloatx80(
3941 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3945 /*----------------------------------------------------------------------------
3946 | Returns the result of dividing the extended double-precision floating-point
3947 | value `a' by the corresponding value `b'. The operation is performed
3948 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3949 *----------------------------------------------------------------------------*/
3951 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3953 flag aSign, bSign, zSign;
3954 int32 aExp, bExp, zExp;
3955 bits64 aSig, bSig, zSig0, zSig1;
3956 bits64 rem0, rem1, rem2, term0, term1, term2;
3957 floatx80 z;
3959 aSig = extractFloatx80Frac( a );
3960 aExp = extractFloatx80Exp( a );
3961 aSign = extractFloatx80Sign( a );
3962 bSig = extractFloatx80Frac( b );
3963 bExp = extractFloatx80Exp( b );
3964 bSign = extractFloatx80Sign( b );
3965 zSign = aSign ^ bSign;
3966 if ( aExp == 0x7FFF ) {
3967 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3968 if ( bExp == 0x7FFF ) {
3969 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3970 goto invalid;
3972 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3974 if ( bExp == 0x7FFF ) {
3975 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3976 return packFloatx80( zSign, 0, 0 );
3978 if ( bExp == 0 ) {
3979 if ( bSig == 0 ) {
3980 if ( ( aExp | aSig ) == 0 ) {
3981 invalid:
3982 float_raise( float_flag_invalid STATUS_VAR);
3983 z.low = floatx80_default_nan_low;
3984 z.high = floatx80_default_nan_high;
3985 return z;
3987 float_raise( float_flag_divbyzero STATUS_VAR);
3988 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3990 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3992 if ( aExp == 0 ) {
3993 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3994 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3996 zExp = aExp - bExp + 0x3FFE;
3997 rem1 = 0;
3998 if ( bSig <= aSig ) {
3999 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4000 ++zExp;
4002 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4003 mul64To128( bSig, zSig0, &term0, &term1 );
4004 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4005 while ( (sbits64) rem0 < 0 ) {
4006 --zSig0;
4007 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4009 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4010 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4011 mul64To128( bSig, zSig1, &term1, &term2 );
4012 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4013 while ( (sbits64) rem1 < 0 ) {
4014 --zSig1;
4015 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4017 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4019 return
4020 roundAndPackFloatx80(
4021 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4025 /*----------------------------------------------------------------------------
4026 | Returns the remainder of the extended double-precision floating-point value
4027 | `a' with respect to the corresponding value `b'. The operation is performed
4028 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4029 *----------------------------------------------------------------------------*/
4031 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4033 flag aSign, zSign;
4034 int32 aExp, bExp, expDiff;
4035 bits64 aSig0, aSig1, bSig;
4036 bits64 q, term0, term1, alternateASig0, alternateASig1;
4037 floatx80 z;
4039 aSig0 = extractFloatx80Frac( a );
4040 aExp = extractFloatx80Exp( a );
4041 aSign = extractFloatx80Sign( a );
4042 bSig = extractFloatx80Frac( b );
4043 bExp = extractFloatx80Exp( b );
4044 if ( aExp == 0x7FFF ) {
4045 if ( (bits64) ( aSig0<<1 )
4046 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4047 return propagateFloatx80NaN( a, b STATUS_VAR );
4049 goto invalid;
4051 if ( bExp == 0x7FFF ) {
4052 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4053 return a;
4055 if ( bExp == 0 ) {
4056 if ( bSig == 0 ) {
4057 invalid:
4058 float_raise( float_flag_invalid STATUS_VAR);
4059 z.low = floatx80_default_nan_low;
4060 z.high = floatx80_default_nan_high;
4061 return z;
4063 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4065 if ( aExp == 0 ) {
4066 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4067 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4069 bSig |= LIT64( 0x8000000000000000 );
4070 zSign = aSign;
4071 expDiff = aExp - bExp;
4072 aSig1 = 0;
4073 if ( expDiff < 0 ) {
4074 if ( expDiff < -1 ) return a;
4075 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4076 expDiff = 0;
4078 q = ( bSig <= aSig0 );
4079 if ( q ) aSig0 -= bSig;
4080 expDiff -= 64;
4081 while ( 0 < expDiff ) {
4082 q = estimateDiv128To64( aSig0, aSig1, bSig );
4083 q = ( 2 < q ) ? q - 2 : 0;
4084 mul64To128( bSig, q, &term0, &term1 );
4085 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4086 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4087 expDiff -= 62;
4089 expDiff += 64;
4090 if ( 0 < expDiff ) {
4091 q = estimateDiv128To64( aSig0, aSig1, bSig );
4092 q = ( 2 < q ) ? q - 2 : 0;
4093 q >>= 64 - expDiff;
4094 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4095 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4096 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4097 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4098 ++q;
4099 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4102 else {
4103 term1 = 0;
4104 term0 = bSig;
4106 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4107 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4108 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4109 && ( q & 1 ) )
4111 aSig0 = alternateASig0;
4112 aSig1 = alternateASig1;
4113 zSign = ! zSign;
4115 return
4116 normalizeRoundAndPackFloatx80(
4117 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4121 /*----------------------------------------------------------------------------
4122 | Returns the square root of the extended double-precision floating-point
4123 | value `a'. The operation is performed according to the IEC/IEEE Standard
4124 | for Binary Floating-Point Arithmetic.
4125 *----------------------------------------------------------------------------*/
4127 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4129 flag aSign;
4130 int32 aExp, zExp;
4131 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4132 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4133 floatx80 z;
4135 aSig0 = extractFloatx80Frac( a );
4136 aExp = extractFloatx80Exp( a );
4137 aSign = extractFloatx80Sign( a );
4138 if ( aExp == 0x7FFF ) {
4139 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4140 if ( ! aSign ) return a;
4141 goto invalid;
4143 if ( aSign ) {
4144 if ( ( aExp | aSig0 ) == 0 ) return a;
4145 invalid:
4146 float_raise( float_flag_invalid STATUS_VAR);
4147 z.low = floatx80_default_nan_low;
4148 z.high = floatx80_default_nan_high;
4149 return z;
4151 if ( aExp == 0 ) {
4152 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4153 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4155 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4156 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4157 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4158 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4159 doubleZSig0 = zSig0<<1;
4160 mul64To128( zSig0, zSig0, &term0, &term1 );
4161 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4162 while ( (sbits64) rem0 < 0 ) {
4163 --zSig0;
4164 doubleZSig0 -= 2;
4165 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4167 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4168 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4169 if ( zSig1 == 0 ) zSig1 = 1;
4170 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4171 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4172 mul64To128( zSig1, zSig1, &term2, &term3 );
4173 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4174 while ( (sbits64) rem1 < 0 ) {
4175 --zSig1;
4176 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4177 term3 |= 1;
4178 term2 |= doubleZSig0;
4179 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4181 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4183 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4184 zSig0 |= doubleZSig0;
4185 return
4186 roundAndPackFloatx80(
4187 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4191 /*----------------------------------------------------------------------------
4192 | Returns 1 if the extended double-precision floating-point value `a' is
4193 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4194 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4195 | Arithmetic.
4196 *----------------------------------------------------------------------------*/
4198 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4201 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4202 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4203 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4204 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4206 if ( floatx80_is_signaling_nan( a )
4207 || floatx80_is_signaling_nan( b ) ) {
4208 float_raise( float_flag_invalid STATUS_VAR);
4210 return 0;
4212 return
4213 ( a.low == b.low )
4214 && ( ( a.high == b.high )
4215 || ( ( a.low == 0 )
4216 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4221 /*----------------------------------------------------------------------------
4222 | Returns 1 if the extended double-precision floating-point value `a' is
4223 | less than or equal to the corresponding value `b', and 0 otherwise. The
4224 | comparison is performed according to the IEC/IEEE Standard for Binary
4225 | Floating-Point Arithmetic.
4226 *----------------------------------------------------------------------------*/
4228 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4230 flag aSign, bSign;
4232 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4233 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4234 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4235 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4237 float_raise( float_flag_invalid STATUS_VAR);
4238 return 0;
4240 aSign = extractFloatx80Sign( a );
4241 bSign = extractFloatx80Sign( b );
4242 if ( aSign != bSign ) {
4243 return
4244 aSign
4245 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4246 == 0 );
4248 return
4249 aSign ? le128( b.high, b.low, a.high, a.low )
4250 : le128( a.high, a.low, b.high, b.low );
4254 /*----------------------------------------------------------------------------
4255 | Returns 1 if the extended double-precision floating-point value `a' is
4256 | less than the corresponding value `b', and 0 otherwise. The comparison
4257 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4258 | Arithmetic.
4259 *----------------------------------------------------------------------------*/
4261 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4263 flag aSign, bSign;
4265 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4266 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4267 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4268 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4270 float_raise( float_flag_invalid STATUS_VAR);
4271 return 0;
4273 aSign = extractFloatx80Sign( a );
4274 bSign = extractFloatx80Sign( b );
4275 if ( aSign != bSign ) {
4276 return
4277 aSign
4278 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4279 != 0 );
4281 return
4282 aSign ? lt128( b.high, b.low, a.high, a.low )
4283 : lt128( a.high, a.low, b.high, b.low );
4287 /*----------------------------------------------------------------------------
4288 | Returns 1 if the extended double-precision floating-point value `a' is equal
4289 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4290 | raised if either operand is a NaN. Otherwise, the comparison is performed
4291 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4292 *----------------------------------------------------------------------------*/
4294 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4297 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4298 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4299 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4300 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4302 float_raise( float_flag_invalid STATUS_VAR);
4303 return 0;
4305 return
4306 ( a.low == b.low )
4307 && ( ( a.high == b.high )
4308 || ( ( a.low == 0 )
4309 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4314 /*----------------------------------------------------------------------------
4315 | Returns 1 if the extended double-precision floating-point value `a' is less
4316 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4317 | do not cause an exception. Otherwise, the comparison is performed according
4318 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4319 *----------------------------------------------------------------------------*/
4321 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4323 flag aSign, bSign;
4325 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4326 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4327 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4328 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4330 if ( floatx80_is_signaling_nan( a )
4331 || floatx80_is_signaling_nan( b ) ) {
4332 float_raise( float_flag_invalid STATUS_VAR);
4334 return 0;
4336 aSign = extractFloatx80Sign( a );
4337 bSign = extractFloatx80Sign( b );
4338 if ( aSign != bSign ) {
4339 return
4340 aSign
4341 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4342 == 0 );
4344 return
4345 aSign ? le128( b.high, b.low, a.high, a.low )
4346 : le128( a.high, a.low, b.high, b.low );
4350 /*----------------------------------------------------------------------------
4351 | Returns 1 if the extended double-precision floating-point value `a' is less
4352 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4353 | an exception. Otherwise, the comparison is performed according to the
4354 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4355 *----------------------------------------------------------------------------*/
4357 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4359 flag aSign, bSign;
4361 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4362 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4363 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4364 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4366 if ( floatx80_is_signaling_nan( a )
4367 || floatx80_is_signaling_nan( b ) ) {
4368 float_raise( float_flag_invalid STATUS_VAR);
4370 return 0;
4372 aSign = extractFloatx80Sign( a );
4373 bSign = extractFloatx80Sign( b );
4374 if ( aSign != bSign ) {
4375 return
4376 aSign
4377 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4378 != 0 );
4380 return
4381 aSign ? lt128( b.high, b.low, a.high, a.low )
4382 : lt128( a.high, a.low, b.high, b.low );
4386 #endif
4388 #ifdef FLOAT128
4390 /*----------------------------------------------------------------------------
4391 | Returns the result of converting the quadruple-precision floating-point
4392 | value `a' to the 32-bit two's complement integer format. The conversion
4393 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4394 | Arithmetic---which means in particular that the conversion is rounded
4395 | according to the current rounding mode. If `a' is a NaN, the largest
4396 | positive integer is returned. Otherwise, if the conversion overflows, the
4397 | largest integer with the same sign as `a' is returned.
4398 *----------------------------------------------------------------------------*/
4400 int32 float128_to_int32( float128 a STATUS_PARAM )
4402 flag aSign;
4403 int32 aExp, shiftCount;
4404 bits64 aSig0, aSig1;
4406 aSig1 = extractFloat128Frac1( a );
4407 aSig0 = extractFloat128Frac0( a );
4408 aExp = extractFloat128Exp( a );
4409 aSign = extractFloat128Sign( a );
4410 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4411 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4412 aSig0 |= ( aSig1 != 0 );
4413 shiftCount = 0x4028 - aExp;
4414 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4415 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4419 /*----------------------------------------------------------------------------
4420 | Returns the result of converting the quadruple-precision floating-point
4421 | value `a' to the 32-bit two's complement integer format. The conversion
4422 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4423 | Arithmetic, except that the conversion is always rounded toward zero. If
4424 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4425 | conversion overflows, the largest integer with the same sign as `a' is
4426 | returned.
4427 *----------------------------------------------------------------------------*/
4429 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4431 flag aSign;
4432 int32 aExp, shiftCount;
4433 bits64 aSig0, aSig1, savedASig;
4434 int32 z;
4436 aSig1 = extractFloat128Frac1( a );
4437 aSig0 = extractFloat128Frac0( a );
4438 aExp = extractFloat128Exp( a );
4439 aSign = extractFloat128Sign( a );
4440 aSig0 |= ( aSig1 != 0 );
4441 if ( 0x401E < aExp ) {
4442 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4443 goto invalid;
4445 else if ( aExp < 0x3FFF ) {
4446 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4447 return 0;
4449 aSig0 |= LIT64( 0x0001000000000000 );
4450 shiftCount = 0x402F - aExp;
4451 savedASig = aSig0;
4452 aSig0 >>= shiftCount;
4453 z = aSig0;
4454 if ( aSign ) z = - z;
4455 if ( ( z < 0 ) ^ aSign ) {
4456 invalid:
4457 float_raise( float_flag_invalid STATUS_VAR);
4458 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4460 if ( ( aSig0<<shiftCount ) != savedASig ) {
4461 STATUS(float_exception_flags) |= float_flag_inexact;
4463 return z;
4467 /*----------------------------------------------------------------------------
4468 | Returns the result of converting the quadruple-precision floating-point
4469 | value `a' to the 64-bit two's complement integer format. The conversion
4470 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4471 | Arithmetic---which means in particular that the conversion is rounded
4472 | according to the current rounding mode. If `a' is a NaN, the largest
4473 | positive integer is returned. Otherwise, if the conversion overflows, the
4474 | largest integer with the same sign as `a' is returned.
4475 *----------------------------------------------------------------------------*/
4477 int64 float128_to_int64( float128 a STATUS_PARAM )
4479 flag aSign;
4480 int32 aExp, shiftCount;
4481 bits64 aSig0, aSig1;
4483 aSig1 = extractFloat128Frac1( a );
4484 aSig0 = extractFloat128Frac0( a );
4485 aExp = extractFloat128Exp( a );
4486 aSign = extractFloat128Sign( a );
4487 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4488 shiftCount = 0x402F - aExp;
4489 if ( shiftCount <= 0 ) {
4490 if ( 0x403E < aExp ) {
4491 float_raise( float_flag_invalid STATUS_VAR);
4492 if ( ! aSign
4493 || ( ( aExp == 0x7FFF )
4494 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4497 return LIT64( 0x7FFFFFFFFFFFFFFF );
4499 return (sbits64) LIT64( 0x8000000000000000 );
4501 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4503 else {
4504 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4506 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4510 /*----------------------------------------------------------------------------
4511 | Returns the result of converting the quadruple-precision floating-point
4512 | value `a' to the 64-bit two's complement integer format. The conversion
4513 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4514 | Arithmetic, except that the conversion is always rounded toward zero.
4515 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4516 | the conversion overflows, the largest integer with the same sign as `a' is
4517 | returned.
4518 *----------------------------------------------------------------------------*/
4520 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4522 flag aSign;
4523 int32 aExp, shiftCount;
4524 bits64 aSig0, aSig1;
4525 int64 z;
4527 aSig1 = extractFloat128Frac1( a );
4528 aSig0 = extractFloat128Frac0( a );
4529 aExp = extractFloat128Exp( a );
4530 aSign = extractFloat128Sign( a );
4531 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4532 shiftCount = aExp - 0x402F;
4533 if ( 0 < shiftCount ) {
4534 if ( 0x403E <= aExp ) {
4535 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4536 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4537 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4538 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4540 else {
4541 float_raise( float_flag_invalid STATUS_VAR);
4542 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4543 return LIT64( 0x7FFFFFFFFFFFFFFF );
4546 return (sbits64) LIT64( 0x8000000000000000 );
4548 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4549 if ( (bits64) ( aSig1<<shiftCount ) ) {
4550 STATUS(float_exception_flags) |= float_flag_inexact;
4553 else {
4554 if ( aExp < 0x3FFF ) {
4555 if ( aExp | aSig0 | aSig1 ) {
4556 STATUS(float_exception_flags) |= float_flag_inexact;
4558 return 0;
4560 z = aSig0>>( - shiftCount );
4561 if ( aSig1
4562 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4563 STATUS(float_exception_flags) |= float_flag_inexact;
4566 if ( aSign ) z = - z;
4567 return z;
4571 /*----------------------------------------------------------------------------
4572 | Returns the result of converting the quadruple-precision floating-point
4573 | value `a' to the single-precision floating-point format. The conversion
4574 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4575 | Arithmetic.
4576 *----------------------------------------------------------------------------*/
4578 float32 float128_to_float32( float128 a STATUS_PARAM )
4580 flag aSign;
4581 int32 aExp;
4582 bits64 aSig0, aSig1;
4583 bits32 zSig;
4585 aSig1 = extractFloat128Frac1( a );
4586 aSig0 = extractFloat128Frac0( a );
4587 aExp = extractFloat128Exp( a );
4588 aSign = extractFloat128Sign( a );
4589 if ( aExp == 0x7FFF ) {
4590 if ( aSig0 | aSig1 ) {
4591 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4593 return packFloat32( aSign, 0xFF, 0 );
4595 aSig0 |= ( aSig1 != 0 );
4596 shift64RightJamming( aSig0, 18, &aSig0 );
4597 zSig = aSig0;
4598 if ( aExp || zSig ) {
4599 zSig |= 0x40000000;
4600 aExp -= 0x3F81;
4602 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4606 /*----------------------------------------------------------------------------
4607 | Returns the result of converting the quadruple-precision floating-point
4608 | value `a' to the double-precision floating-point format. The conversion
4609 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4610 | Arithmetic.
4611 *----------------------------------------------------------------------------*/
4613 float64 float128_to_float64( float128 a STATUS_PARAM )
4615 flag aSign;
4616 int32 aExp;
4617 bits64 aSig0, aSig1;
4619 aSig1 = extractFloat128Frac1( a );
4620 aSig0 = extractFloat128Frac0( a );
4621 aExp = extractFloat128Exp( a );
4622 aSign = extractFloat128Sign( a );
4623 if ( aExp == 0x7FFF ) {
4624 if ( aSig0 | aSig1 ) {
4625 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4627 return packFloat64( aSign, 0x7FF, 0 );
4629 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4630 aSig0 |= ( aSig1 != 0 );
4631 if ( aExp || aSig0 ) {
4632 aSig0 |= LIT64( 0x4000000000000000 );
4633 aExp -= 0x3C01;
4635 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4639 #ifdef FLOATX80
4641 /*----------------------------------------------------------------------------
4642 | Returns the result of converting the quadruple-precision floating-point
4643 | value `a' to the extended double-precision floating-point format. The
4644 | conversion is performed according to the IEC/IEEE Standard for Binary
4645 | Floating-Point Arithmetic.
4646 *----------------------------------------------------------------------------*/
4648 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4650 flag aSign;
4651 int32 aExp;
4652 bits64 aSig0, aSig1;
4654 aSig1 = extractFloat128Frac1( a );
4655 aSig0 = extractFloat128Frac0( a );
4656 aExp = extractFloat128Exp( a );
4657 aSign = extractFloat128Sign( a );
4658 if ( aExp == 0x7FFF ) {
4659 if ( aSig0 | aSig1 ) {
4660 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4662 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4664 if ( aExp == 0 ) {
4665 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4666 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4668 else {
4669 aSig0 |= LIT64( 0x0001000000000000 );
4671 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4672 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4676 #endif
4678 /*----------------------------------------------------------------------------
4679 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4680 | returns the result as a quadruple-precision floating-point value. The
4681 | operation is performed according to the IEC/IEEE Standard for Binary
4682 | Floating-Point Arithmetic.
4683 *----------------------------------------------------------------------------*/
4685 float128 float128_round_to_int( float128 a STATUS_PARAM )
4687 flag aSign;
4688 int32 aExp;
4689 bits64 lastBitMask, roundBitsMask;
4690 int8 roundingMode;
4691 float128 z;
4693 aExp = extractFloat128Exp( a );
4694 if ( 0x402F <= aExp ) {
4695 if ( 0x406F <= aExp ) {
4696 if ( ( aExp == 0x7FFF )
4697 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4699 return propagateFloat128NaN( a, a STATUS_VAR );
4701 return a;
4703 lastBitMask = 1;
4704 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4705 roundBitsMask = lastBitMask - 1;
4706 z = a;
4707 roundingMode = STATUS(float_rounding_mode);
4708 if ( roundingMode == float_round_nearest_even ) {
4709 if ( lastBitMask ) {
4710 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4711 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4713 else {
4714 if ( (sbits64) z.low < 0 ) {
4715 ++z.high;
4716 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4720 else if ( roundingMode != float_round_to_zero ) {
4721 if ( extractFloat128Sign( z )
4722 ^ ( roundingMode == float_round_up ) ) {
4723 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4726 z.low &= ~ roundBitsMask;
4728 else {
4729 if ( aExp < 0x3FFF ) {
4730 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4731 STATUS(float_exception_flags) |= float_flag_inexact;
4732 aSign = extractFloat128Sign( a );
4733 switch ( STATUS(float_rounding_mode) ) {
4734 case float_round_nearest_even:
4735 if ( ( aExp == 0x3FFE )
4736 && ( extractFloat128Frac0( a )
4737 | extractFloat128Frac1( a ) )
4739 return packFloat128( aSign, 0x3FFF, 0, 0 );
4741 break;
4742 case float_round_down:
4743 return
4744 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4745 : packFloat128( 0, 0, 0, 0 );
4746 case float_round_up:
4747 return
4748 aSign ? packFloat128( 1, 0, 0, 0 )
4749 : packFloat128( 0, 0x3FFF, 0, 0 );
4751 return packFloat128( aSign, 0, 0, 0 );
4753 lastBitMask = 1;
4754 lastBitMask <<= 0x402F - aExp;
4755 roundBitsMask = lastBitMask - 1;
4756 z.low = 0;
4757 z.high = a.high;
4758 roundingMode = STATUS(float_rounding_mode);
4759 if ( roundingMode == float_round_nearest_even ) {
4760 z.high += lastBitMask>>1;
4761 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4762 z.high &= ~ lastBitMask;
4765 else if ( roundingMode != float_round_to_zero ) {
4766 if ( extractFloat128Sign( z )
4767 ^ ( roundingMode == float_round_up ) ) {
4768 z.high |= ( a.low != 0 );
4769 z.high += roundBitsMask;
4772 z.high &= ~ roundBitsMask;
4774 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4775 STATUS(float_exception_flags) |= float_flag_inexact;
4777 return z;
4781 /*----------------------------------------------------------------------------
4782 | Returns the result of adding the absolute values of the quadruple-precision
4783 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4784 | before being returned. `zSign' is ignored if the result is a NaN.
4785 | The addition is performed according to the IEC/IEEE Standard for Binary
4786 | Floating-Point Arithmetic.
4787 *----------------------------------------------------------------------------*/
4789 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4791 int32 aExp, bExp, zExp;
4792 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4793 int32 expDiff;
4795 aSig1 = extractFloat128Frac1( a );
4796 aSig0 = extractFloat128Frac0( a );
4797 aExp = extractFloat128Exp( a );
4798 bSig1 = extractFloat128Frac1( b );
4799 bSig0 = extractFloat128Frac0( b );
4800 bExp = extractFloat128Exp( b );
4801 expDiff = aExp - bExp;
4802 if ( 0 < expDiff ) {
4803 if ( aExp == 0x7FFF ) {
4804 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4805 return a;
4807 if ( bExp == 0 ) {
4808 --expDiff;
4810 else {
4811 bSig0 |= LIT64( 0x0001000000000000 );
4813 shift128ExtraRightJamming(
4814 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4815 zExp = aExp;
4817 else if ( expDiff < 0 ) {
4818 if ( bExp == 0x7FFF ) {
4819 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4820 return packFloat128( zSign, 0x7FFF, 0, 0 );
4822 if ( aExp == 0 ) {
4823 ++expDiff;
4825 else {
4826 aSig0 |= LIT64( 0x0001000000000000 );
4828 shift128ExtraRightJamming(
4829 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4830 zExp = bExp;
4832 else {
4833 if ( aExp == 0x7FFF ) {
4834 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4835 return propagateFloat128NaN( a, b STATUS_VAR );
4837 return a;
4839 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4840 if ( aExp == 0 ) {
4841 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
4842 return packFloat128( zSign, 0, zSig0, zSig1 );
4844 zSig2 = 0;
4845 zSig0 |= LIT64( 0x0002000000000000 );
4846 zExp = aExp;
4847 goto shiftRight1;
4849 aSig0 |= LIT64( 0x0001000000000000 );
4850 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4851 --zExp;
4852 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4853 ++zExp;
4854 shiftRight1:
4855 shift128ExtraRightJamming(
4856 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4857 roundAndPack:
4858 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4862 /*----------------------------------------------------------------------------
4863 | Returns the result of subtracting the absolute values of the quadruple-
4864 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4865 | difference is negated before being returned. `zSign' is ignored if the
4866 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4867 | Standard for Binary Floating-Point Arithmetic.
4868 *----------------------------------------------------------------------------*/
4870 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4872 int32 aExp, bExp, zExp;
4873 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4874 int32 expDiff;
4875 float128 z;
4877 aSig1 = extractFloat128Frac1( a );
4878 aSig0 = extractFloat128Frac0( a );
4879 aExp = extractFloat128Exp( a );
4880 bSig1 = extractFloat128Frac1( b );
4881 bSig0 = extractFloat128Frac0( b );
4882 bExp = extractFloat128Exp( b );
4883 expDiff = aExp - bExp;
4884 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4885 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4886 if ( 0 < expDiff ) goto aExpBigger;
4887 if ( expDiff < 0 ) goto bExpBigger;
4888 if ( aExp == 0x7FFF ) {
4889 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4890 return propagateFloat128NaN( a, b STATUS_VAR );
4892 float_raise( float_flag_invalid STATUS_VAR);
4893 z.low = float128_default_nan_low;
4894 z.high = float128_default_nan_high;
4895 return z;
4897 if ( aExp == 0 ) {
4898 aExp = 1;
4899 bExp = 1;
4901 if ( bSig0 < aSig0 ) goto aBigger;
4902 if ( aSig0 < bSig0 ) goto bBigger;
4903 if ( bSig1 < aSig1 ) goto aBigger;
4904 if ( aSig1 < bSig1 ) goto bBigger;
4905 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4906 bExpBigger:
4907 if ( bExp == 0x7FFF ) {
4908 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4909 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4911 if ( aExp == 0 ) {
4912 ++expDiff;
4914 else {
4915 aSig0 |= LIT64( 0x4000000000000000 );
4917 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4918 bSig0 |= LIT64( 0x4000000000000000 );
4919 bBigger:
4920 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4921 zExp = bExp;
4922 zSign ^= 1;
4923 goto normalizeRoundAndPack;
4924 aExpBigger:
4925 if ( aExp == 0x7FFF ) {
4926 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4927 return a;
4929 if ( bExp == 0 ) {
4930 --expDiff;
4932 else {
4933 bSig0 |= LIT64( 0x4000000000000000 );
4935 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4936 aSig0 |= LIT64( 0x4000000000000000 );
4937 aBigger:
4938 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4939 zExp = aExp;
4940 normalizeRoundAndPack:
4941 --zExp;
4942 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4946 /*----------------------------------------------------------------------------
4947 | Returns the result of adding the quadruple-precision floating-point values
4948 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4949 | for Binary Floating-Point Arithmetic.
4950 *----------------------------------------------------------------------------*/
4952 float128 float128_add( float128 a, float128 b STATUS_PARAM )
4954 flag aSign, bSign;
4956 aSign = extractFloat128Sign( a );
4957 bSign = extractFloat128Sign( b );
4958 if ( aSign == bSign ) {
4959 return addFloat128Sigs( a, b, aSign STATUS_VAR );
4961 else {
4962 return subFloat128Sigs( a, b, aSign STATUS_VAR );
4967 /*----------------------------------------------------------------------------
4968 | Returns the result of subtracting the quadruple-precision floating-point
4969 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4970 | Standard for Binary Floating-Point Arithmetic.
4971 *----------------------------------------------------------------------------*/
4973 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4975 flag aSign, bSign;
4977 aSign = extractFloat128Sign( a );
4978 bSign = extractFloat128Sign( b );
4979 if ( aSign == bSign ) {
4980 return subFloat128Sigs( a, b, aSign STATUS_VAR );
4982 else {
4983 return addFloat128Sigs( a, b, aSign STATUS_VAR );
4988 /*----------------------------------------------------------------------------
4989 | Returns the result of multiplying the quadruple-precision floating-point
4990 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4991 | Standard for Binary Floating-Point Arithmetic.
4992 *----------------------------------------------------------------------------*/
4994 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4996 flag aSign, bSign, zSign;
4997 int32 aExp, bExp, zExp;
4998 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4999 float128 z;
5001 aSig1 = extractFloat128Frac1( a );
5002 aSig0 = extractFloat128Frac0( a );
5003 aExp = extractFloat128Exp( a );
5004 aSign = extractFloat128Sign( a );
5005 bSig1 = extractFloat128Frac1( b );
5006 bSig0 = extractFloat128Frac0( b );
5007 bExp = extractFloat128Exp( b );
5008 bSign = extractFloat128Sign( b );
5009 zSign = aSign ^ bSign;
5010 if ( aExp == 0x7FFF ) {
5011 if ( ( aSig0 | aSig1 )
5012 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5013 return propagateFloat128NaN( a, b STATUS_VAR );
5015 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5016 return packFloat128( zSign, 0x7FFF, 0, 0 );
5018 if ( bExp == 0x7FFF ) {
5019 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5020 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5021 invalid:
5022 float_raise( float_flag_invalid STATUS_VAR);
5023 z.low = float128_default_nan_low;
5024 z.high = float128_default_nan_high;
5025 return z;
5027 return packFloat128( zSign, 0x7FFF, 0, 0 );
5029 if ( aExp == 0 ) {
5030 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5031 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5033 if ( bExp == 0 ) {
5034 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5035 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5037 zExp = aExp + bExp - 0x4000;
5038 aSig0 |= LIT64( 0x0001000000000000 );
5039 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5040 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5041 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5042 zSig2 |= ( zSig3 != 0 );
5043 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5044 shift128ExtraRightJamming(
5045 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5046 ++zExp;
5048 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5052 /*----------------------------------------------------------------------------
5053 | Returns the result of dividing the quadruple-precision floating-point value
5054 | `a' by the corresponding value `b'. The operation is performed according to
5055 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5056 *----------------------------------------------------------------------------*/
5058 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5060 flag aSign, bSign, zSign;
5061 int32 aExp, bExp, zExp;
5062 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5063 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5064 float128 z;
5066 aSig1 = extractFloat128Frac1( a );
5067 aSig0 = extractFloat128Frac0( a );
5068 aExp = extractFloat128Exp( a );
5069 aSign = extractFloat128Sign( a );
5070 bSig1 = extractFloat128Frac1( b );
5071 bSig0 = extractFloat128Frac0( b );
5072 bExp = extractFloat128Exp( b );
5073 bSign = extractFloat128Sign( b );
5074 zSign = aSign ^ bSign;
5075 if ( aExp == 0x7FFF ) {
5076 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5077 if ( bExp == 0x7FFF ) {
5078 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5079 goto invalid;
5081 return packFloat128( zSign, 0x7FFF, 0, 0 );
5083 if ( bExp == 0x7FFF ) {
5084 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5085 return packFloat128( zSign, 0, 0, 0 );
5087 if ( bExp == 0 ) {
5088 if ( ( bSig0 | bSig1 ) == 0 ) {
5089 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5090 invalid:
5091 float_raise( float_flag_invalid STATUS_VAR);
5092 z.low = float128_default_nan_low;
5093 z.high = float128_default_nan_high;
5094 return z;
5096 float_raise( float_flag_divbyzero STATUS_VAR);
5097 return packFloat128( zSign, 0x7FFF, 0, 0 );
5099 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5101 if ( aExp == 0 ) {
5102 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5103 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5105 zExp = aExp - bExp + 0x3FFD;
5106 shortShift128Left(
5107 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5108 shortShift128Left(
5109 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5110 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5111 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5112 ++zExp;
5114 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5115 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5116 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5117 while ( (sbits64) rem0 < 0 ) {
5118 --zSig0;
5119 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5121 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5122 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5123 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5124 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5125 while ( (sbits64) rem1 < 0 ) {
5126 --zSig1;
5127 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5129 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5131 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5132 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5136 /*----------------------------------------------------------------------------
5137 | Returns the remainder of the quadruple-precision floating-point value `a'
5138 | with respect to the corresponding value `b'. The operation is performed
5139 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5140 *----------------------------------------------------------------------------*/
5142 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5144 flag aSign, zSign;
5145 int32 aExp, bExp, expDiff;
5146 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5147 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5148 sbits64 sigMean0;
5149 float128 z;
5151 aSig1 = extractFloat128Frac1( a );
5152 aSig0 = extractFloat128Frac0( a );
5153 aExp = extractFloat128Exp( a );
5154 aSign = extractFloat128Sign( a );
5155 bSig1 = extractFloat128Frac1( b );
5156 bSig0 = extractFloat128Frac0( b );
5157 bExp = extractFloat128Exp( b );
5158 if ( aExp == 0x7FFF ) {
5159 if ( ( aSig0 | aSig1 )
5160 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5161 return propagateFloat128NaN( a, b STATUS_VAR );
5163 goto invalid;
5165 if ( bExp == 0x7FFF ) {
5166 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5167 return a;
5169 if ( bExp == 0 ) {
5170 if ( ( bSig0 | bSig1 ) == 0 ) {
5171 invalid:
5172 float_raise( float_flag_invalid STATUS_VAR);
5173 z.low = float128_default_nan_low;
5174 z.high = float128_default_nan_high;
5175 return z;
5177 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5179 if ( aExp == 0 ) {
5180 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5181 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5183 expDiff = aExp - bExp;
5184 if ( expDiff < -1 ) return a;
5185 shortShift128Left(
5186 aSig0 | LIT64( 0x0001000000000000 ),
5187 aSig1,
5188 15 - ( expDiff < 0 ),
5189 &aSig0,
5190 &aSig1
5192 shortShift128Left(
5193 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5194 q = le128( bSig0, bSig1, aSig0, aSig1 );
5195 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5196 expDiff -= 64;
5197 while ( 0 < expDiff ) {
5198 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5199 q = ( 4 < q ) ? q - 4 : 0;
5200 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5201 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5202 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5203 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5204 expDiff -= 61;
5206 if ( -64 < expDiff ) {
5207 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5208 q = ( 4 < q ) ? q - 4 : 0;
5209 q >>= - expDiff;
5210 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5211 expDiff += 52;
5212 if ( expDiff < 0 ) {
5213 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5215 else {
5216 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5218 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5219 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5221 else {
5222 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5223 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5225 do {
5226 alternateASig0 = aSig0;
5227 alternateASig1 = aSig1;
5228 ++q;
5229 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5230 } while ( 0 <= (sbits64) aSig0 );
5231 add128(
5232 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5233 if ( ( sigMean0 < 0 )
5234 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5235 aSig0 = alternateASig0;
5236 aSig1 = alternateASig1;
5238 zSign = ( (sbits64) aSig0 < 0 );
5239 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5240 return
5241 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5245 /*----------------------------------------------------------------------------
5246 | Returns the square root of the quadruple-precision floating-point value `a'.
5247 | The operation is performed according to the IEC/IEEE Standard for Binary
5248 | Floating-Point Arithmetic.
5249 *----------------------------------------------------------------------------*/
5251 float128 float128_sqrt( float128 a STATUS_PARAM )
5253 flag aSign;
5254 int32 aExp, zExp;
5255 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5256 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5257 float128 z;
5259 aSig1 = extractFloat128Frac1( a );
5260 aSig0 = extractFloat128Frac0( a );
5261 aExp = extractFloat128Exp( a );
5262 aSign = extractFloat128Sign( a );
5263 if ( aExp == 0x7FFF ) {
5264 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5265 if ( ! aSign ) return a;
5266 goto invalid;
5268 if ( aSign ) {
5269 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5270 invalid:
5271 float_raise( float_flag_invalid STATUS_VAR);
5272 z.low = float128_default_nan_low;
5273 z.high = float128_default_nan_high;
5274 return z;
5276 if ( aExp == 0 ) {
5277 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5278 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5280 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5281 aSig0 |= LIT64( 0x0001000000000000 );
5282 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5283 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5284 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5285 doubleZSig0 = zSig0<<1;
5286 mul64To128( zSig0, zSig0, &term0, &term1 );
5287 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5288 while ( (sbits64) rem0 < 0 ) {
5289 --zSig0;
5290 doubleZSig0 -= 2;
5291 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5293 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5294 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5295 if ( zSig1 == 0 ) zSig1 = 1;
5296 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5297 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5298 mul64To128( zSig1, zSig1, &term2, &term3 );
5299 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5300 while ( (sbits64) rem1 < 0 ) {
5301 --zSig1;
5302 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5303 term3 |= 1;
5304 term2 |= doubleZSig0;
5305 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5307 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5309 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5310 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5314 /*----------------------------------------------------------------------------
5315 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5316 | the corresponding value `b', and 0 otherwise. The comparison is performed
5317 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5318 *----------------------------------------------------------------------------*/
5320 int float128_eq( float128 a, float128 b STATUS_PARAM )
5323 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5324 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5325 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5326 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5328 if ( float128_is_signaling_nan( a )
5329 || float128_is_signaling_nan( b ) ) {
5330 float_raise( float_flag_invalid STATUS_VAR);
5332 return 0;
5334 return
5335 ( a.low == b.low )
5336 && ( ( a.high == b.high )
5337 || ( ( a.low == 0 )
5338 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5343 /*----------------------------------------------------------------------------
5344 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5345 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5346 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5347 | Arithmetic.
5348 *----------------------------------------------------------------------------*/
5350 int float128_le( float128 a, float128 b STATUS_PARAM )
5352 flag aSign, bSign;
5354 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5355 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5356 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5357 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5359 float_raise( float_flag_invalid STATUS_VAR);
5360 return 0;
5362 aSign = extractFloat128Sign( a );
5363 bSign = extractFloat128Sign( b );
5364 if ( aSign != bSign ) {
5365 return
5366 aSign
5367 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5368 == 0 );
5370 return
5371 aSign ? le128( b.high, b.low, a.high, a.low )
5372 : le128( a.high, a.low, b.high, b.low );
5376 /*----------------------------------------------------------------------------
5377 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5378 | the corresponding value `b', and 0 otherwise. The comparison is performed
5379 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5380 *----------------------------------------------------------------------------*/
5382 int float128_lt( float128 a, float128 b STATUS_PARAM )
5384 flag aSign, bSign;
5386 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5387 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5388 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5389 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5391 float_raise( float_flag_invalid STATUS_VAR);
5392 return 0;
5394 aSign = extractFloat128Sign( a );
5395 bSign = extractFloat128Sign( b );
5396 if ( aSign != bSign ) {
5397 return
5398 aSign
5399 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5400 != 0 );
5402 return
5403 aSign ? lt128( b.high, b.low, a.high, a.low )
5404 : lt128( a.high, a.low, b.high, b.low );
5408 /*----------------------------------------------------------------------------
5409 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5410 | the corresponding value `b', and 0 otherwise. The invalid exception is
5411 | raised if either operand is a NaN. Otherwise, the comparison is performed
5412 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5413 *----------------------------------------------------------------------------*/
5415 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5418 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5419 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5420 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5421 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5423 float_raise( float_flag_invalid STATUS_VAR);
5424 return 0;
5426 return
5427 ( a.low == b.low )
5428 && ( ( a.high == b.high )
5429 || ( ( a.low == 0 )
5430 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5435 /*----------------------------------------------------------------------------
5436 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5437 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5438 | cause an exception. Otherwise, the comparison is performed according to the
5439 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5440 *----------------------------------------------------------------------------*/
5442 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5444 flag aSign, bSign;
5446 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5447 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5448 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5449 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5451 if ( float128_is_signaling_nan( a )
5452 || float128_is_signaling_nan( b ) ) {
5453 float_raise( float_flag_invalid STATUS_VAR);
5455 return 0;
5457 aSign = extractFloat128Sign( a );
5458 bSign = extractFloat128Sign( b );
5459 if ( aSign != bSign ) {
5460 return
5461 aSign
5462 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5463 == 0 );
5465 return
5466 aSign ? le128( b.high, b.low, a.high, a.low )
5467 : le128( a.high, a.low, b.high, b.low );
5471 /*----------------------------------------------------------------------------
5472 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5473 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5474 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5475 | Standard for Binary Floating-Point Arithmetic.
5476 *----------------------------------------------------------------------------*/
5478 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5480 flag aSign, bSign;
5482 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5483 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5484 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5485 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5487 if ( float128_is_signaling_nan( a )
5488 || float128_is_signaling_nan( b ) ) {
5489 float_raise( float_flag_invalid STATUS_VAR);
5491 return 0;
5493 aSign = extractFloat128Sign( a );
5494 bSign = extractFloat128Sign( b );
5495 if ( aSign != bSign ) {
5496 return
5497 aSign
5498 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5499 != 0 );
5501 return
5502 aSign ? lt128( b.high, b.low, a.high, a.low )
5503 : lt128( a.high, a.low, b.high, b.low );
5507 #endif
5509 /* misc functions */
5510 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5512 return int64_to_float32(a STATUS_VAR);
5515 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5517 return int64_to_float64(a STATUS_VAR);
5520 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5522 int64_t v;
5523 unsigned int res;
5525 v = float32_to_int64(a STATUS_VAR);
5526 if (v < 0) {
5527 res = 0;
5528 float_raise( float_flag_invalid STATUS_VAR);
5529 } else if (v > 0xffffffff) {
5530 res = 0xffffffff;
5531 float_raise( float_flag_invalid STATUS_VAR);
5532 } else {
5533 res = v;
5535 return res;
5538 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5540 int64_t v;
5541 unsigned int res;
5543 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5544 if (v < 0) {
5545 res = 0;
5546 float_raise( float_flag_invalid STATUS_VAR);
5547 } else if (v > 0xffffffff) {
5548 res = 0xffffffff;
5549 float_raise( float_flag_invalid STATUS_VAR);
5550 } else {
5551 res = v;
5553 return res;
5556 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5558 int64_t v;
5559 unsigned int res;
5561 v = float64_to_int64(a STATUS_VAR);
5562 if (v < 0) {
5563 res = 0;
5564 float_raise( float_flag_invalid STATUS_VAR);
5565 } else if (v > 0xffffffff) {
5566 res = 0xffffffff;
5567 float_raise( float_flag_invalid STATUS_VAR);
5568 } else {
5569 res = v;
5571 return res;
5574 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5576 int64_t v;
5577 unsigned int res;
5579 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5580 if (v < 0) {
5581 res = 0;
5582 float_raise( float_flag_invalid STATUS_VAR);
5583 } else if (v > 0xffffffff) {
5584 res = 0xffffffff;
5585 float_raise( float_flag_invalid STATUS_VAR);
5586 } else {
5587 res = v;
5589 return res;
5592 /* FIXME: This looks broken. */
5593 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5595 int64_t v;
5597 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5598 v += float64_val(a);
5599 v = float64_to_int64(make_float64(v) STATUS_VAR);
5601 return v - INT64_MIN;
5604 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5606 int64_t v;
5608 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5609 v += float64_val(a);
5610 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5612 return v - INT64_MIN;
5615 #define COMPARE(s, nan_exp) \
5616 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5617 int is_quiet STATUS_PARAM ) \
5619 flag aSign, bSign; \
5620 bits ## s av, bv; \
5622 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5623 extractFloat ## s ## Frac( a ) ) || \
5624 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5625 extractFloat ## s ## Frac( b ) )) { \
5626 if (!is_quiet || \
5627 float ## s ## _is_signaling_nan( a ) || \
5628 float ## s ## _is_signaling_nan( b ) ) { \
5629 float_raise( float_flag_invalid STATUS_VAR); \
5631 return float_relation_unordered; \
5633 aSign = extractFloat ## s ## Sign( a ); \
5634 bSign = extractFloat ## s ## Sign( b ); \
5635 av = float ## s ## _val(a); \
5636 bv = float ## s ## _val(b); \
5637 if ( aSign != bSign ) { \
5638 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5639 /* zero case */ \
5640 return float_relation_equal; \
5641 } else { \
5642 return 1 - (2 * aSign); \
5644 } else { \
5645 if (av == bv) { \
5646 return float_relation_equal; \
5647 } else { \
5648 return 1 - 2 * (aSign ^ ( av < bv )); \
5653 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5655 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5658 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5660 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5663 COMPARE(32, 0xff)
5664 COMPARE(64, 0x7ff)
5666 INLINE int float128_compare_internal( float128 a, float128 b,
5667 int is_quiet STATUS_PARAM )
5669 flag aSign, bSign;
5671 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5672 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5673 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5674 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5675 if (!is_quiet ||
5676 float128_is_signaling_nan( a ) ||
5677 float128_is_signaling_nan( b ) ) {
5678 float_raise( float_flag_invalid STATUS_VAR);
5680 return float_relation_unordered;
5682 aSign = extractFloat128Sign( a );
5683 bSign = extractFloat128Sign( b );
5684 if ( aSign != bSign ) {
5685 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5686 /* zero case */
5687 return float_relation_equal;
5688 } else {
5689 return 1 - (2 * aSign);
5691 } else {
5692 if (a.low == b.low && a.high == b.high) {
5693 return float_relation_equal;
5694 } else {
5695 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5700 int float128_compare( float128 a, float128 b STATUS_PARAM )
5702 return float128_compare_internal(a, b, 0 STATUS_VAR);
5705 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5707 return float128_compare_internal(a, b, 1 STATUS_VAR);
5710 /* Multiply A by 2 raised to the power N. */
5711 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5713 flag aSign;
5714 int16 aExp;
5715 bits32 aSig;
5717 aSig = extractFloat32Frac( a );
5718 aExp = extractFloat32Exp( a );
5719 aSign = extractFloat32Sign( a );
5721 if ( aExp == 0xFF ) {
5722 return a;
5724 if ( aExp != 0 )
5725 aSig |= 0x00800000;
5726 else if ( aSig == 0 )
5727 return a;
5729 aExp += n - 1;
5730 aSig <<= 7;
5731 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5734 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5736 flag aSign;
5737 int16 aExp;
5738 bits64 aSig;
5740 aSig = extractFloat64Frac( a );
5741 aExp = extractFloat64Exp( a );
5742 aSign = extractFloat64Sign( a );
5744 if ( aExp == 0x7FF ) {
5745 return a;
5747 if ( aExp != 0 )
5748 aSig |= LIT64( 0x0010000000000000 );
5749 else if ( aSig == 0 )
5750 return a;
5752 aExp += n - 1;
5753 aSig <<= 10;
5754 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5757 #ifdef FLOATX80
5758 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5760 flag aSign;
5761 int16 aExp;
5762 bits64 aSig;
5764 aSig = extractFloatx80Frac( a );
5765 aExp = extractFloatx80Exp( a );
5766 aSign = extractFloatx80Sign( a );
5768 if ( aExp == 0x7FF ) {
5769 return a;
5771 if (aExp == 0 && aSig == 0)
5772 return a;
5774 aExp += n;
5775 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5776 aSign, aExp, aSig, 0 STATUS_VAR );
5778 #endif
5780 #ifdef FLOAT128
5781 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5783 flag aSign;
5784 int32 aExp;
5785 bits64 aSig0, aSig1;
5787 aSig1 = extractFloat128Frac1( a );
5788 aSig0 = extractFloat128Frac0( a );
5789 aExp = extractFloat128Exp( a );
5790 aSign = extractFloat128Sign( a );
5791 if ( aExp == 0x7FFF ) {
5792 return a;
5794 if ( aExp != 0 )
5795 aSig0 |= LIT64( 0x0001000000000000 );
5796 else if ( aSig0 == 0 && aSig1 == 0 )
5797 return a;
5799 aExp += n - 1;
5800 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
5801 STATUS_VAR );
5804 #endif