scsi-disk: Remove duplicate cdb parsing
[qemu/ar7.git] / fpu / softfloat.c
blob0b8279798cb1eeee70c70501e9b0d418cac858a0
2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 /* FIXME: Flush-To-Zero only effects results. Denormal inputs should also
34 be flushed to zero. */
35 #include "softfloat.h"
37 /*----------------------------------------------------------------------------
38 | Primitive arithmetic functions, including multi-word arithmetic, and
39 | division and square root approximations. (Can be specialized to target if
40 | desired.)
41 *----------------------------------------------------------------------------*/
42 #include "softfloat-macros.h"
44 /*----------------------------------------------------------------------------
45 | Functions and definitions to determine: (1) whether tininess for underflow
46 | is detected before or after rounding by default, (2) what (if anything)
47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
49 | are propagated from function inputs to output. These details are target-
50 | specific.
51 *----------------------------------------------------------------------------*/
52 #include "softfloat-specialize.h"
54 void set_float_rounding_mode(int val STATUS_PARAM)
56 STATUS(float_rounding_mode) = val;
59 void set_float_exception_flags(int val STATUS_PARAM)
61 STATUS(float_exception_flags) = val;
64 #ifdef FLOATX80
65 void set_floatx80_rounding_precision(int val STATUS_PARAM)
67 STATUS(floatx80_rounding_precision) = val;
69 #endif
71 /*----------------------------------------------------------------------------
72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
73 | and 7, and returns the properly rounded 32-bit integer corresponding to the
74 | input. If `zSign' is 1, the input is negated before being converted to an
75 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
76 | is simply rounded to an integer, with the inexact exception raised if the
77 | input cannot be represented exactly as an integer. However, if the fixed-
78 | point input is too large, the invalid exception is raised and the largest
79 | positive or negative integer is returned.
80 *----------------------------------------------------------------------------*/
82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
84 int8 roundingMode;
85 flag roundNearestEven;
86 int8 roundIncrement, roundBits;
87 int32 z;
89 roundingMode = STATUS(float_rounding_mode);
90 roundNearestEven = ( roundingMode == float_round_nearest_even );
91 roundIncrement = 0x40;
92 if ( ! roundNearestEven ) {
93 if ( roundingMode == float_round_to_zero ) {
94 roundIncrement = 0;
96 else {
97 roundIncrement = 0x7F;
98 if ( zSign ) {
99 if ( roundingMode == float_round_up ) roundIncrement = 0;
101 else {
102 if ( roundingMode == float_round_down ) roundIncrement = 0;
106 roundBits = absZ & 0x7F;
107 absZ = ( absZ + roundIncrement )>>7;
108 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
109 z = absZ;
110 if ( zSign ) z = - z;
111 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
112 float_raise( float_flag_invalid STATUS_VAR);
113 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
115 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
116 return z;
120 /*----------------------------------------------------------------------------
121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
122 | `absZ1', with binary point between bits 63 and 64 (between the input words),
123 | and returns the properly rounded 64-bit integer corresponding to the input.
124 | If `zSign' is 1, the input is negated before being converted to an integer.
125 | Ordinarily, the fixed-point input is simply rounded to an integer, with
126 | the inexact exception raised if the input cannot be represented exactly as
127 | an integer. However, if the fixed-point input is too large, the invalid
128 | exception is raised and the largest positive or negative integer is
129 | returned.
130 *----------------------------------------------------------------------------*/
132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
134 int8 roundingMode;
135 flag roundNearestEven, increment;
136 int64 z;
138 roundingMode = STATUS(float_rounding_mode);
139 roundNearestEven = ( roundingMode == float_round_nearest_even );
140 increment = ( (sbits64) absZ1 < 0 );
141 if ( ! roundNearestEven ) {
142 if ( roundingMode == float_round_to_zero ) {
143 increment = 0;
145 else {
146 if ( zSign ) {
147 increment = ( roundingMode == float_round_down ) && absZ1;
149 else {
150 increment = ( roundingMode == float_round_up ) && absZ1;
154 if ( increment ) {
155 ++absZ0;
156 if ( absZ0 == 0 ) goto overflow;
157 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
159 z = absZ0;
160 if ( zSign ) z = - z;
161 if ( z && ( ( z < 0 ) ^ zSign ) ) {
162 overflow:
163 float_raise( float_flag_invalid STATUS_VAR);
164 return
165 zSign ? (sbits64) LIT64( 0x8000000000000000 )
166 : LIT64( 0x7FFFFFFFFFFFFFFF );
168 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
169 return z;
173 /*----------------------------------------------------------------------------
174 | Returns the fraction bits of the single-precision floating-point value `a'.
175 *----------------------------------------------------------------------------*/
177 INLINE bits32 extractFloat32Frac( float32 a )
180 return float32_val(a) & 0x007FFFFF;
184 /*----------------------------------------------------------------------------
185 | Returns the exponent bits of the single-precision floating-point value `a'.
186 *----------------------------------------------------------------------------*/
188 INLINE int16 extractFloat32Exp( float32 a )
191 return ( float32_val(a)>>23 ) & 0xFF;
195 /*----------------------------------------------------------------------------
196 | Returns the sign bit of the single-precision floating-point value `a'.
197 *----------------------------------------------------------------------------*/
199 INLINE flag extractFloat32Sign( float32 a )
202 return float32_val(a)>>31;
206 /*----------------------------------------------------------------------------
207 | Normalizes the subnormal single-precision floating-point value represented
208 | by the denormalized significand `aSig'. The normalized exponent and
209 | significand are stored at the locations pointed to by `zExpPtr' and
210 | `zSigPtr', respectively.
211 *----------------------------------------------------------------------------*/
213 static void
214 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
216 int8 shiftCount;
218 shiftCount = countLeadingZeros32( aSig ) - 8;
219 *zSigPtr = aSig<<shiftCount;
220 *zExpPtr = 1 - shiftCount;
224 /*----------------------------------------------------------------------------
225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
226 | single-precision floating-point value, returning the result. After being
227 | shifted into the proper positions, the three fields are simply added
228 | together to form the result. This means that any integer portion of `zSig'
229 | will be added into the exponent. Since a properly normalized significand
230 | will have an integer portion equal to 1, the `zExp' input should be 1 less
231 | than the desired result exponent whenever `zSig' is a complete, normalized
232 | significand.
233 *----------------------------------------------------------------------------*/
235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
238 return make_float32(
239 ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
243 /*----------------------------------------------------------------------------
244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
245 | and significand `zSig', and returns the proper single-precision floating-
246 | point value corresponding to the abstract input. Ordinarily, the abstract
247 | value is simply rounded and packed into the single-precision format, with
248 | the inexact exception raised if the abstract input cannot be represented
249 | exactly. However, if the abstract value is too large, the overflow and
250 | inexact exceptions are raised and an infinity or maximal finite value is
251 | returned. If the abstract value is too small, the input value is rounded to
252 | a subnormal number, and the underflow and inexact exceptions are raised if
253 | the abstract input cannot be represented exactly as a subnormal single-
254 | precision floating-point number.
255 | The input significand `zSig' has its binary point between bits 30
256 | and 29, which is 7 bits to the left of the usual location. This shifted
257 | significand must be normalized or smaller. If `zSig' is not normalized,
258 | `zExp' must be 0; in that case, the result returned is a subnormal number,
259 | and it must not require rounding. In the usual case that `zSig' is
260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
261 | The handling of underflow and overflow follows the IEC/IEEE Standard for
262 | Binary Floating-Point Arithmetic.
263 *----------------------------------------------------------------------------*/
265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
267 int8 roundingMode;
268 flag roundNearestEven;
269 int8 roundIncrement, roundBits;
270 flag isTiny;
272 roundingMode = STATUS(float_rounding_mode);
273 roundNearestEven = ( roundingMode == float_round_nearest_even );
274 roundIncrement = 0x40;
275 if ( ! roundNearestEven ) {
276 if ( roundingMode == float_round_to_zero ) {
277 roundIncrement = 0;
279 else {
280 roundIncrement = 0x7F;
281 if ( zSign ) {
282 if ( roundingMode == float_round_up ) roundIncrement = 0;
284 else {
285 if ( roundingMode == float_round_down ) roundIncrement = 0;
289 roundBits = zSig & 0x7F;
290 if ( 0xFD <= (bits16) zExp ) {
291 if ( ( 0xFD < zExp )
292 || ( ( zExp == 0xFD )
293 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
295 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
296 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
298 if ( zExp < 0 ) {
299 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
300 isTiny =
301 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
302 || ( zExp < -1 )
303 || ( zSig + roundIncrement < 0x80000000 );
304 shift32RightJamming( zSig, - zExp, &zSig );
305 zExp = 0;
306 roundBits = zSig & 0x7F;
307 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
310 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
311 zSig = ( zSig + roundIncrement )>>7;
312 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
313 if ( zSig == 0 ) zExp = 0;
314 return packFloat32( zSign, zExp, zSig );
318 /*----------------------------------------------------------------------------
319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
320 | and significand `zSig', and returns the proper single-precision floating-
321 | point value corresponding to the abstract input. This routine is just like
322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
324 | floating-point exponent.
325 *----------------------------------------------------------------------------*/
327 static float32
328 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
330 int8 shiftCount;
332 shiftCount = countLeadingZeros32( zSig ) - 1;
333 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
337 /*----------------------------------------------------------------------------
338 | Returns the fraction bits of the double-precision floating-point value `a'.
339 *----------------------------------------------------------------------------*/
341 INLINE bits64 extractFloat64Frac( float64 a )
344 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
348 /*----------------------------------------------------------------------------
349 | Returns the exponent bits of the double-precision floating-point value `a'.
350 *----------------------------------------------------------------------------*/
352 INLINE int16 extractFloat64Exp( float64 a )
355 return ( float64_val(a)>>52 ) & 0x7FF;
359 /*----------------------------------------------------------------------------
360 | Returns the sign bit of the double-precision floating-point value `a'.
361 *----------------------------------------------------------------------------*/
363 INLINE flag extractFloat64Sign( float64 a )
366 return float64_val(a)>>63;
370 /*----------------------------------------------------------------------------
371 | Normalizes the subnormal double-precision floating-point value represented
372 | by the denormalized significand `aSig'. The normalized exponent and
373 | significand are stored at the locations pointed to by `zExpPtr' and
374 | `zSigPtr', respectively.
375 *----------------------------------------------------------------------------*/
377 static void
378 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
380 int8 shiftCount;
382 shiftCount = countLeadingZeros64( aSig ) - 11;
383 *zSigPtr = aSig<<shiftCount;
384 *zExpPtr = 1 - shiftCount;
388 /*----------------------------------------------------------------------------
389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
390 | double-precision floating-point value, returning the result. After being
391 | shifted into the proper positions, the three fields are simply added
392 | together to form the result. This means that any integer portion of `zSig'
393 | will be added into the exponent. Since a properly normalized significand
394 | will have an integer portion equal to 1, the `zExp' input should be 1 less
395 | than the desired result exponent whenever `zSig' is a complete, normalized
396 | significand.
397 *----------------------------------------------------------------------------*/
399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
402 return make_float64(
403 ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
407 /*----------------------------------------------------------------------------
408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
409 | and significand `zSig', and returns the proper double-precision floating-
410 | point value corresponding to the abstract input. Ordinarily, the abstract
411 | value is simply rounded and packed into the double-precision format, with
412 | the inexact exception raised if the abstract input cannot be represented
413 | exactly. However, if the abstract value is too large, the overflow and
414 | inexact exceptions are raised and an infinity or maximal finite value is
415 | returned. If the abstract value is too small, the input value is rounded
416 | to a subnormal number, and the underflow and inexact exceptions are raised
417 | if the abstract input cannot be represented exactly as a subnormal double-
418 | precision floating-point number.
419 | The input significand `zSig' has its binary point between bits 62
420 | and 61, which is 10 bits to the left of the usual location. This shifted
421 | significand must be normalized or smaller. If `zSig' is not normalized,
422 | `zExp' must be 0; in that case, the result returned is a subnormal number,
423 | and it must not require rounding. In the usual case that `zSig' is
424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
425 | The handling of underflow and overflow follows the IEC/IEEE Standard for
426 | Binary Floating-Point Arithmetic.
427 *----------------------------------------------------------------------------*/
429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
431 int8 roundingMode;
432 flag roundNearestEven;
433 int16 roundIncrement, roundBits;
434 flag isTiny;
436 roundingMode = STATUS(float_rounding_mode);
437 roundNearestEven = ( roundingMode == float_round_nearest_even );
438 roundIncrement = 0x200;
439 if ( ! roundNearestEven ) {
440 if ( roundingMode == float_round_to_zero ) {
441 roundIncrement = 0;
443 else {
444 roundIncrement = 0x3FF;
445 if ( zSign ) {
446 if ( roundingMode == float_round_up ) roundIncrement = 0;
448 else {
449 if ( roundingMode == float_round_down ) roundIncrement = 0;
453 roundBits = zSig & 0x3FF;
454 if ( 0x7FD <= (bits16) zExp ) {
455 if ( ( 0x7FD < zExp )
456 || ( ( zExp == 0x7FD )
457 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
459 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
460 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
462 if ( zExp < 0 ) {
463 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
464 isTiny =
465 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
466 || ( zExp < -1 )
467 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
468 shift64RightJamming( zSig, - zExp, &zSig );
469 zExp = 0;
470 roundBits = zSig & 0x3FF;
471 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
474 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
475 zSig = ( zSig + roundIncrement )>>10;
476 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
477 if ( zSig == 0 ) zExp = 0;
478 return packFloat64( zSign, zExp, zSig );
482 /*----------------------------------------------------------------------------
483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
484 | and significand `zSig', and returns the proper double-precision floating-
485 | point value corresponding to the abstract input. This routine is just like
486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
488 | floating-point exponent.
489 *----------------------------------------------------------------------------*/
491 static float64
492 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
494 int8 shiftCount;
496 shiftCount = countLeadingZeros64( zSig ) - 1;
497 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
501 #ifdef FLOATX80
503 /*----------------------------------------------------------------------------
504 | Returns the fraction bits of the extended double-precision floating-point
505 | value `a'.
506 *----------------------------------------------------------------------------*/
508 INLINE bits64 extractFloatx80Frac( floatx80 a )
511 return a.low;
515 /*----------------------------------------------------------------------------
516 | Returns the exponent bits of the extended double-precision floating-point
517 | value `a'.
518 *----------------------------------------------------------------------------*/
520 INLINE int32 extractFloatx80Exp( floatx80 a )
523 return a.high & 0x7FFF;
527 /*----------------------------------------------------------------------------
528 | Returns the sign bit of the extended double-precision floating-point value
529 | `a'.
530 *----------------------------------------------------------------------------*/
532 INLINE flag extractFloatx80Sign( floatx80 a )
535 return a.high>>15;
539 /*----------------------------------------------------------------------------
540 | Normalizes the subnormal extended double-precision floating-point value
541 | represented by the denormalized significand `aSig'. The normalized exponent
542 | and significand are stored at the locations pointed to by `zExpPtr' and
543 | `zSigPtr', respectively.
544 *----------------------------------------------------------------------------*/
546 static void
547 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
549 int8 shiftCount;
551 shiftCount = countLeadingZeros64( aSig );
552 *zSigPtr = aSig<<shiftCount;
553 *zExpPtr = 1 - shiftCount;
557 /*----------------------------------------------------------------------------
558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
559 | extended double-precision floating-point value, returning the result.
560 *----------------------------------------------------------------------------*/
562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
564 floatx80 z;
566 z.low = zSig;
567 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
568 return z;
572 /*----------------------------------------------------------------------------
573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
574 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
575 | and returns the proper extended double-precision floating-point value
576 | corresponding to the abstract input. Ordinarily, the abstract value is
577 | rounded and packed into the extended double-precision format, with the
578 | inexact exception raised if the abstract input cannot be represented
579 | exactly. However, if the abstract value is too large, the overflow and
580 | inexact exceptions are raised and an infinity or maximal finite value is
581 | returned. If the abstract value is too small, the input value is rounded to
582 | a subnormal number, and the underflow and inexact exceptions are raised if
583 | the abstract input cannot be represented exactly as a subnormal extended
584 | double-precision floating-point number.
585 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
586 | number of bits as single or double precision, respectively. Otherwise, the
587 | result is rounded to the full precision of the extended double-precision
588 | format.
589 | The input significand must be normalized or smaller. If the input
590 | significand is not normalized, `zExp' must be 0; in that case, the result
591 | returned is a subnormal number, and it must not require rounding. The
592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
593 | Floating-Point Arithmetic.
594 *----------------------------------------------------------------------------*/
596 static floatx80
597 roundAndPackFloatx80(
598 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
599 STATUS_PARAM)
601 int8 roundingMode;
602 flag roundNearestEven, increment, isTiny;
603 int64 roundIncrement, roundMask, roundBits;
605 roundingMode = STATUS(float_rounding_mode);
606 roundNearestEven = ( roundingMode == float_round_nearest_even );
607 if ( roundingPrecision == 80 ) goto precision80;
608 if ( roundingPrecision == 64 ) {
609 roundIncrement = LIT64( 0x0000000000000400 );
610 roundMask = LIT64( 0x00000000000007FF );
612 else if ( roundingPrecision == 32 ) {
613 roundIncrement = LIT64( 0x0000008000000000 );
614 roundMask = LIT64( 0x000000FFFFFFFFFF );
616 else {
617 goto precision80;
619 zSig0 |= ( zSig1 != 0 );
620 if ( ! roundNearestEven ) {
621 if ( roundingMode == float_round_to_zero ) {
622 roundIncrement = 0;
624 else {
625 roundIncrement = roundMask;
626 if ( zSign ) {
627 if ( roundingMode == float_round_up ) roundIncrement = 0;
629 else {
630 if ( roundingMode == float_round_down ) roundIncrement = 0;
634 roundBits = zSig0 & roundMask;
635 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
636 if ( ( 0x7FFE < zExp )
637 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
639 goto overflow;
641 if ( zExp <= 0 ) {
642 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
643 isTiny =
644 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
645 || ( zExp < 0 )
646 || ( zSig0 <= zSig0 + roundIncrement );
647 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
648 zExp = 0;
649 roundBits = zSig0 & roundMask;
650 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
651 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
652 zSig0 += roundIncrement;
653 if ( (sbits64) zSig0 < 0 ) zExp = 1;
654 roundIncrement = roundMask + 1;
655 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
656 roundMask |= roundIncrement;
658 zSig0 &= ~ roundMask;
659 return packFloatx80( zSign, zExp, zSig0 );
662 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
663 zSig0 += roundIncrement;
664 if ( zSig0 < roundIncrement ) {
665 ++zExp;
666 zSig0 = LIT64( 0x8000000000000000 );
668 roundIncrement = roundMask + 1;
669 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
670 roundMask |= roundIncrement;
672 zSig0 &= ~ roundMask;
673 if ( zSig0 == 0 ) zExp = 0;
674 return packFloatx80( zSign, zExp, zSig0 );
675 precision80:
676 increment = ( (sbits64) zSig1 < 0 );
677 if ( ! roundNearestEven ) {
678 if ( roundingMode == float_round_to_zero ) {
679 increment = 0;
681 else {
682 if ( zSign ) {
683 increment = ( roundingMode == float_round_down ) && zSig1;
685 else {
686 increment = ( roundingMode == float_round_up ) && zSig1;
690 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691 if ( ( 0x7FFE < zExp )
692 || ( ( zExp == 0x7FFE )
693 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
694 && increment
697 roundMask = 0;
698 overflow:
699 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
700 if ( ( roundingMode == float_round_to_zero )
701 || ( zSign && ( roundingMode == float_round_up ) )
702 || ( ! zSign && ( roundingMode == float_round_down ) )
704 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
706 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
708 if ( zExp <= 0 ) {
709 isTiny =
710 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
711 || ( zExp < 0 )
712 || ! increment
713 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
714 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
715 zExp = 0;
716 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
717 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
718 if ( roundNearestEven ) {
719 increment = ( (sbits64) zSig1 < 0 );
721 else {
722 if ( zSign ) {
723 increment = ( roundingMode == float_round_down ) && zSig1;
725 else {
726 increment = ( roundingMode == float_round_up ) && zSig1;
729 if ( increment ) {
730 ++zSig0;
731 zSig0 &=
732 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
733 if ( (sbits64) zSig0 < 0 ) zExp = 1;
735 return packFloatx80( zSign, zExp, zSig0 );
738 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
739 if ( increment ) {
740 ++zSig0;
741 if ( zSig0 == 0 ) {
742 ++zExp;
743 zSig0 = LIT64( 0x8000000000000000 );
745 else {
746 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
749 else {
750 if ( zSig0 == 0 ) zExp = 0;
752 return packFloatx80( zSign, zExp, zSig0 );
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent
758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input. This routine is just like
761 | `roundAndPackFloatx80' except that the input significand does not have to be
762 | normalized.
763 *----------------------------------------------------------------------------*/
765 static floatx80
766 normalizeRoundAndPackFloatx80(
767 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
768 STATUS_PARAM)
770 int8 shiftCount;
772 if ( zSig0 == 0 ) {
773 zSig0 = zSig1;
774 zSig1 = 0;
775 zExp -= 64;
777 shiftCount = countLeadingZeros64( zSig0 );
778 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
779 zExp -= shiftCount;
780 return
781 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
785 #endif
787 #ifdef FLOAT128
789 /*----------------------------------------------------------------------------
790 | Returns the least-significant 64 fraction bits of the quadruple-precision
791 | floating-point value `a'.
792 *----------------------------------------------------------------------------*/
794 INLINE bits64 extractFloat128Frac1( float128 a )
797 return a.low;
801 /*----------------------------------------------------------------------------
802 | Returns the most-significant 48 fraction bits of the quadruple-precision
803 | floating-point value `a'.
804 *----------------------------------------------------------------------------*/
806 INLINE bits64 extractFloat128Frac0( float128 a )
809 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
813 /*----------------------------------------------------------------------------
814 | Returns the exponent bits of the quadruple-precision floating-point value
815 | `a'.
816 *----------------------------------------------------------------------------*/
818 INLINE int32 extractFloat128Exp( float128 a )
821 return ( a.high>>48 ) & 0x7FFF;
825 /*----------------------------------------------------------------------------
826 | Returns the sign bit of the quadruple-precision floating-point value `a'.
827 *----------------------------------------------------------------------------*/
829 INLINE flag extractFloat128Sign( float128 a )
832 return a.high>>63;
836 /*----------------------------------------------------------------------------
837 | Normalizes the subnormal quadruple-precision floating-point value
838 | represented by the denormalized significand formed by the concatenation of
839 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
840 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
841 | significand are stored at the location pointed to by `zSig0Ptr', and the
842 | least significant 64 bits of the normalized significand are stored at the
843 | location pointed to by `zSig1Ptr'.
844 *----------------------------------------------------------------------------*/
846 static void
847 normalizeFloat128Subnormal(
848 bits64 aSig0,
849 bits64 aSig1,
850 int32 *zExpPtr,
851 bits64 *zSig0Ptr,
852 bits64 *zSig1Ptr
855 int8 shiftCount;
857 if ( aSig0 == 0 ) {
858 shiftCount = countLeadingZeros64( aSig1 ) - 15;
859 if ( shiftCount < 0 ) {
860 *zSig0Ptr = aSig1>>( - shiftCount );
861 *zSig1Ptr = aSig1<<( shiftCount & 63 );
863 else {
864 *zSig0Ptr = aSig1<<shiftCount;
865 *zSig1Ptr = 0;
867 *zExpPtr = - shiftCount - 63;
869 else {
870 shiftCount = countLeadingZeros64( aSig0 ) - 15;
871 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
872 *zExpPtr = 1 - shiftCount;
877 /*----------------------------------------------------------------------------
878 | Packs the sign `zSign', the exponent `zExp', and the significand formed
879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
880 | floating-point value, returning the result. After being shifted into the
881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
882 | added together to form the most significant 32 bits of the result. This
883 | means that any integer portion of `zSig0' will be added into the exponent.
884 | Since a properly normalized significand will have an integer portion equal
885 | to 1, the `zExp' input should be 1 less than the desired result exponent
886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
887 | significand.
888 *----------------------------------------------------------------------------*/
890 INLINE float128
891 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
893 float128 z;
895 z.low = zSig1;
896 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
897 return z;
901 /*----------------------------------------------------------------------------
902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
903 | and extended significand formed by the concatenation of `zSig0', `zSig1',
904 | and `zSig2', and returns the proper quadruple-precision floating-point value
905 | corresponding to the abstract input. Ordinarily, the abstract value is
906 | simply rounded and packed into the quadruple-precision format, with the
907 | inexact exception raised if the abstract input cannot be represented
908 | exactly. However, if the abstract value is too large, the overflow and
909 | inexact exceptions are raised and an infinity or maximal finite value is
910 | returned. If the abstract value is too small, the input value is rounded to
911 | a subnormal number, and the underflow and inexact exceptions are raised if
912 | the abstract input cannot be represented exactly as a subnormal quadruple-
913 | precision floating-point number.
914 | The input significand must be normalized or smaller. If the input
915 | significand is not normalized, `zExp' must be 0; in that case, the result
916 | returned is a subnormal number, and it must not require rounding. In the
917 | usual case that the input significand is normalized, `zExp' must be 1 less
918 | than the ``true'' floating-point exponent. The handling of underflow and
919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
920 *----------------------------------------------------------------------------*/
922 static float128
923 roundAndPackFloat128(
924 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
926 int8 roundingMode;
927 flag roundNearestEven, increment, isTiny;
929 roundingMode = STATUS(float_rounding_mode);
930 roundNearestEven = ( roundingMode == float_round_nearest_even );
931 increment = ( (sbits64) zSig2 < 0 );
932 if ( ! roundNearestEven ) {
933 if ( roundingMode == float_round_to_zero ) {
934 increment = 0;
936 else {
937 if ( zSign ) {
938 increment = ( roundingMode == float_round_down ) && zSig2;
940 else {
941 increment = ( roundingMode == float_round_up ) && zSig2;
945 if ( 0x7FFD <= (bits32) zExp ) {
946 if ( ( 0x7FFD < zExp )
947 || ( ( zExp == 0x7FFD )
948 && eq128(
949 LIT64( 0x0001FFFFFFFFFFFF ),
950 LIT64( 0xFFFFFFFFFFFFFFFF ),
951 zSig0,
952 zSig1
954 && increment
957 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
958 if ( ( roundingMode == float_round_to_zero )
959 || ( zSign && ( roundingMode == float_round_up ) )
960 || ( ! zSign && ( roundingMode == float_round_down ) )
962 return
963 packFloat128(
964 zSign,
965 0x7FFE,
966 LIT64( 0x0000FFFFFFFFFFFF ),
967 LIT64( 0xFFFFFFFFFFFFFFFF )
970 return packFloat128( zSign, 0x7FFF, 0, 0 );
972 if ( zExp < 0 ) {
973 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
974 isTiny =
975 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
976 || ( zExp < -1 )
977 || ! increment
978 || lt128(
979 zSig0,
980 zSig1,
981 LIT64( 0x0001FFFFFFFFFFFF ),
982 LIT64( 0xFFFFFFFFFFFFFFFF )
984 shift128ExtraRightJamming(
985 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
986 zExp = 0;
987 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
988 if ( roundNearestEven ) {
989 increment = ( (sbits64) zSig2 < 0 );
991 else {
992 if ( zSign ) {
993 increment = ( roundingMode == float_round_down ) && zSig2;
995 else {
996 increment = ( roundingMode == float_round_up ) && zSig2;
1001 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1002 if ( increment ) {
1003 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1006 else {
1007 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1009 return packFloat128( zSign, zExp, zSig0, zSig1 );
1013 /*----------------------------------------------------------------------------
1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1016 | returns the proper quadruple-precision floating-point value corresponding
1017 | to the abstract input. This routine is just like `roundAndPackFloat128'
1018 | except that the input significand has fewer bits and does not have to be
1019 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1020 | point exponent.
1021 *----------------------------------------------------------------------------*/
1023 static float128
1024 normalizeRoundAndPackFloat128(
1025 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1027 int8 shiftCount;
1028 bits64 zSig2;
1030 if ( zSig0 == 0 ) {
1031 zSig0 = zSig1;
1032 zSig1 = 0;
1033 zExp -= 64;
1035 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036 if ( 0 <= shiftCount ) {
1037 zSig2 = 0;
1038 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1040 else {
1041 shift128ExtraRightJamming(
1042 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1044 zExp -= shiftCount;
1045 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1049 #endif
1051 /*----------------------------------------------------------------------------
1052 | Returns the result of converting the 32-bit two's complement integer `a'
1053 | to the single-precision floating-point format. The conversion is performed
1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055 *----------------------------------------------------------------------------*/
1057 float32 int32_to_float32( int32 a STATUS_PARAM )
1059 flag zSign;
1061 if ( a == 0 ) return float32_zero;
1062 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063 zSign = ( a < 0 );
1064 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1068 /*----------------------------------------------------------------------------
1069 | Returns the result of converting the 32-bit two's complement integer `a'
1070 | to the double-precision floating-point format. The conversion is performed
1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072 *----------------------------------------------------------------------------*/
1074 float64 int32_to_float64( int32 a STATUS_PARAM )
1076 flag zSign;
1077 uint32 absA;
1078 int8 shiftCount;
1079 bits64 zSig;
1081 if ( a == 0 ) return float64_zero;
1082 zSign = ( a < 0 );
1083 absA = zSign ? - a : a;
1084 shiftCount = countLeadingZeros32( absA ) + 21;
1085 zSig = absA;
1086 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1090 #ifdef FLOATX80
1092 /*----------------------------------------------------------------------------
1093 | Returns the result of converting the 32-bit two's complement integer `a'
1094 | to the extended double-precision floating-point format. The conversion
1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096 | Arithmetic.
1097 *----------------------------------------------------------------------------*/
1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1101 flag zSign;
1102 uint32 absA;
1103 int8 shiftCount;
1104 bits64 zSig;
1106 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107 zSign = ( a < 0 );
1108 absA = zSign ? - a : a;
1109 shiftCount = countLeadingZeros32( absA ) + 32;
1110 zSig = absA;
1111 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1115 #endif
1117 #ifdef FLOAT128
1119 /*----------------------------------------------------------------------------
1120 | Returns the result of converting the 32-bit two's complement integer `a' to
1121 | the quadruple-precision floating-point format. The conversion is performed
1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123 *----------------------------------------------------------------------------*/
1125 float128 int32_to_float128( int32 a STATUS_PARAM )
1127 flag zSign;
1128 uint32 absA;
1129 int8 shiftCount;
1130 bits64 zSig0;
1132 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133 zSign = ( a < 0 );
1134 absA = zSign ? - a : a;
1135 shiftCount = countLeadingZeros32( absA ) + 17;
1136 zSig0 = absA;
1137 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1141 #endif
1143 /*----------------------------------------------------------------------------
1144 | Returns the result of converting the 64-bit two's complement integer `a'
1145 | to the single-precision floating-point format. The conversion is performed
1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147 *----------------------------------------------------------------------------*/
1149 float32 int64_to_float32( int64 a STATUS_PARAM )
1151 flag zSign;
1152 uint64 absA;
1153 int8 shiftCount;
1155 if ( a == 0 ) return float32_zero;
1156 zSign = ( a < 0 );
1157 absA = zSign ? - a : a;
1158 shiftCount = countLeadingZeros64( absA ) - 40;
1159 if ( 0 <= shiftCount ) {
1160 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1162 else {
1163 shiftCount += 7;
1164 if ( shiftCount < 0 ) {
1165 shift64RightJamming( absA, - shiftCount, &absA );
1167 else {
1168 absA <<= shiftCount;
1170 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1175 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1177 int8 shiftCount;
1179 if ( a == 0 ) return float32_zero;
1180 shiftCount = countLeadingZeros64( a ) - 40;
1181 if ( 0 <= shiftCount ) {
1182 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1184 else {
1185 shiftCount += 7;
1186 if ( shiftCount < 0 ) {
1187 shift64RightJamming( a, - shiftCount, &a );
1189 else {
1190 a <<= shiftCount;
1192 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1196 /*----------------------------------------------------------------------------
1197 | Returns the result of converting the 64-bit two's complement integer `a'
1198 | to the double-precision floating-point format. The conversion is performed
1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200 *----------------------------------------------------------------------------*/
1202 float64 int64_to_float64( int64 a STATUS_PARAM )
1204 flag zSign;
1206 if ( a == 0 ) return float64_zero;
1207 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208 return packFloat64( 1, 0x43E, 0 );
1210 zSign = ( a < 0 );
1211 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1215 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1217 if ( a == 0 ) return float64_zero;
1218 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1222 #ifdef FLOATX80
1224 /*----------------------------------------------------------------------------
1225 | Returns the result of converting the 64-bit two's complement integer `a'
1226 | to the extended double-precision floating-point format. The conversion
1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228 | Arithmetic.
1229 *----------------------------------------------------------------------------*/
1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1233 flag zSign;
1234 uint64 absA;
1235 int8 shiftCount;
1237 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238 zSign = ( a < 0 );
1239 absA = zSign ? - a : a;
1240 shiftCount = countLeadingZeros64( absA );
1241 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1245 #endif
1247 #ifdef FLOAT128
1249 /*----------------------------------------------------------------------------
1250 | Returns the result of converting the 64-bit two's complement integer `a' to
1251 | the quadruple-precision floating-point format. The conversion is performed
1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253 *----------------------------------------------------------------------------*/
1255 float128 int64_to_float128( int64 a STATUS_PARAM )
1257 flag zSign;
1258 uint64 absA;
1259 int8 shiftCount;
1260 int32 zExp;
1261 bits64 zSig0, zSig1;
1263 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264 zSign = ( a < 0 );
1265 absA = zSign ? - a : a;
1266 shiftCount = countLeadingZeros64( absA ) + 49;
1267 zExp = 0x406E - shiftCount;
1268 if ( 64 <= shiftCount ) {
1269 zSig1 = 0;
1270 zSig0 = absA;
1271 shiftCount -= 64;
1273 else {
1274 zSig1 = absA;
1275 zSig0 = 0;
1277 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278 return packFloat128( zSign, zExp, zSig0, zSig1 );
1282 #endif
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the single-precision floating-point value
1286 | `a' to the 32-bit two's complement integer format. The conversion is
1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic---which means in particular that the conversion is rounded
1289 | according to the current rounding mode. If `a' is a NaN, the largest
1290 | positive integer is returned. Otherwise, if the conversion overflows, the
1291 | largest integer with the same sign as `a' is returned.
1292 *----------------------------------------------------------------------------*/
1294 int32 float32_to_int32( float32 a STATUS_PARAM )
1296 flag aSign;
1297 int16 aExp, shiftCount;
1298 bits32 aSig;
1299 bits64 aSig64;
1301 aSig = extractFloat32Frac( a );
1302 aExp = extractFloat32Exp( a );
1303 aSign = extractFloat32Sign( a );
1304 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305 if ( aExp ) aSig |= 0x00800000;
1306 shiftCount = 0xAF - aExp;
1307 aSig64 = aSig;
1308 aSig64 <<= 32;
1309 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1314 /*----------------------------------------------------------------------------
1315 | Returns the result of converting the single-precision floating-point value
1316 | `a' to the 32-bit two's complement integer format. The conversion is
1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 | Arithmetic, except that the conversion is always rounded toward zero.
1319 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1320 | the conversion overflows, the largest integer with the same sign as `a' is
1321 | returned.
1322 *----------------------------------------------------------------------------*/
1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1326 flag aSign;
1327 int16 aExp, shiftCount;
1328 bits32 aSig;
1329 int32 z;
1331 aSig = extractFloat32Frac( a );
1332 aExp = extractFloat32Exp( a );
1333 aSign = extractFloat32Sign( a );
1334 shiftCount = aExp - 0x9E;
1335 if ( 0 <= shiftCount ) {
1336 if ( float32_val(a) != 0xCF000000 ) {
1337 float_raise( float_flag_invalid STATUS_VAR);
1338 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1340 return (sbits32) 0x80000000;
1342 else if ( aExp <= 0x7E ) {
1343 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1344 return 0;
1346 aSig = ( aSig | 0x00800000 )<<8;
1347 z = aSig>>( - shiftCount );
1348 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349 STATUS(float_exception_flags) |= float_flag_inexact;
1351 if ( aSign ) z = - z;
1352 return z;
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 64-bit two's complement integer format. The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic---which means in particular that the conversion is rounded
1361 | according to the current rounding mode. If `a' is a NaN, the largest
1362 | positive integer is returned. Otherwise, if the conversion overflows, the
1363 | largest integer with the same sign as `a' is returned.
1364 *----------------------------------------------------------------------------*/
1366 int64 float32_to_int64( float32 a STATUS_PARAM )
1368 flag aSign;
1369 int16 aExp, shiftCount;
1370 bits32 aSig;
1371 bits64 aSig64, aSigExtra;
1373 aSig = extractFloat32Frac( a );
1374 aExp = extractFloat32Exp( a );
1375 aSign = extractFloat32Sign( a );
1376 shiftCount = 0xBE - aExp;
1377 if ( shiftCount < 0 ) {
1378 float_raise( float_flag_invalid STATUS_VAR);
1379 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1380 return LIT64( 0x7FFFFFFFFFFFFFFF );
1382 return (sbits64) LIT64( 0x8000000000000000 );
1384 if ( aExp ) aSig |= 0x00800000;
1385 aSig64 = aSig;
1386 aSig64 <<= 40;
1387 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1388 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1392 /*----------------------------------------------------------------------------
1393 | Returns the result of converting the single-precision floating-point value
1394 | `a' to the 64-bit two's complement integer format. The conversion is
1395 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1396 | Arithmetic, except that the conversion is always rounded toward zero. If
1397 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1398 | conversion overflows, the largest integer with the same sign as `a' is
1399 | returned.
1400 *----------------------------------------------------------------------------*/
1402 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1404 flag aSign;
1405 int16 aExp, shiftCount;
1406 bits32 aSig;
1407 bits64 aSig64;
1408 int64 z;
1410 aSig = extractFloat32Frac( a );
1411 aExp = extractFloat32Exp( a );
1412 aSign = extractFloat32Sign( a );
1413 shiftCount = aExp - 0xBE;
1414 if ( 0 <= shiftCount ) {
1415 if ( float32_val(a) != 0xDF000000 ) {
1416 float_raise( float_flag_invalid STATUS_VAR);
1417 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1418 return LIT64( 0x7FFFFFFFFFFFFFFF );
1421 return (sbits64) LIT64( 0x8000000000000000 );
1423 else if ( aExp <= 0x7E ) {
1424 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1425 return 0;
1427 aSig64 = aSig | 0x00800000;
1428 aSig64 <<= 40;
1429 z = aSig64>>( - shiftCount );
1430 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1431 STATUS(float_exception_flags) |= float_flag_inexact;
1433 if ( aSign ) z = - z;
1434 return z;
1438 /*----------------------------------------------------------------------------
1439 | Returns the result of converting the single-precision floating-point value
1440 | `a' to the double-precision floating-point format. The conversion is
1441 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1442 | Arithmetic.
1443 *----------------------------------------------------------------------------*/
1445 float64 float32_to_float64( float32 a STATUS_PARAM )
1447 flag aSign;
1448 int16 aExp;
1449 bits32 aSig;
1451 aSig = extractFloat32Frac( a );
1452 aExp = extractFloat32Exp( a );
1453 aSign = extractFloat32Sign( a );
1454 if ( aExp == 0xFF ) {
1455 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1456 return packFloat64( aSign, 0x7FF, 0 );
1458 if ( aExp == 0 ) {
1459 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1460 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1461 --aExp;
1463 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1467 #ifdef FLOATX80
1469 /*----------------------------------------------------------------------------
1470 | Returns the result of converting the single-precision floating-point value
1471 | `a' to the extended double-precision floating-point format. The conversion
1472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1473 | Arithmetic.
1474 *----------------------------------------------------------------------------*/
1476 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1478 flag aSign;
1479 int16 aExp;
1480 bits32 aSig;
1482 aSig = extractFloat32Frac( a );
1483 aExp = extractFloat32Exp( a );
1484 aSign = extractFloat32Sign( a );
1485 if ( aExp == 0xFF ) {
1486 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1487 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1489 if ( aExp == 0 ) {
1490 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1491 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1493 aSig |= 0x00800000;
1494 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1498 #endif
1500 #ifdef FLOAT128
1502 /*----------------------------------------------------------------------------
1503 | Returns the result of converting the single-precision floating-point value
1504 | `a' to the double-precision floating-point format. The conversion is
1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1506 | Arithmetic.
1507 *----------------------------------------------------------------------------*/
1509 float128 float32_to_float128( float32 a STATUS_PARAM )
1511 flag aSign;
1512 int16 aExp;
1513 bits32 aSig;
1515 aSig = extractFloat32Frac( a );
1516 aExp = extractFloat32Exp( a );
1517 aSign = extractFloat32Sign( a );
1518 if ( aExp == 0xFF ) {
1519 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1520 return packFloat128( aSign, 0x7FFF, 0, 0 );
1522 if ( aExp == 0 ) {
1523 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1524 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525 --aExp;
1527 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1531 #endif
1533 /*----------------------------------------------------------------------------
1534 | Rounds the single-precision floating-point value `a' to an integer, and
1535 | returns the result as a single-precision floating-point value. The
1536 | operation is performed according to the IEC/IEEE Standard for Binary
1537 | Floating-Point Arithmetic.
1538 *----------------------------------------------------------------------------*/
1540 float32 float32_round_to_int( float32 a STATUS_PARAM)
1542 flag aSign;
1543 int16 aExp;
1544 bits32 lastBitMask, roundBitsMask;
1545 int8 roundingMode;
1546 bits32 z;
1548 aExp = extractFloat32Exp( a );
1549 if ( 0x96 <= aExp ) {
1550 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1551 return propagateFloat32NaN( a, a STATUS_VAR );
1553 return a;
1555 if ( aExp <= 0x7E ) {
1556 if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1557 STATUS(float_exception_flags) |= float_flag_inexact;
1558 aSign = extractFloat32Sign( a );
1559 switch ( STATUS(float_rounding_mode) ) {
1560 case float_round_nearest_even:
1561 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1562 return packFloat32( aSign, 0x7F, 0 );
1564 break;
1565 case float_round_down:
1566 return make_float32(aSign ? 0xBF800000 : 0);
1567 case float_round_up:
1568 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1570 return packFloat32( aSign, 0, 0 );
1572 lastBitMask = 1;
1573 lastBitMask <<= 0x96 - aExp;
1574 roundBitsMask = lastBitMask - 1;
1575 z = float32_val(a);
1576 roundingMode = STATUS(float_rounding_mode);
1577 if ( roundingMode == float_round_nearest_even ) {
1578 z += lastBitMask>>1;
1579 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1581 else if ( roundingMode != float_round_to_zero ) {
1582 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1583 z += roundBitsMask;
1586 z &= ~ roundBitsMask;
1587 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1588 return make_float32(z);
1592 /*----------------------------------------------------------------------------
1593 | Returns the result of adding the absolute values of the single-precision
1594 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1595 | before being returned. `zSign' is ignored if the result is a NaN.
1596 | The addition is performed according to the IEC/IEEE Standard for Binary
1597 | Floating-Point Arithmetic.
1598 *----------------------------------------------------------------------------*/
1600 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1602 int16 aExp, bExp, zExp;
1603 bits32 aSig, bSig, zSig;
1604 int16 expDiff;
1606 aSig = extractFloat32Frac( a );
1607 aExp = extractFloat32Exp( a );
1608 bSig = extractFloat32Frac( b );
1609 bExp = extractFloat32Exp( b );
1610 expDiff = aExp - bExp;
1611 aSig <<= 6;
1612 bSig <<= 6;
1613 if ( 0 < expDiff ) {
1614 if ( aExp == 0xFF ) {
1615 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1616 return a;
1618 if ( bExp == 0 ) {
1619 --expDiff;
1621 else {
1622 bSig |= 0x20000000;
1624 shift32RightJamming( bSig, expDiff, &bSig );
1625 zExp = aExp;
1627 else if ( expDiff < 0 ) {
1628 if ( bExp == 0xFF ) {
1629 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1630 return packFloat32( zSign, 0xFF, 0 );
1632 if ( aExp == 0 ) {
1633 ++expDiff;
1635 else {
1636 aSig |= 0x20000000;
1638 shift32RightJamming( aSig, - expDiff, &aSig );
1639 zExp = bExp;
1641 else {
1642 if ( aExp == 0xFF ) {
1643 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1644 return a;
1646 if ( aExp == 0 ) {
1647 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1648 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1650 zSig = 0x40000000 + aSig + bSig;
1651 zExp = aExp;
1652 goto roundAndPack;
1654 aSig |= 0x20000000;
1655 zSig = ( aSig + bSig )<<1;
1656 --zExp;
1657 if ( (sbits32) zSig < 0 ) {
1658 zSig = aSig + bSig;
1659 ++zExp;
1661 roundAndPack:
1662 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1666 /*----------------------------------------------------------------------------
1667 | Returns the result of subtracting the absolute values of the single-
1668 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1669 | difference is negated before being returned. `zSign' is ignored if the
1670 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1671 | Standard for Binary Floating-Point Arithmetic.
1672 *----------------------------------------------------------------------------*/
1674 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1676 int16 aExp, bExp, zExp;
1677 bits32 aSig, bSig, zSig;
1678 int16 expDiff;
1680 aSig = extractFloat32Frac( a );
1681 aExp = extractFloat32Exp( a );
1682 bSig = extractFloat32Frac( b );
1683 bExp = extractFloat32Exp( b );
1684 expDiff = aExp - bExp;
1685 aSig <<= 7;
1686 bSig <<= 7;
1687 if ( 0 < expDiff ) goto aExpBigger;
1688 if ( expDiff < 0 ) goto bExpBigger;
1689 if ( aExp == 0xFF ) {
1690 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1691 float_raise( float_flag_invalid STATUS_VAR);
1692 return float32_default_nan;
1694 if ( aExp == 0 ) {
1695 aExp = 1;
1696 bExp = 1;
1698 if ( bSig < aSig ) goto aBigger;
1699 if ( aSig < bSig ) goto bBigger;
1700 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1701 bExpBigger:
1702 if ( bExp == 0xFF ) {
1703 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1704 return packFloat32( zSign ^ 1, 0xFF, 0 );
1706 if ( aExp == 0 ) {
1707 ++expDiff;
1709 else {
1710 aSig |= 0x40000000;
1712 shift32RightJamming( aSig, - expDiff, &aSig );
1713 bSig |= 0x40000000;
1714 bBigger:
1715 zSig = bSig - aSig;
1716 zExp = bExp;
1717 zSign ^= 1;
1718 goto normalizeRoundAndPack;
1719 aExpBigger:
1720 if ( aExp == 0xFF ) {
1721 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1722 return a;
1724 if ( bExp == 0 ) {
1725 --expDiff;
1727 else {
1728 bSig |= 0x40000000;
1730 shift32RightJamming( bSig, expDiff, &bSig );
1731 aSig |= 0x40000000;
1732 aBigger:
1733 zSig = aSig - bSig;
1734 zExp = aExp;
1735 normalizeRoundAndPack:
1736 --zExp;
1737 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1741 /*----------------------------------------------------------------------------
1742 | Returns the result of adding the single-precision floating-point values `a'
1743 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1744 | Binary Floating-Point Arithmetic.
1745 *----------------------------------------------------------------------------*/
1747 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1749 flag aSign, bSign;
1751 aSign = extractFloat32Sign( a );
1752 bSign = extractFloat32Sign( b );
1753 if ( aSign == bSign ) {
1754 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1756 else {
1757 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1762 /*----------------------------------------------------------------------------
1763 | Returns the result of subtracting the single-precision floating-point values
1764 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1765 | for Binary Floating-Point Arithmetic.
1766 *----------------------------------------------------------------------------*/
1768 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1770 flag aSign, bSign;
1772 aSign = extractFloat32Sign( a );
1773 bSign = extractFloat32Sign( b );
1774 if ( aSign == bSign ) {
1775 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1777 else {
1778 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of multiplying the single-precision floating-point values
1785 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1786 | for Binary Floating-Point Arithmetic.
1787 *----------------------------------------------------------------------------*/
1789 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1791 flag aSign, bSign, zSign;
1792 int16 aExp, bExp, zExp;
1793 bits32 aSig, bSig;
1794 bits64 zSig64;
1795 bits32 zSig;
1797 aSig = extractFloat32Frac( a );
1798 aExp = extractFloat32Exp( a );
1799 aSign = extractFloat32Sign( a );
1800 bSig = extractFloat32Frac( b );
1801 bExp = extractFloat32Exp( b );
1802 bSign = extractFloat32Sign( b );
1803 zSign = aSign ^ bSign;
1804 if ( aExp == 0xFF ) {
1805 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1806 return propagateFloat32NaN( a, b STATUS_VAR );
1808 if ( ( bExp | bSig ) == 0 ) {
1809 float_raise( float_flag_invalid STATUS_VAR);
1810 return float32_default_nan;
1812 return packFloat32( zSign, 0xFF, 0 );
1814 if ( bExp == 0xFF ) {
1815 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816 if ( ( aExp | aSig ) == 0 ) {
1817 float_raise( float_flag_invalid STATUS_VAR);
1818 return float32_default_nan;
1820 return packFloat32( zSign, 0xFF, 0 );
1822 if ( aExp == 0 ) {
1823 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1824 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1826 if ( bExp == 0 ) {
1827 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1828 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1830 zExp = aExp + bExp - 0x7F;
1831 aSig = ( aSig | 0x00800000 )<<7;
1832 bSig = ( bSig | 0x00800000 )<<8;
1833 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1834 zSig = zSig64;
1835 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1836 zSig <<= 1;
1837 --zExp;
1839 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1843 /*----------------------------------------------------------------------------
1844 | Returns the result of dividing the single-precision floating-point value `a'
1845 | by the corresponding value `b'. The operation is performed according to the
1846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847 *----------------------------------------------------------------------------*/
1849 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1851 flag aSign, bSign, zSign;
1852 int16 aExp, bExp, zExp;
1853 bits32 aSig, bSig, zSig;
1855 aSig = extractFloat32Frac( a );
1856 aExp = extractFloat32Exp( a );
1857 aSign = extractFloat32Sign( a );
1858 bSig = extractFloat32Frac( b );
1859 bExp = extractFloat32Exp( b );
1860 bSign = extractFloat32Sign( b );
1861 zSign = aSign ^ bSign;
1862 if ( aExp == 0xFF ) {
1863 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864 if ( bExp == 0xFF ) {
1865 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1866 float_raise( float_flag_invalid STATUS_VAR);
1867 return float32_default_nan;
1869 return packFloat32( zSign, 0xFF, 0 );
1871 if ( bExp == 0xFF ) {
1872 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1873 return packFloat32( zSign, 0, 0 );
1875 if ( bExp == 0 ) {
1876 if ( bSig == 0 ) {
1877 if ( ( aExp | aSig ) == 0 ) {
1878 float_raise( float_flag_invalid STATUS_VAR);
1879 return float32_default_nan;
1881 float_raise( float_flag_divbyzero STATUS_VAR);
1882 return packFloat32( zSign, 0xFF, 0 );
1884 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1886 if ( aExp == 0 ) {
1887 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1890 zExp = aExp - bExp + 0x7D;
1891 aSig = ( aSig | 0x00800000 )<<7;
1892 bSig = ( bSig | 0x00800000 )<<8;
1893 if ( bSig <= ( aSig + aSig ) ) {
1894 aSig >>= 1;
1895 ++zExp;
1897 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1898 if ( ( zSig & 0x3F ) == 0 ) {
1899 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1901 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1905 /*----------------------------------------------------------------------------
1906 | Returns the remainder of the single-precision floating-point value `a'
1907 | with respect to the corresponding value `b'. The operation is performed
1908 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909 *----------------------------------------------------------------------------*/
1911 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1913 flag aSign, zSign;
1914 int16 aExp, bExp, expDiff;
1915 bits32 aSig, bSig;
1916 bits32 q;
1917 bits64 aSig64, bSig64, q64;
1918 bits32 alternateASig;
1919 sbits32 sigMean;
1921 aSig = extractFloat32Frac( a );
1922 aExp = extractFloat32Exp( a );
1923 aSign = extractFloat32Sign( a );
1924 bSig = extractFloat32Frac( b );
1925 bExp = extractFloat32Exp( b );
1926 if ( aExp == 0xFF ) {
1927 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1928 return propagateFloat32NaN( a, b STATUS_VAR );
1930 float_raise( float_flag_invalid STATUS_VAR);
1931 return float32_default_nan;
1933 if ( bExp == 0xFF ) {
1934 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1935 return a;
1937 if ( bExp == 0 ) {
1938 if ( bSig == 0 ) {
1939 float_raise( float_flag_invalid STATUS_VAR);
1940 return float32_default_nan;
1942 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1944 if ( aExp == 0 ) {
1945 if ( aSig == 0 ) return a;
1946 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1948 expDiff = aExp - bExp;
1949 aSig |= 0x00800000;
1950 bSig |= 0x00800000;
1951 if ( expDiff < 32 ) {
1952 aSig <<= 8;
1953 bSig <<= 8;
1954 if ( expDiff < 0 ) {
1955 if ( expDiff < -1 ) return a;
1956 aSig >>= 1;
1958 q = ( bSig <= aSig );
1959 if ( q ) aSig -= bSig;
1960 if ( 0 < expDiff ) {
1961 q = ( ( (bits64) aSig )<<32 ) / bSig;
1962 q >>= 32 - expDiff;
1963 bSig >>= 2;
1964 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1966 else {
1967 aSig >>= 2;
1968 bSig >>= 2;
1971 else {
1972 if ( bSig <= aSig ) aSig -= bSig;
1973 aSig64 = ( (bits64) aSig )<<40;
1974 bSig64 = ( (bits64) bSig )<<40;
1975 expDiff -= 64;
1976 while ( 0 < expDiff ) {
1977 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1978 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1979 aSig64 = - ( ( bSig * q64 )<<38 );
1980 expDiff -= 62;
1982 expDiff += 64;
1983 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1984 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1985 q = q64>>( 64 - expDiff );
1986 bSig <<= 6;
1987 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1989 do {
1990 alternateASig = aSig;
1991 ++q;
1992 aSig -= bSig;
1993 } while ( 0 <= (sbits32) aSig );
1994 sigMean = aSig + alternateASig;
1995 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1996 aSig = alternateASig;
1998 zSign = ( (sbits32) aSig < 0 );
1999 if ( zSign ) aSig = - aSig;
2000 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2004 /*----------------------------------------------------------------------------
2005 | Returns the square root of the single-precision floating-point value `a'.
2006 | The operation is performed according to the IEC/IEEE Standard for Binary
2007 | Floating-Point Arithmetic.
2008 *----------------------------------------------------------------------------*/
2010 float32 float32_sqrt( float32 a STATUS_PARAM )
2012 flag aSign;
2013 int16 aExp, zExp;
2014 bits32 aSig, zSig;
2015 bits64 rem, term;
2017 aSig = extractFloat32Frac( a );
2018 aExp = extractFloat32Exp( a );
2019 aSign = extractFloat32Sign( a );
2020 if ( aExp == 0xFF ) {
2021 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2022 if ( ! aSign ) return a;
2023 float_raise( float_flag_invalid STATUS_VAR);
2024 return float32_default_nan;
2026 if ( aSign ) {
2027 if ( ( aExp | aSig ) == 0 ) return a;
2028 float_raise( float_flag_invalid STATUS_VAR);
2029 return float32_default_nan;
2031 if ( aExp == 0 ) {
2032 if ( aSig == 0 ) return float32_zero;
2033 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2035 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2036 aSig = ( aSig | 0x00800000 )<<8;
2037 zSig = estimateSqrt32( aExp, aSig ) + 2;
2038 if ( ( zSig & 0x7F ) <= 5 ) {
2039 if ( zSig < 2 ) {
2040 zSig = 0x7FFFFFFF;
2041 goto roundAndPack;
2043 aSig >>= aExp & 1;
2044 term = ( (bits64) zSig ) * zSig;
2045 rem = ( ( (bits64) aSig )<<32 ) - term;
2046 while ( (sbits64) rem < 0 ) {
2047 --zSig;
2048 rem += ( ( (bits64) zSig )<<1 ) | 1;
2050 zSig |= ( rem != 0 );
2052 shift32RightJamming( zSig, 1, &zSig );
2053 roundAndPack:
2054 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2058 /*----------------------------------------------------------------------------
2059 | Returns the binary exponential of the single-precision floating-point value
2060 | `a'. The operation is performed according to the IEC/IEEE Standard for
2061 | Binary Floating-Point Arithmetic.
2063 | Uses the following identities:
2065 | 1. -------------------------------------------------------------------------
2066 | x x*ln(2)
2067 | 2 = e
2069 | 2. -------------------------------------------------------------------------
2070 | 2 3 4 5 n
2071 | x x x x x x x
2072 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2073 | 1! 2! 3! 4! 5! n!
2074 *----------------------------------------------------------------------------*/
2076 static const float64 float32_exp2_coefficients[15] =
2078 make_float64( 0x3ff0000000000000ll ), /* 1 */
2079 make_float64( 0x3fe0000000000000ll ), /* 2 */
2080 make_float64( 0x3fc5555555555555ll ), /* 3 */
2081 make_float64( 0x3fa5555555555555ll ), /* 4 */
2082 make_float64( 0x3f81111111111111ll ), /* 5 */
2083 make_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2084 make_float64( 0x3f2a01a01a01a01all ), /* 7 */
2085 make_float64( 0x3efa01a01a01a01all ), /* 8 */
2086 make_float64( 0x3ec71de3a556c734ll ), /* 9 */
2087 make_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2088 make_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2089 make_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2090 make_float64( 0x3de6124613a86d09ll ), /* 13 */
2091 make_float64( 0x3da93974a8c07c9dll ), /* 14 */
2092 make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2095 float32 float32_exp2( float32 a STATUS_PARAM )
2097 flag aSign;
2098 int16 aExp;
2099 bits32 aSig;
2100 float64 r, x, xn;
2101 int i;
2103 aSig = extractFloat32Frac( a );
2104 aExp = extractFloat32Exp( a );
2105 aSign = extractFloat32Sign( a );
2107 if ( aExp == 0xFF) {
2108 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2109 return (aSign) ? float32_zero : a;
2111 if (aExp == 0) {
2112 if (aSig == 0) return float32_one;
2115 float_raise( float_flag_inexact STATUS_VAR);
2117 /* ******************************* */
2118 /* using float64 for approximation */
2119 /* ******************************* */
2120 x = float32_to_float64(a STATUS_VAR);
2121 x = float64_mul(x, float64_ln2 STATUS_VAR);
2123 xn = x;
2124 r = float64_one;
2125 for (i = 0 ; i < 15 ; i++) {
2126 float64 f;
2128 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2129 r = float64_add(r, f STATUS_VAR);
2131 xn = float64_mul(xn, x STATUS_VAR);
2134 return float64_to_float32(r, status);
2137 /*----------------------------------------------------------------------------
2138 | Returns the binary log of the single-precision floating-point value `a'.
2139 | The operation is performed according to the IEC/IEEE Standard for Binary
2140 | Floating-Point Arithmetic.
2141 *----------------------------------------------------------------------------*/
2142 float32 float32_log2( float32 a STATUS_PARAM )
2144 flag aSign, zSign;
2145 int16 aExp;
2146 bits32 aSig, zSig, i;
2148 aSig = extractFloat32Frac( a );
2149 aExp = extractFloat32Exp( a );
2150 aSign = extractFloat32Sign( a );
2152 if ( aExp == 0 ) {
2153 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2154 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2156 if ( aSign ) {
2157 float_raise( float_flag_invalid STATUS_VAR);
2158 return float32_default_nan;
2160 if ( aExp == 0xFF ) {
2161 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2162 return a;
2165 aExp -= 0x7F;
2166 aSig |= 0x00800000;
2167 zSign = aExp < 0;
2168 zSig = aExp << 23;
2170 for (i = 1 << 22; i > 0; i >>= 1) {
2171 aSig = ( (bits64)aSig * aSig ) >> 23;
2172 if ( aSig & 0x01000000 ) {
2173 aSig >>= 1;
2174 zSig |= i;
2178 if ( zSign )
2179 zSig = -zSig;
2181 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2184 /*----------------------------------------------------------------------------
2185 | Returns 1 if the single-precision floating-point value `a' is equal to
2186 | the corresponding value `b', and 0 otherwise. The comparison is performed
2187 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2188 *----------------------------------------------------------------------------*/
2190 int float32_eq( float32 a, float32 b STATUS_PARAM )
2193 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2194 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2196 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2197 float_raise( float_flag_invalid STATUS_VAR);
2199 return 0;
2201 return ( float32_val(a) == float32_val(b) ) ||
2202 ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2206 /*----------------------------------------------------------------------------
2207 | Returns 1 if the single-precision floating-point value `a' is less than
2208 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2209 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2210 | Arithmetic.
2211 *----------------------------------------------------------------------------*/
2213 int float32_le( float32 a, float32 b STATUS_PARAM )
2215 flag aSign, bSign;
2216 bits32 av, bv;
2218 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2219 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2221 float_raise( float_flag_invalid STATUS_VAR);
2222 return 0;
2224 aSign = extractFloat32Sign( a );
2225 bSign = extractFloat32Sign( b );
2226 av = float32_val(a);
2227 bv = float32_val(b);
2228 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2229 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2233 /*----------------------------------------------------------------------------
2234 | Returns 1 if the single-precision floating-point value `a' is less than
2235 | the corresponding value `b', and 0 otherwise. The comparison is performed
2236 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2237 *----------------------------------------------------------------------------*/
2239 int float32_lt( float32 a, float32 b STATUS_PARAM )
2241 flag aSign, bSign;
2242 bits32 av, bv;
2244 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2245 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2247 float_raise( float_flag_invalid STATUS_VAR);
2248 return 0;
2250 aSign = extractFloat32Sign( a );
2251 bSign = extractFloat32Sign( b );
2252 av = float32_val(a);
2253 bv = float32_val(b);
2254 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2255 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2259 /*----------------------------------------------------------------------------
2260 | Returns 1 if the single-precision floating-point value `a' is equal to
2261 | the corresponding value `b', and 0 otherwise. The invalid exception is
2262 | raised if either operand is a NaN. Otherwise, the comparison is performed
2263 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2264 *----------------------------------------------------------------------------*/
2266 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2268 bits32 av, bv;
2270 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2271 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2273 float_raise( float_flag_invalid STATUS_VAR);
2274 return 0;
2276 av = float32_val(a);
2277 bv = float32_val(b);
2278 return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2282 /*----------------------------------------------------------------------------
2283 | Returns 1 if the single-precision floating-point value `a' is less than or
2284 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2285 | cause an exception. Otherwise, the comparison is performed according to the
2286 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2287 *----------------------------------------------------------------------------*/
2289 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2291 flag aSign, bSign;
2292 bits32 av, bv;
2294 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2295 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2297 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2298 float_raise( float_flag_invalid STATUS_VAR);
2300 return 0;
2302 aSign = extractFloat32Sign( a );
2303 bSign = extractFloat32Sign( b );
2304 av = float32_val(a);
2305 bv = float32_val(b);
2306 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2307 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2311 /*----------------------------------------------------------------------------
2312 | Returns 1 if the single-precision floating-point value `a' is less than
2313 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2314 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2315 | Standard for Binary Floating-Point Arithmetic.
2316 *----------------------------------------------------------------------------*/
2318 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2320 flag aSign, bSign;
2321 bits32 av, bv;
2323 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2324 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2326 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2327 float_raise( float_flag_invalid STATUS_VAR);
2329 return 0;
2331 aSign = extractFloat32Sign( a );
2332 bSign = extractFloat32Sign( b );
2333 av = float32_val(a);
2334 bv = float32_val(b);
2335 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2336 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2340 /*----------------------------------------------------------------------------
2341 | Returns the result of converting the double-precision floating-point value
2342 | `a' to the 32-bit two's complement integer format. The conversion is
2343 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2344 | Arithmetic---which means in particular that the conversion is rounded
2345 | according to the current rounding mode. If `a' is a NaN, the largest
2346 | positive integer is returned. Otherwise, if the conversion overflows, the
2347 | largest integer with the same sign as `a' is returned.
2348 *----------------------------------------------------------------------------*/
2350 int32 float64_to_int32( float64 a STATUS_PARAM )
2352 flag aSign;
2353 int16 aExp, shiftCount;
2354 bits64 aSig;
2356 aSig = extractFloat64Frac( a );
2357 aExp = extractFloat64Exp( a );
2358 aSign = extractFloat64Sign( a );
2359 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2360 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2361 shiftCount = 0x42C - aExp;
2362 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2363 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2367 /*----------------------------------------------------------------------------
2368 | Returns the result of converting the double-precision floating-point value
2369 | `a' to the 32-bit two's complement integer format. The conversion is
2370 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2371 | Arithmetic, except that the conversion is always rounded toward zero.
2372 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2373 | the conversion overflows, the largest integer with the same sign as `a' is
2374 | returned.
2375 *----------------------------------------------------------------------------*/
2377 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2379 flag aSign;
2380 int16 aExp, shiftCount;
2381 bits64 aSig, savedASig;
2382 int32 z;
2384 aSig = extractFloat64Frac( a );
2385 aExp = extractFloat64Exp( a );
2386 aSign = extractFloat64Sign( a );
2387 if ( 0x41E < aExp ) {
2388 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2389 goto invalid;
2391 else if ( aExp < 0x3FF ) {
2392 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2393 return 0;
2395 aSig |= LIT64( 0x0010000000000000 );
2396 shiftCount = 0x433 - aExp;
2397 savedASig = aSig;
2398 aSig >>= shiftCount;
2399 z = aSig;
2400 if ( aSign ) z = - z;
2401 if ( ( z < 0 ) ^ aSign ) {
2402 invalid:
2403 float_raise( float_flag_invalid STATUS_VAR);
2404 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2406 if ( ( aSig<<shiftCount ) != savedASig ) {
2407 STATUS(float_exception_flags) |= float_flag_inexact;
2409 return z;
2413 /*----------------------------------------------------------------------------
2414 | Returns the result of converting the double-precision floating-point value
2415 | `a' to the 64-bit two's complement integer format. The conversion is
2416 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2417 | Arithmetic---which means in particular that the conversion is rounded
2418 | according to the current rounding mode. If `a' is a NaN, the largest
2419 | positive integer is returned. Otherwise, if the conversion overflows, the
2420 | largest integer with the same sign as `a' is returned.
2421 *----------------------------------------------------------------------------*/
2423 int64 float64_to_int64( float64 a STATUS_PARAM )
2425 flag aSign;
2426 int16 aExp, shiftCount;
2427 bits64 aSig, aSigExtra;
2429 aSig = extractFloat64Frac( a );
2430 aExp = extractFloat64Exp( a );
2431 aSign = extractFloat64Sign( a );
2432 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2433 shiftCount = 0x433 - aExp;
2434 if ( shiftCount <= 0 ) {
2435 if ( 0x43E < aExp ) {
2436 float_raise( float_flag_invalid STATUS_VAR);
2437 if ( ! aSign
2438 || ( ( aExp == 0x7FF )
2439 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2441 return LIT64( 0x7FFFFFFFFFFFFFFF );
2443 return (sbits64) LIT64( 0x8000000000000000 );
2445 aSigExtra = 0;
2446 aSig <<= - shiftCount;
2448 else {
2449 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2451 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2455 /*----------------------------------------------------------------------------
2456 | Returns the result of converting the double-precision floating-point value
2457 | `a' to the 64-bit two's complement integer format. The conversion is
2458 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2459 | Arithmetic, except that the conversion is always rounded toward zero.
2460 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2461 | the conversion overflows, the largest integer with the same sign as `a' is
2462 | returned.
2463 *----------------------------------------------------------------------------*/
2465 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2467 flag aSign;
2468 int16 aExp, shiftCount;
2469 bits64 aSig;
2470 int64 z;
2472 aSig = extractFloat64Frac( a );
2473 aExp = extractFloat64Exp( a );
2474 aSign = extractFloat64Sign( a );
2475 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2476 shiftCount = aExp - 0x433;
2477 if ( 0 <= shiftCount ) {
2478 if ( 0x43E <= aExp ) {
2479 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2480 float_raise( float_flag_invalid STATUS_VAR);
2481 if ( ! aSign
2482 || ( ( aExp == 0x7FF )
2483 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2485 return LIT64( 0x7FFFFFFFFFFFFFFF );
2488 return (sbits64) LIT64( 0x8000000000000000 );
2490 z = aSig<<shiftCount;
2492 else {
2493 if ( aExp < 0x3FE ) {
2494 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2495 return 0;
2497 z = aSig>>( - shiftCount );
2498 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2499 STATUS(float_exception_flags) |= float_flag_inexact;
2502 if ( aSign ) z = - z;
2503 return z;
2507 /*----------------------------------------------------------------------------
2508 | Returns the result of converting the double-precision floating-point value
2509 | `a' to the single-precision floating-point format. The conversion is
2510 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2511 | Arithmetic.
2512 *----------------------------------------------------------------------------*/
2514 float32 float64_to_float32( float64 a STATUS_PARAM )
2516 flag aSign;
2517 int16 aExp;
2518 bits64 aSig;
2519 bits32 zSig;
2521 aSig = extractFloat64Frac( a );
2522 aExp = extractFloat64Exp( a );
2523 aSign = extractFloat64Sign( a );
2524 if ( aExp == 0x7FF ) {
2525 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2526 return packFloat32( aSign, 0xFF, 0 );
2528 shift64RightJamming( aSig, 22, &aSig );
2529 zSig = aSig;
2530 if ( aExp || zSig ) {
2531 zSig |= 0x40000000;
2532 aExp -= 0x381;
2534 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2539 /*----------------------------------------------------------------------------
2540 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2541 | half-precision floating-point value, returning the result. After being
2542 | shifted into the proper positions, the three fields are simply added
2543 | together to form the result. This means that any integer portion of `zSig'
2544 | will be added into the exponent. Since a properly normalized significand
2545 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2546 | than the desired result exponent whenever `zSig' is a complete, normalized
2547 | significand.
2548 *----------------------------------------------------------------------------*/
2549 static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2551 return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2554 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2555 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2557 float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2559 flag aSign;
2560 int16 aExp;
2561 bits32 aSig;
2563 aSign = a >> 15;
2564 aExp = (a >> 10) & 0x1f;
2565 aSig = a & 0x3ff;
2567 if (aExp == 0x1f && ieee) {
2568 if (aSig) {
2569 /* Make sure correct exceptions are raised. */
2570 float32ToCommonNaN(a STATUS_VAR);
2571 aSig |= 0x200;
2573 return packFloat32(aSign, 0xff, aSig << 13);
2575 if (aExp == 0) {
2576 int8 shiftCount;
2578 if (aSig == 0) {
2579 return packFloat32(aSign, 0, 0);
2582 shiftCount = countLeadingZeros32( aSig ) - 21;
2583 aSig = aSig << shiftCount;
2584 aExp = -shiftCount;
2586 return packFloat32( aSign, aExp + 0x70, aSig << 13);
2589 bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
2591 flag aSign;
2592 int16 aExp;
2593 bits32 aSig;
2594 bits32 mask;
2595 bits32 increment;
2596 int8 roundingMode;
2598 aSig = extractFloat32Frac( a );
2599 aExp = extractFloat32Exp( a );
2600 aSign = extractFloat32Sign( a );
2601 if ( aExp == 0xFF ) {
2602 if (aSig) {
2603 /* Make sure correct exceptions are raised. */
2604 float32ToCommonNaN(a STATUS_VAR);
2605 aSig |= 0x00400000;
2607 return packFloat16(aSign, 0x1f, aSig >> 13);
2609 if (aExp == 0 && aSign == 0) {
2610 return packFloat16(aSign, 0, 0);
2612 /* Decimal point between bits 22 and 23. */
2613 aSig |= 0x00800000;
2614 aExp -= 0x7f;
2615 if (aExp < -14) {
2616 mask = 0x007fffff;
2617 if (aExp < -24) {
2618 aExp = -25;
2619 } else {
2620 mask >>= 24 + aExp;
2622 } else {
2623 mask = 0x00001fff;
2625 if (aSig & mask) {
2626 float_raise( float_flag_underflow STATUS_VAR );
2627 roundingMode = STATUS(float_rounding_mode);
2628 switch (roundingMode) {
2629 case float_round_nearest_even:
2630 increment = (mask + 1) >> 1;
2631 if ((aSig & mask) == increment) {
2632 increment = aSig & (increment << 1);
2634 break;
2635 case float_round_up:
2636 increment = aSign ? 0 : mask;
2637 break;
2638 case float_round_down:
2639 increment = aSign ? mask : 0;
2640 break;
2641 default: /* round_to_zero */
2642 increment = 0;
2643 break;
2645 aSig += increment;
2646 if (aSig >= 0x01000000) {
2647 aSig >>= 1;
2648 aExp++;
2650 } else if (aExp < -14
2651 && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2652 float_raise( float_flag_underflow STATUS_VAR);
2655 if (ieee) {
2656 if (aExp > 15) {
2657 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2658 return packFloat16(aSign, 0x1f, 0);
2660 } else {
2661 if (aExp > 16) {
2662 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2663 return packFloat16(aSign, 0x1f, 0x3ff);
2666 if (aExp < -24) {
2667 return packFloat16(aSign, 0, 0);
2669 if (aExp < -14) {
2670 aSig >>= -14 - aExp;
2671 aExp = -14;
2673 return packFloat16(aSign, aExp + 14, aSig >> 13);
2676 #ifdef FLOATX80
2678 /*----------------------------------------------------------------------------
2679 | Returns the result of converting the double-precision floating-point value
2680 | `a' to the extended double-precision floating-point format. The conversion
2681 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2682 | Arithmetic.
2683 *----------------------------------------------------------------------------*/
2685 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2687 flag aSign;
2688 int16 aExp;
2689 bits64 aSig;
2691 aSig = extractFloat64Frac( a );
2692 aExp = extractFloat64Exp( a );
2693 aSign = extractFloat64Sign( a );
2694 if ( aExp == 0x7FF ) {
2695 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2696 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2698 if ( aExp == 0 ) {
2699 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2700 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2702 return
2703 packFloatx80(
2704 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2708 #endif
2710 #ifdef FLOAT128
2712 /*----------------------------------------------------------------------------
2713 | Returns the result of converting the double-precision floating-point value
2714 | `a' to the quadruple-precision floating-point format. The conversion is
2715 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2716 | Arithmetic.
2717 *----------------------------------------------------------------------------*/
2719 float128 float64_to_float128( float64 a STATUS_PARAM )
2721 flag aSign;
2722 int16 aExp;
2723 bits64 aSig, zSig0, zSig1;
2725 aSig = extractFloat64Frac( a );
2726 aExp = extractFloat64Exp( a );
2727 aSign = extractFloat64Sign( a );
2728 if ( aExp == 0x7FF ) {
2729 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2730 return packFloat128( aSign, 0x7FFF, 0, 0 );
2732 if ( aExp == 0 ) {
2733 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2734 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2735 --aExp;
2737 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2738 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2742 #endif
2744 /*----------------------------------------------------------------------------
2745 | Rounds the double-precision floating-point value `a' to an integer, and
2746 | returns the result as a double-precision floating-point value. The
2747 | operation is performed according to the IEC/IEEE Standard for Binary
2748 | Floating-Point Arithmetic.
2749 *----------------------------------------------------------------------------*/
2751 float64 float64_round_to_int( float64 a STATUS_PARAM )
2753 flag aSign;
2754 int16 aExp;
2755 bits64 lastBitMask, roundBitsMask;
2756 int8 roundingMode;
2757 bits64 z;
2759 aExp = extractFloat64Exp( a );
2760 if ( 0x433 <= aExp ) {
2761 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2762 return propagateFloat64NaN( a, a STATUS_VAR );
2764 return a;
2766 if ( aExp < 0x3FF ) {
2767 if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2768 STATUS(float_exception_flags) |= float_flag_inexact;
2769 aSign = extractFloat64Sign( a );
2770 switch ( STATUS(float_rounding_mode) ) {
2771 case float_round_nearest_even:
2772 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2773 return packFloat64( aSign, 0x3FF, 0 );
2775 break;
2776 case float_round_down:
2777 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2778 case float_round_up:
2779 return make_float64(
2780 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2782 return packFloat64( aSign, 0, 0 );
2784 lastBitMask = 1;
2785 lastBitMask <<= 0x433 - aExp;
2786 roundBitsMask = lastBitMask - 1;
2787 z = float64_val(a);
2788 roundingMode = STATUS(float_rounding_mode);
2789 if ( roundingMode == float_round_nearest_even ) {
2790 z += lastBitMask>>1;
2791 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2793 else if ( roundingMode != float_round_to_zero ) {
2794 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2795 z += roundBitsMask;
2798 z &= ~ roundBitsMask;
2799 if ( z != float64_val(a) )
2800 STATUS(float_exception_flags) |= float_flag_inexact;
2801 return make_float64(z);
2805 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2807 int oldmode;
2808 float64 res;
2809 oldmode = STATUS(float_rounding_mode);
2810 STATUS(float_rounding_mode) = float_round_to_zero;
2811 res = float64_round_to_int(a STATUS_VAR);
2812 STATUS(float_rounding_mode) = oldmode;
2813 return res;
2816 /*----------------------------------------------------------------------------
2817 | Returns the result of adding the absolute values of the double-precision
2818 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2819 | before being returned. `zSign' is ignored if the result is a NaN.
2820 | The addition is performed according to the IEC/IEEE Standard for Binary
2821 | Floating-Point Arithmetic.
2822 *----------------------------------------------------------------------------*/
2824 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2826 int16 aExp, bExp, zExp;
2827 bits64 aSig, bSig, zSig;
2828 int16 expDiff;
2830 aSig = extractFloat64Frac( a );
2831 aExp = extractFloat64Exp( a );
2832 bSig = extractFloat64Frac( b );
2833 bExp = extractFloat64Exp( b );
2834 expDiff = aExp - bExp;
2835 aSig <<= 9;
2836 bSig <<= 9;
2837 if ( 0 < expDiff ) {
2838 if ( aExp == 0x7FF ) {
2839 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2840 return a;
2842 if ( bExp == 0 ) {
2843 --expDiff;
2845 else {
2846 bSig |= LIT64( 0x2000000000000000 );
2848 shift64RightJamming( bSig, expDiff, &bSig );
2849 zExp = aExp;
2851 else if ( expDiff < 0 ) {
2852 if ( bExp == 0x7FF ) {
2853 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2854 return packFloat64( zSign, 0x7FF, 0 );
2856 if ( aExp == 0 ) {
2857 ++expDiff;
2859 else {
2860 aSig |= LIT64( 0x2000000000000000 );
2862 shift64RightJamming( aSig, - expDiff, &aSig );
2863 zExp = bExp;
2865 else {
2866 if ( aExp == 0x7FF ) {
2867 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2868 return a;
2870 if ( aExp == 0 ) {
2871 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2872 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2874 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2875 zExp = aExp;
2876 goto roundAndPack;
2878 aSig |= LIT64( 0x2000000000000000 );
2879 zSig = ( aSig + bSig )<<1;
2880 --zExp;
2881 if ( (sbits64) zSig < 0 ) {
2882 zSig = aSig + bSig;
2883 ++zExp;
2885 roundAndPack:
2886 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2890 /*----------------------------------------------------------------------------
2891 | Returns the result of subtracting the absolute values of the double-
2892 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2893 | difference is negated before being returned. `zSign' is ignored if the
2894 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2895 | Standard for Binary Floating-Point Arithmetic.
2896 *----------------------------------------------------------------------------*/
2898 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2900 int16 aExp, bExp, zExp;
2901 bits64 aSig, bSig, zSig;
2902 int16 expDiff;
2904 aSig = extractFloat64Frac( a );
2905 aExp = extractFloat64Exp( a );
2906 bSig = extractFloat64Frac( b );
2907 bExp = extractFloat64Exp( b );
2908 expDiff = aExp - bExp;
2909 aSig <<= 10;
2910 bSig <<= 10;
2911 if ( 0 < expDiff ) goto aExpBigger;
2912 if ( expDiff < 0 ) goto bExpBigger;
2913 if ( aExp == 0x7FF ) {
2914 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2915 float_raise( float_flag_invalid STATUS_VAR);
2916 return float64_default_nan;
2918 if ( aExp == 0 ) {
2919 aExp = 1;
2920 bExp = 1;
2922 if ( bSig < aSig ) goto aBigger;
2923 if ( aSig < bSig ) goto bBigger;
2924 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2925 bExpBigger:
2926 if ( bExp == 0x7FF ) {
2927 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2928 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2930 if ( aExp == 0 ) {
2931 ++expDiff;
2933 else {
2934 aSig |= LIT64( 0x4000000000000000 );
2936 shift64RightJamming( aSig, - expDiff, &aSig );
2937 bSig |= LIT64( 0x4000000000000000 );
2938 bBigger:
2939 zSig = bSig - aSig;
2940 zExp = bExp;
2941 zSign ^= 1;
2942 goto normalizeRoundAndPack;
2943 aExpBigger:
2944 if ( aExp == 0x7FF ) {
2945 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2946 return a;
2948 if ( bExp == 0 ) {
2949 --expDiff;
2951 else {
2952 bSig |= LIT64( 0x4000000000000000 );
2954 shift64RightJamming( bSig, expDiff, &bSig );
2955 aSig |= LIT64( 0x4000000000000000 );
2956 aBigger:
2957 zSig = aSig - bSig;
2958 zExp = aExp;
2959 normalizeRoundAndPack:
2960 --zExp;
2961 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2965 /*----------------------------------------------------------------------------
2966 | Returns the result of adding the double-precision floating-point values `a'
2967 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2968 | Binary Floating-Point Arithmetic.
2969 *----------------------------------------------------------------------------*/
2971 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2973 flag aSign, bSign;
2975 aSign = extractFloat64Sign( a );
2976 bSign = extractFloat64Sign( b );
2977 if ( aSign == bSign ) {
2978 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2980 else {
2981 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2986 /*----------------------------------------------------------------------------
2987 | Returns the result of subtracting the double-precision floating-point values
2988 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2989 | for Binary Floating-Point Arithmetic.
2990 *----------------------------------------------------------------------------*/
2992 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2994 flag aSign, bSign;
2996 aSign = extractFloat64Sign( a );
2997 bSign = extractFloat64Sign( b );
2998 if ( aSign == bSign ) {
2999 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3001 else {
3002 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3007 /*----------------------------------------------------------------------------
3008 | Returns the result of multiplying the double-precision floating-point values
3009 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3010 | for Binary Floating-Point Arithmetic.
3011 *----------------------------------------------------------------------------*/
3013 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3015 flag aSign, bSign, zSign;
3016 int16 aExp, bExp, zExp;
3017 bits64 aSig, bSig, zSig0, zSig1;
3019 aSig = extractFloat64Frac( a );
3020 aExp = extractFloat64Exp( a );
3021 aSign = extractFloat64Sign( a );
3022 bSig = extractFloat64Frac( b );
3023 bExp = extractFloat64Exp( b );
3024 bSign = extractFloat64Sign( b );
3025 zSign = aSign ^ bSign;
3026 if ( aExp == 0x7FF ) {
3027 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3028 return propagateFloat64NaN( a, b STATUS_VAR );
3030 if ( ( bExp | bSig ) == 0 ) {
3031 float_raise( float_flag_invalid STATUS_VAR);
3032 return float64_default_nan;
3034 return packFloat64( zSign, 0x7FF, 0 );
3036 if ( bExp == 0x7FF ) {
3037 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3038 if ( ( aExp | aSig ) == 0 ) {
3039 float_raise( float_flag_invalid STATUS_VAR);
3040 return float64_default_nan;
3042 return packFloat64( zSign, 0x7FF, 0 );
3044 if ( aExp == 0 ) {
3045 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3046 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3048 if ( bExp == 0 ) {
3049 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3050 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3052 zExp = aExp + bExp - 0x3FF;
3053 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3054 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3055 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3056 zSig0 |= ( zSig1 != 0 );
3057 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3058 zSig0 <<= 1;
3059 --zExp;
3061 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3065 /*----------------------------------------------------------------------------
3066 | Returns the result of dividing the double-precision floating-point value `a'
3067 | by the corresponding value `b'. The operation is performed according to
3068 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3069 *----------------------------------------------------------------------------*/
3071 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3073 flag aSign, bSign, zSign;
3074 int16 aExp, bExp, zExp;
3075 bits64 aSig, bSig, zSig;
3076 bits64 rem0, rem1;
3077 bits64 term0, term1;
3079 aSig = extractFloat64Frac( a );
3080 aExp = extractFloat64Exp( a );
3081 aSign = extractFloat64Sign( a );
3082 bSig = extractFloat64Frac( b );
3083 bExp = extractFloat64Exp( b );
3084 bSign = extractFloat64Sign( b );
3085 zSign = aSign ^ bSign;
3086 if ( aExp == 0x7FF ) {
3087 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3088 if ( bExp == 0x7FF ) {
3089 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3090 float_raise( float_flag_invalid STATUS_VAR);
3091 return float64_default_nan;
3093 return packFloat64( zSign, 0x7FF, 0 );
3095 if ( bExp == 0x7FF ) {
3096 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3097 return packFloat64( zSign, 0, 0 );
3099 if ( bExp == 0 ) {
3100 if ( bSig == 0 ) {
3101 if ( ( aExp | aSig ) == 0 ) {
3102 float_raise( float_flag_invalid STATUS_VAR);
3103 return float64_default_nan;
3105 float_raise( float_flag_divbyzero STATUS_VAR);
3106 return packFloat64( zSign, 0x7FF, 0 );
3108 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3110 if ( aExp == 0 ) {
3111 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3112 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3114 zExp = aExp - bExp + 0x3FD;
3115 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3116 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3117 if ( bSig <= ( aSig + aSig ) ) {
3118 aSig >>= 1;
3119 ++zExp;
3121 zSig = estimateDiv128To64( aSig, 0, bSig );
3122 if ( ( zSig & 0x1FF ) <= 2 ) {
3123 mul64To128( bSig, zSig, &term0, &term1 );
3124 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3125 while ( (sbits64) rem0 < 0 ) {
3126 --zSig;
3127 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3129 zSig |= ( rem1 != 0 );
3131 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3135 /*----------------------------------------------------------------------------
3136 | Returns the remainder of the double-precision floating-point value `a'
3137 | with respect to the corresponding value `b'. The operation is performed
3138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3139 *----------------------------------------------------------------------------*/
3141 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3143 flag aSign, zSign;
3144 int16 aExp, bExp, expDiff;
3145 bits64 aSig, bSig;
3146 bits64 q, alternateASig;
3147 sbits64 sigMean;
3149 aSig = extractFloat64Frac( a );
3150 aExp = extractFloat64Exp( a );
3151 aSign = extractFloat64Sign( a );
3152 bSig = extractFloat64Frac( b );
3153 bExp = extractFloat64Exp( b );
3154 if ( aExp == 0x7FF ) {
3155 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3156 return propagateFloat64NaN( a, b STATUS_VAR );
3158 float_raise( float_flag_invalid STATUS_VAR);
3159 return float64_default_nan;
3161 if ( bExp == 0x7FF ) {
3162 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3163 return a;
3165 if ( bExp == 0 ) {
3166 if ( bSig == 0 ) {
3167 float_raise( float_flag_invalid STATUS_VAR);
3168 return float64_default_nan;
3170 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3172 if ( aExp == 0 ) {
3173 if ( aSig == 0 ) return a;
3174 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3176 expDiff = aExp - bExp;
3177 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3178 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3179 if ( expDiff < 0 ) {
3180 if ( expDiff < -1 ) return a;
3181 aSig >>= 1;
3183 q = ( bSig <= aSig );
3184 if ( q ) aSig -= bSig;
3185 expDiff -= 64;
3186 while ( 0 < expDiff ) {
3187 q = estimateDiv128To64( aSig, 0, bSig );
3188 q = ( 2 < q ) ? q - 2 : 0;
3189 aSig = - ( ( bSig>>2 ) * q );
3190 expDiff -= 62;
3192 expDiff += 64;
3193 if ( 0 < expDiff ) {
3194 q = estimateDiv128To64( aSig, 0, bSig );
3195 q = ( 2 < q ) ? q - 2 : 0;
3196 q >>= 64 - expDiff;
3197 bSig >>= 2;
3198 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3200 else {
3201 aSig >>= 2;
3202 bSig >>= 2;
3204 do {
3205 alternateASig = aSig;
3206 ++q;
3207 aSig -= bSig;
3208 } while ( 0 <= (sbits64) aSig );
3209 sigMean = aSig + alternateASig;
3210 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3211 aSig = alternateASig;
3213 zSign = ( (sbits64) aSig < 0 );
3214 if ( zSign ) aSig = - aSig;
3215 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3219 /*----------------------------------------------------------------------------
3220 | Returns the square root of the double-precision floating-point value `a'.
3221 | The operation is performed according to the IEC/IEEE Standard for Binary
3222 | Floating-Point Arithmetic.
3223 *----------------------------------------------------------------------------*/
3225 float64 float64_sqrt( float64 a STATUS_PARAM )
3227 flag aSign;
3228 int16 aExp, zExp;
3229 bits64 aSig, zSig, doubleZSig;
3230 bits64 rem0, rem1, term0, term1;
3232 aSig = extractFloat64Frac( a );
3233 aExp = extractFloat64Exp( a );
3234 aSign = extractFloat64Sign( a );
3235 if ( aExp == 0x7FF ) {
3236 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3237 if ( ! aSign ) return a;
3238 float_raise( float_flag_invalid STATUS_VAR);
3239 return float64_default_nan;
3241 if ( aSign ) {
3242 if ( ( aExp | aSig ) == 0 ) return a;
3243 float_raise( float_flag_invalid STATUS_VAR);
3244 return float64_default_nan;
3246 if ( aExp == 0 ) {
3247 if ( aSig == 0 ) return float64_zero;
3248 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3250 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3251 aSig |= LIT64( 0x0010000000000000 );
3252 zSig = estimateSqrt32( aExp, aSig>>21 );
3253 aSig <<= 9 - ( aExp & 1 );
3254 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3255 if ( ( zSig & 0x1FF ) <= 5 ) {
3256 doubleZSig = zSig<<1;
3257 mul64To128( zSig, zSig, &term0, &term1 );
3258 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3259 while ( (sbits64) rem0 < 0 ) {
3260 --zSig;
3261 doubleZSig -= 2;
3262 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3264 zSig |= ( ( rem0 | rem1 ) != 0 );
3266 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3270 /*----------------------------------------------------------------------------
3271 | Returns the binary log of the double-precision floating-point value `a'.
3272 | The operation is performed according to the IEC/IEEE Standard for Binary
3273 | Floating-Point Arithmetic.
3274 *----------------------------------------------------------------------------*/
3275 float64 float64_log2( float64 a STATUS_PARAM )
3277 flag aSign, zSign;
3278 int16 aExp;
3279 bits64 aSig, aSig0, aSig1, zSig, i;
3281 aSig = extractFloat64Frac( a );
3282 aExp = extractFloat64Exp( a );
3283 aSign = extractFloat64Sign( a );
3285 if ( aExp == 0 ) {
3286 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3287 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3289 if ( aSign ) {
3290 float_raise( float_flag_invalid STATUS_VAR);
3291 return float64_default_nan;
3293 if ( aExp == 0x7FF ) {
3294 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3295 return a;
3298 aExp -= 0x3FF;
3299 aSig |= LIT64( 0x0010000000000000 );
3300 zSign = aExp < 0;
3301 zSig = (bits64)aExp << 52;
3302 for (i = 1LL << 51; i > 0; i >>= 1) {
3303 mul64To128( aSig, aSig, &aSig0, &aSig1 );
3304 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3305 if ( aSig & LIT64( 0x0020000000000000 ) ) {
3306 aSig >>= 1;
3307 zSig |= i;
3311 if ( zSign )
3312 zSig = -zSig;
3313 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3316 /*----------------------------------------------------------------------------
3317 | Returns 1 if the double-precision floating-point value `a' is equal to the
3318 | corresponding value `b', and 0 otherwise. The comparison is performed
3319 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3320 *----------------------------------------------------------------------------*/
3322 int float64_eq( float64 a, float64 b STATUS_PARAM )
3324 bits64 av, bv;
3326 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3327 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3329 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3330 float_raise( float_flag_invalid STATUS_VAR);
3332 return 0;
3334 av = float64_val(a);
3335 bv = float64_val(b);
3336 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3340 /*----------------------------------------------------------------------------
3341 | Returns 1 if the double-precision floating-point value `a' is less than or
3342 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3343 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3344 | Arithmetic.
3345 *----------------------------------------------------------------------------*/
3347 int float64_le( float64 a, float64 b STATUS_PARAM )
3349 flag aSign, bSign;
3350 bits64 av, bv;
3352 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3353 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3355 float_raise( float_flag_invalid STATUS_VAR);
3356 return 0;
3358 aSign = extractFloat64Sign( a );
3359 bSign = extractFloat64Sign( b );
3360 av = float64_val(a);
3361 bv = float64_val(b);
3362 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3363 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3367 /*----------------------------------------------------------------------------
3368 | Returns 1 if the double-precision floating-point value `a' is less than
3369 | the corresponding value `b', and 0 otherwise. The comparison is performed
3370 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3371 *----------------------------------------------------------------------------*/
3373 int float64_lt( float64 a, float64 b STATUS_PARAM )
3375 flag aSign, bSign;
3376 bits64 av, bv;
3378 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3379 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3381 float_raise( float_flag_invalid STATUS_VAR);
3382 return 0;
3384 aSign = extractFloat64Sign( a );
3385 bSign = extractFloat64Sign( b );
3386 av = float64_val(a);
3387 bv = float64_val(b);
3388 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3389 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3393 /*----------------------------------------------------------------------------
3394 | Returns 1 if the double-precision floating-point value `a' is equal to the
3395 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3396 | if either operand is a NaN. Otherwise, the comparison is performed
3397 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3398 *----------------------------------------------------------------------------*/
3400 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3402 bits64 av, bv;
3404 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3405 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3407 float_raise( float_flag_invalid STATUS_VAR);
3408 return 0;
3410 av = float64_val(a);
3411 bv = float64_val(b);
3412 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3416 /*----------------------------------------------------------------------------
3417 | Returns 1 if the double-precision floating-point value `a' is less than or
3418 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3419 | cause an exception. Otherwise, the comparison is performed according to the
3420 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3421 *----------------------------------------------------------------------------*/
3423 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3425 flag aSign, bSign;
3426 bits64 av, bv;
3428 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3429 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3431 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3432 float_raise( float_flag_invalid STATUS_VAR);
3434 return 0;
3436 aSign = extractFloat64Sign( a );
3437 bSign = extractFloat64Sign( b );
3438 av = float64_val(a);
3439 bv = float64_val(b);
3440 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3441 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3445 /*----------------------------------------------------------------------------
3446 | Returns 1 if the double-precision floating-point value `a' is less than
3447 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3448 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3449 | Standard for Binary Floating-Point Arithmetic.
3450 *----------------------------------------------------------------------------*/
3452 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3454 flag aSign, bSign;
3455 bits64 av, bv;
3457 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3458 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3460 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3461 float_raise( float_flag_invalid STATUS_VAR);
3463 return 0;
3465 aSign = extractFloat64Sign( a );
3466 bSign = extractFloat64Sign( b );
3467 av = float64_val(a);
3468 bv = float64_val(b);
3469 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3470 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3474 #ifdef FLOATX80
3476 /*----------------------------------------------------------------------------
3477 | Returns the result of converting the extended double-precision floating-
3478 | point value `a' to the 32-bit two's complement integer format. The
3479 | conversion is performed according to the IEC/IEEE Standard for Binary
3480 | Floating-Point Arithmetic---which means in particular that the conversion
3481 | is rounded according to the current rounding mode. If `a' is a NaN, the
3482 | largest positive integer is returned. Otherwise, if the conversion
3483 | overflows, the largest integer with the same sign as `a' is returned.
3484 *----------------------------------------------------------------------------*/
3486 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3488 flag aSign;
3489 int32 aExp, shiftCount;
3490 bits64 aSig;
3492 aSig = extractFloatx80Frac( a );
3493 aExp = extractFloatx80Exp( a );
3494 aSign = extractFloatx80Sign( a );
3495 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3496 shiftCount = 0x4037 - aExp;
3497 if ( shiftCount <= 0 ) shiftCount = 1;
3498 shift64RightJamming( aSig, shiftCount, &aSig );
3499 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3503 /*----------------------------------------------------------------------------
3504 | Returns the result of converting the extended double-precision floating-
3505 | point value `a' to the 32-bit two's complement integer format. The
3506 | conversion is performed according to the IEC/IEEE Standard for Binary
3507 | Floating-Point Arithmetic, except that the conversion is always rounded
3508 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3509 | Otherwise, if the conversion overflows, the largest integer with the same
3510 | sign as `a' is returned.
3511 *----------------------------------------------------------------------------*/
3513 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3515 flag aSign;
3516 int32 aExp, shiftCount;
3517 bits64 aSig, savedASig;
3518 int32 z;
3520 aSig = extractFloatx80Frac( a );
3521 aExp = extractFloatx80Exp( a );
3522 aSign = extractFloatx80Sign( a );
3523 if ( 0x401E < aExp ) {
3524 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3525 goto invalid;
3527 else if ( aExp < 0x3FFF ) {
3528 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3529 return 0;
3531 shiftCount = 0x403E - aExp;
3532 savedASig = aSig;
3533 aSig >>= shiftCount;
3534 z = aSig;
3535 if ( aSign ) z = - z;
3536 if ( ( z < 0 ) ^ aSign ) {
3537 invalid:
3538 float_raise( float_flag_invalid STATUS_VAR);
3539 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3541 if ( ( aSig<<shiftCount ) != savedASig ) {
3542 STATUS(float_exception_flags) |= float_flag_inexact;
3544 return z;
3548 /*----------------------------------------------------------------------------
3549 | Returns the result of converting the extended double-precision floating-
3550 | point value `a' to the 64-bit two's complement integer format. The
3551 | conversion is performed according to the IEC/IEEE Standard for Binary
3552 | Floating-Point Arithmetic---which means in particular that the conversion
3553 | is rounded according to the current rounding mode. If `a' is a NaN,
3554 | the largest positive integer is returned. Otherwise, if the conversion
3555 | overflows, the largest integer with the same sign as `a' is returned.
3556 *----------------------------------------------------------------------------*/
3558 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3560 flag aSign;
3561 int32 aExp, shiftCount;
3562 bits64 aSig, aSigExtra;
3564 aSig = extractFloatx80Frac( a );
3565 aExp = extractFloatx80Exp( a );
3566 aSign = extractFloatx80Sign( a );
3567 shiftCount = 0x403E - aExp;
3568 if ( shiftCount <= 0 ) {
3569 if ( shiftCount ) {
3570 float_raise( float_flag_invalid STATUS_VAR);
3571 if ( ! aSign
3572 || ( ( aExp == 0x7FFF )
3573 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3575 return LIT64( 0x7FFFFFFFFFFFFFFF );
3577 return (sbits64) LIT64( 0x8000000000000000 );
3579 aSigExtra = 0;
3581 else {
3582 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3584 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3588 /*----------------------------------------------------------------------------
3589 | Returns the result of converting the extended double-precision floating-
3590 | point value `a' to the 64-bit two's complement integer format. The
3591 | conversion is performed according to the IEC/IEEE Standard for Binary
3592 | Floating-Point Arithmetic, except that the conversion is always rounded
3593 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3594 | Otherwise, if the conversion overflows, the largest integer with the same
3595 | sign as `a' is returned.
3596 *----------------------------------------------------------------------------*/
3598 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3600 flag aSign;
3601 int32 aExp, shiftCount;
3602 bits64 aSig;
3603 int64 z;
3605 aSig = extractFloatx80Frac( a );
3606 aExp = extractFloatx80Exp( a );
3607 aSign = extractFloatx80Sign( a );
3608 shiftCount = aExp - 0x403E;
3609 if ( 0 <= shiftCount ) {
3610 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3611 if ( ( a.high != 0xC03E ) || aSig ) {
3612 float_raise( float_flag_invalid STATUS_VAR);
3613 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3614 return LIT64( 0x7FFFFFFFFFFFFFFF );
3617 return (sbits64) LIT64( 0x8000000000000000 );
3619 else if ( aExp < 0x3FFF ) {
3620 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3621 return 0;
3623 z = aSig>>( - shiftCount );
3624 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3625 STATUS(float_exception_flags) |= float_flag_inexact;
3627 if ( aSign ) z = - z;
3628 return z;
3632 /*----------------------------------------------------------------------------
3633 | Returns the result of converting the extended double-precision floating-
3634 | point value `a' to the single-precision floating-point format. The
3635 | conversion is performed according to the IEC/IEEE Standard for Binary
3636 | Floating-Point Arithmetic.
3637 *----------------------------------------------------------------------------*/
3639 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3641 flag aSign;
3642 int32 aExp;
3643 bits64 aSig;
3645 aSig = extractFloatx80Frac( a );
3646 aExp = extractFloatx80Exp( a );
3647 aSign = extractFloatx80Sign( a );
3648 if ( aExp == 0x7FFF ) {
3649 if ( (bits64) ( aSig<<1 ) ) {
3650 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3652 return packFloat32( aSign, 0xFF, 0 );
3654 shift64RightJamming( aSig, 33, &aSig );
3655 if ( aExp || aSig ) aExp -= 0x3F81;
3656 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3660 /*----------------------------------------------------------------------------
3661 | Returns the result of converting the extended double-precision floating-
3662 | point value `a' to the double-precision floating-point format. The
3663 | conversion is performed according to the IEC/IEEE Standard for Binary
3664 | Floating-Point Arithmetic.
3665 *----------------------------------------------------------------------------*/
3667 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3669 flag aSign;
3670 int32 aExp;
3671 bits64 aSig, zSig;
3673 aSig = extractFloatx80Frac( a );
3674 aExp = extractFloatx80Exp( a );
3675 aSign = extractFloatx80Sign( a );
3676 if ( aExp == 0x7FFF ) {
3677 if ( (bits64) ( aSig<<1 ) ) {
3678 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3680 return packFloat64( aSign, 0x7FF, 0 );
3682 shift64RightJamming( aSig, 1, &zSig );
3683 if ( aExp || aSig ) aExp -= 0x3C01;
3684 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3688 #ifdef FLOAT128
3690 /*----------------------------------------------------------------------------
3691 | Returns the result of converting the extended double-precision floating-
3692 | point value `a' to the quadruple-precision floating-point format. The
3693 | conversion is performed according to the IEC/IEEE Standard for Binary
3694 | Floating-Point Arithmetic.
3695 *----------------------------------------------------------------------------*/
3697 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3699 flag aSign;
3700 int16 aExp;
3701 bits64 aSig, zSig0, zSig1;
3703 aSig = extractFloatx80Frac( a );
3704 aExp = extractFloatx80Exp( a );
3705 aSign = extractFloatx80Sign( a );
3706 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3707 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3709 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3710 return packFloat128( aSign, aExp, zSig0, zSig1 );
3714 #endif
3716 /*----------------------------------------------------------------------------
3717 | Rounds the extended double-precision floating-point value `a' to an integer,
3718 | and returns the result as an extended quadruple-precision floating-point
3719 | value. The operation is performed according to the IEC/IEEE Standard for
3720 | Binary Floating-Point Arithmetic.
3721 *----------------------------------------------------------------------------*/
3723 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3725 flag aSign;
3726 int32 aExp;
3727 bits64 lastBitMask, roundBitsMask;
3728 int8 roundingMode;
3729 floatx80 z;
3731 aExp = extractFloatx80Exp( a );
3732 if ( 0x403E <= aExp ) {
3733 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3734 return propagateFloatx80NaN( a, a STATUS_VAR );
3736 return a;
3738 if ( aExp < 0x3FFF ) {
3739 if ( ( aExp == 0 )
3740 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3741 return a;
3743 STATUS(float_exception_flags) |= float_flag_inexact;
3744 aSign = extractFloatx80Sign( a );
3745 switch ( STATUS(float_rounding_mode) ) {
3746 case float_round_nearest_even:
3747 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3749 return
3750 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3752 break;
3753 case float_round_down:
3754 return
3755 aSign ?
3756 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3757 : packFloatx80( 0, 0, 0 );
3758 case float_round_up:
3759 return
3760 aSign ? packFloatx80( 1, 0, 0 )
3761 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3763 return packFloatx80( aSign, 0, 0 );
3765 lastBitMask = 1;
3766 lastBitMask <<= 0x403E - aExp;
3767 roundBitsMask = lastBitMask - 1;
3768 z = a;
3769 roundingMode = STATUS(float_rounding_mode);
3770 if ( roundingMode == float_round_nearest_even ) {
3771 z.low += lastBitMask>>1;
3772 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3774 else if ( roundingMode != float_round_to_zero ) {
3775 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3776 z.low += roundBitsMask;
3779 z.low &= ~ roundBitsMask;
3780 if ( z.low == 0 ) {
3781 ++z.high;
3782 z.low = LIT64( 0x8000000000000000 );
3784 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3785 return z;
3789 /*----------------------------------------------------------------------------
3790 | Returns the result of adding the absolute values of the extended double-
3791 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3792 | negated before being returned. `zSign' is ignored if the result is a NaN.
3793 | The addition is performed according to the IEC/IEEE Standard for Binary
3794 | Floating-Point Arithmetic.
3795 *----------------------------------------------------------------------------*/
3797 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3799 int32 aExp, bExp, zExp;
3800 bits64 aSig, bSig, zSig0, zSig1;
3801 int32 expDiff;
3803 aSig = extractFloatx80Frac( a );
3804 aExp = extractFloatx80Exp( a );
3805 bSig = extractFloatx80Frac( b );
3806 bExp = extractFloatx80Exp( b );
3807 expDiff = aExp - bExp;
3808 if ( 0 < expDiff ) {
3809 if ( aExp == 0x7FFF ) {
3810 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3811 return a;
3813 if ( bExp == 0 ) --expDiff;
3814 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3815 zExp = aExp;
3817 else if ( expDiff < 0 ) {
3818 if ( bExp == 0x7FFF ) {
3819 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3820 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3822 if ( aExp == 0 ) ++expDiff;
3823 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3824 zExp = bExp;
3826 else {
3827 if ( aExp == 0x7FFF ) {
3828 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3829 return propagateFloatx80NaN( a, b STATUS_VAR );
3831 return a;
3833 zSig1 = 0;
3834 zSig0 = aSig + bSig;
3835 if ( aExp == 0 ) {
3836 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3837 goto roundAndPack;
3839 zExp = aExp;
3840 goto shiftRight1;
3842 zSig0 = aSig + bSig;
3843 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3844 shiftRight1:
3845 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3846 zSig0 |= LIT64( 0x8000000000000000 );
3847 ++zExp;
3848 roundAndPack:
3849 return
3850 roundAndPackFloatx80(
3851 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3855 /*----------------------------------------------------------------------------
3856 | Returns the result of subtracting the absolute values of the extended
3857 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3858 | difference is negated before being returned. `zSign' is ignored if the
3859 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3860 | Standard for Binary Floating-Point Arithmetic.
3861 *----------------------------------------------------------------------------*/
3863 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3865 int32 aExp, bExp, zExp;
3866 bits64 aSig, bSig, zSig0, zSig1;
3867 int32 expDiff;
3868 floatx80 z;
3870 aSig = extractFloatx80Frac( a );
3871 aExp = extractFloatx80Exp( a );
3872 bSig = extractFloatx80Frac( b );
3873 bExp = extractFloatx80Exp( b );
3874 expDiff = aExp - bExp;
3875 if ( 0 < expDiff ) goto aExpBigger;
3876 if ( expDiff < 0 ) goto bExpBigger;
3877 if ( aExp == 0x7FFF ) {
3878 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3879 return propagateFloatx80NaN( a, b STATUS_VAR );
3881 float_raise( float_flag_invalid STATUS_VAR);
3882 z.low = floatx80_default_nan_low;
3883 z.high = floatx80_default_nan_high;
3884 return z;
3886 if ( aExp == 0 ) {
3887 aExp = 1;
3888 bExp = 1;
3890 zSig1 = 0;
3891 if ( bSig < aSig ) goto aBigger;
3892 if ( aSig < bSig ) goto bBigger;
3893 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3894 bExpBigger:
3895 if ( bExp == 0x7FFF ) {
3896 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3897 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3899 if ( aExp == 0 ) ++expDiff;
3900 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3901 bBigger:
3902 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3903 zExp = bExp;
3904 zSign ^= 1;
3905 goto normalizeRoundAndPack;
3906 aExpBigger:
3907 if ( aExp == 0x7FFF ) {
3908 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3909 return a;
3911 if ( bExp == 0 ) --expDiff;
3912 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3913 aBigger:
3914 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3915 zExp = aExp;
3916 normalizeRoundAndPack:
3917 return
3918 normalizeRoundAndPackFloatx80(
3919 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3923 /*----------------------------------------------------------------------------
3924 | Returns the result of adding the extended double-precision floating-point
3925 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3926 | Standard for Binary Floating-Point Arithmetic.
3927 *----------------------------------------------------------------------------*/
3929 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3931 flag aSign, bSign;
3933 aSign = extractFloatx80Sign( a );
3934 bSign = extractFloatx80Sign( b );
3935 if ( aSign == bSign ) {
3936 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3938 else {
3939 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3944 /*----------------------------------------------------------------------------
3945 | Returns the result of subtracting the extended double-precision floating-
3946 | point values `a' and `b'. The operation is performed according to the
3947 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3948 *----------------------------------------------------------------------------*/
3950 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3952 flag aSign, bSign;
3954 aSign = extractFloatx80Sign( a );
3955 bSign = extractFloatx80Sign( b );
3956 if ( aSign == bSign ) {
3957 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3959 else {
3960 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3965 /*----------------------------------------------------------------------------
3966 | Returns the result of multiplying the extended double-precision floating-
3967 | point values `a' and `b'. The operation is performed according to the
3968 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3969 *----------------------------------------------------------------------------*/
3971 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3973 flag aSign, bSign, zSign;
3974 int32 aExp, bExp, zExp;
3975 bits64 aSig, bSig, zSig0, zSig1;
3976 floatx80 z;
3978 aSig = extractFloatx80Frac( a );
3979 aExp = extractFloatx80Exp( a );
3980 aSign = extractFloatx80Sign( a );
3981 bSig = extractFloatx80Frac( b );
3982 bExp = extractFloatx80Exp( b );
3983 bSign = extractFloatx80Sign( b );
3984 zSign = aSign ^ bSign;
3985 if ( aExp == 0x7FFF ) {
3986 if ( (bits64) ( aSig<<1 )
3987 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3988 return propagateFloatx80NaN( a, b STATUS_VAR );
3990 if ( ( bExp | bSig ) == 0 ) goto invalid;
3991 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3993 if ( bExp == 0x7FFF ) {
3994 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3995 if ( ( aExp | aSig ) == 0 ) {
3996 invalid:
3997 float_raise( float_flag_invalid STATUS_VAR);
3998 z.low = floatx80_default_nan_low;
3999 z.high = floatx80_default_nan_high;
4000 return z;
4002 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4004 if ( aExp == 0 ) {
4005 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4006 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4008 if ( bExp == 0 ) {
4009 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4010 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4012 zExp = aExp + bExp - 0x3FFE;
4013 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4014 if ( 0 < (sbits64) zSig0 ) {
4015 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4016 --zExp;
4018 return
4019 roundAndPackFloatx80(
4020 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4024 /*----------------------------------------------------------------------------
4025 | Returns the result of dividing the extended double-precision floating-point
4026 | value `a' by the corresponding value `b'. The operation is performed
4027 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4028 *----------------------------------------------------------------------------*/
4030 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4032 flag aSign, bSign, zSign;
4033 int32 aExp, bExp, zExp;
4034 bits64 aSig, bSig, zSig0, zSig1;
4035 bits64 rem0, rem1, rem2, term0, term1, term2;
4036 floatx80 z;
4038 aSig = extractFloatx80Frac( a );
4039 aExp = extractFloatx80Exp( a );
4040 aSign = extractFloatx80Sign( a );
4041 bSig = extractFloatx80Frac( b );
4042 bExp = extractFloatx80Exp( b );
4043 bSign = extractFloatx80Sign( b );
4044 zSign = aSign ^ bSign;
4045 if ( aExp == 0x7FFF ) {
4046 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4047 if ( bExp == 0x7FFF ) {
4048 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4049 goto invalid;
4051 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4053 if ( bExp == 0x7FFF ) {
4054 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4055 return packFloatx80( zSign, 0, 0 );
4057 if ( bExp == 0 ) {
4058 if ( bSig == 0 ) {
4059 if ( ( aExp | aSig ) == 0 ) {
4060 invalid:
4061 float_raise( float_flag_invalid STATUS_VAR);
4062 z.low = floatx80_default_nan_low;
4063 z.high = floatx80_default_nan_high;
4064 return z;
4066 float_raise( float_flag_divbyzero STATUS_VAR);
4067 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4069 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4071 if ( aExp == 0 ) {
4072 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4073 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4075 zExp = aExp - bExp + 0x3FFE;
4076 rem1 = 0;
4077 if ( bSig <= aSig ) {
4078 shift128Right( aSig, 0, 1, &aSig, &rem1 );
4079 ++zExp;
4081 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4082 mul64To128( bSig, zSig0, &term0, &term1 );
4083 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4084 while ( (sbits64) rem0 < 0 ) {
4085 --zSig0;
4086 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4088 zSig1 = estimateDiv128To64( rem1, 0, bSig );
4089 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4090 mul64To128( bSig, zSig1, &term1, &term2 );
4091 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4092 while ( (sbits64) rem1 < 0 ) {
4093 --zSig1;
4094 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4096 zSig1 |= ( ( rem1 | rem2 ) != 0 );
4098 return
4099 roundAndPackFloatx80(
4100 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4104 /*----------------------------------------------------------------------------
4105 | Returns the remainder of the extended double-precision floating-point value
4106 | `a' with respect to the corresponding value `b'. The operation is performed
4107 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4108 *----------------------------------------------------------------------------*/
4110 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4112 flag aSign, zSign;
4113 int32 aExp, bExp, expDiff;
4114 bits64 aSig0, aSig1, bSig;
4115 bits64 q, term0, term1, alternateASig0, alternateASig1;
4116 floatx80 z;
4118 aSig0 = extractFloatx80Frac( a );
4119 aExp = extractFloatx80Exp( a );
4120 aSign = extractFloatx80Sign( a );
4121 bSig = extractFloatx80Frac( b );
4122 bExp = extractFloatx80Exp( b );
4123 if ( aExp == 0x7FFF ) {
4124 if ( (bits64) ( aSig0<<1 )
4125 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4126 return propagateFloatx80NaN( a, b STATUS_VAR );
4128 goto invalid;
4130 if ( bExp == 0x7FFF ) {
4131 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4132 return a;
4134 if ( bExp == 0 ) {
4135 if ( bSig == 0 ) {
4136 invalid:
4137 float_raise( float_flag_invalid STATUS_VAR);
4138 z.low = floatx80_default_nan_low;
4139 z.high = floatx80_default_nan_high;
4140 return z;
4142 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4144 if ( aExp == 0 ) {
4145 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4146 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4148 bSig |= LIT64( 0x8000000000000000 );
4149 zSign = aSign;
4150 expDiff = aExp - bExp;
4151 aSig1 = 0;
4152 if ( expDiff < 0 ) {
4153 if ( expDiff < -1 ) return a;
4154 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4155 expDiff = 0;
4157 q = ( bSig <= aSig0 );
4158 if ( q ) aSig0 -= bSig;
4159 expDiff -= 64;
4160 while ( 0 < expDiff ) {
4161 q = estimateDiv128To64( aSig0, aSig1, bSig );
4162 q = ( 2 < q ) ? q - 2 : 0;
4163 mul64To128( bSig, q, &term0, &term1 );
4164 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4165 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4166 expDiff -= 62;
4168 expDiff += 64;
4169 if ( 0 < expDiff ) {
4170 q = estimateDiv128To64( aSig0, aSig1, bSig );
4171 q = ( 2 < q ) ? q - 2 : 0;
4172 q >>= 64 - expDiff;
4173 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4174 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4175 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4176 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4177 ++q;
4178 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4181 else {
4182 term1 = 0;
4183 term0 = bSig;
4185 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4186 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4187 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4188 && ( q & 1 ) )
4190 aSig0 = alternateASig0;
4191 aSig1 = alternateASig1;
4192 zSign = ! zSign;
4194 return
4195 normalizeRoundAndPackFloatx80(
4196 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4200 /*----------------------------------------------------------------------------
4201 | Returns the square root of the extended double-precision floating-point
4202 | value `a'. The operation is performed according to the IEC/IEEE Standard
4203 | for Binary Floating-Point Arithmetic.
4204 *----------------------------------------------------------------------------*/
4206 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4208 flag aSign;
4209 int32 aExp, zExp;
4210 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4211 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4212 floatx80 z;
4214 aSig0 = extractFloatx80Frac( a );
4215 aExp = extractFloatx80Exp( a );
4216 aSign = extractFloatx80Sign( a );
4217 if ( aExp == 0x7FFF ) {
4218 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4219 if ( ! aSign ) return a;
4220 goto invalid;
4222 if ( aSign ) {
4223 if ( ( aExp | aSig0 ) == 0 ) return a;
4224 invalid:
4225 float_raise( float_flag_invalid STATUS_VAR);
4226 z.low = floatx80_default_nan_low;
4227 z.high = floatx80_default_nan_high;
4228 return z;
4230 if ( aExp == 0 ) {
4231 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4232 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4234 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4235 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4236 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4237 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4238 doubleZSig0 = zSig0<<1;
4239 mul64To128( zSig0, zSig0, &term0, &term1 );
4240 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4241 while ( (sbits64) rem0 < 0 ) {
4242 --zSig0;
4243 doubleZSig0 -= 2;
4244 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4246 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4247 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4248 if ( zSig1 == 0 ) zSig1 = 1;
4249 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4250 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4251 mul64To128( zSig1, zSig1, &term2, &term3 );
4252 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4253 while ( (sbits64) rem1 < 0 ) {
4254 --zSig1;
4255 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4256 term3 |= 1;
4257 term2 |= doubleZSig0;
4258 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4260 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4262 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4263 zSig0 |= doubleZSig0;
4264 return
4265 roundAndPackFloatx80(
4266 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4270 /*----------------------------------------------------------------------------
4271 | Returns 1 if the extended double-precision floating-point value `a' is
4272 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4273 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4274 | Arithmetic.
4275 *----------------------------------------------------------------------------*/
4277 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4280 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4281 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4282 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4283 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4285 if ( floatx80_is_signaling_nan( a )
4286 || floatx80_is_signaling_nan( b ) ) {
4287 float_raise( float_flag_invalid STATUS_VAR);
4289 return 0;
4291 return
4292 ( a.low == b.low )
4293 && ( ( a.high == b.high )
4294 || ( ( a.low == 0 )
4295 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4300 /*----------------------------------------------------------------------------
4301 | Returns 1 if the extended double-precision floating-point value `a' is
4302 | less than or equal to the corresponding value `b', and 0 otherwise. The
4303 | comparison is performed according to the IEC/IEEE Standard for Binary
4304 | Floating-Point Arithmetic.
4305 *----------------------------------------------------------------------------*/
4307 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4309 flag aSign, bSign;
4311 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4312 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4313 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4314 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4316 float_raise( float_flag_invalid STATUS_VAR);
4317 return 0;
4319 aSign = extractFloatx80Sign( a );
4320 bSign = extractFloatx80Sign( b );
4321 if ( aSign != bSign ) {
4322 return
4323 aSign
4324 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4325 == 0 );
4327 return
4328 aSign ? le128( b.high, b.low, a.high, a.low )
4329 : le128( a.high, a.low, b.high, b.low );
4333 /*----------------------------------------------------------------------------
4334 | Returns 1 if the extended double-precision floating-point value `a' is
4335 | less than the corresponding value `b', and 0 otherwise. The comparison
4336 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4337 | Arithmetic.
4338 *----------------------------------------------------------------------------*/
4340 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4342 flag aSign, bSign;
4344 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4345 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4346 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4347 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4349 float_raise( float_flag_invalid STATUS_VAR);
4350 return 0;
4352 aSign = extractFloatx80Sign( a );
4353 bSign = extractFloatx80Sign( b );
4354 if ( aSign != bSign ) {
4355 return
4356 aSign
4357 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4358 != 0 );
4360 return
4361 aSign ? lt128( b.high, b.low, a.high, a.low )
4362 : lt128( a.high, a.low, b.high, b.low );
4366 /*----------------------------------------------------------------------------
4367 | Returns 1 if the extended double-precision floating-point value `a' is equal
4368 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4369 | raised if either operand is a NaN. Otherwise, the comparison is performed
4370 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4371 *----------------------------------------------------------------------------*/
4373 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4376 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4377 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4378 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4379 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4381 float_raise( float_flag_invalid STATUS_VAR);
4382 return 0;
4384 return
4385 ( a.low == b.low )
4386 && ( ( a.high == b.high )
4387 || ( ( a.low == 0 )
4388 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4393 /*----------------------------------------------------------------------------
4394 | Returns 1 if the extended double-precision floating-point value `a' is less
4395 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4396 | do not cause an exception. Otherwise, the comparison is performed according
4397 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4398 *----------------------------------------------------------------------------*/
4400 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4402 flag aSign, bSign;
4404 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4405 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4406 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4407 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4409 if ( floatx80_is_signaling_nan( a )
4410 || floatx80_is_signaling_nan( b ) ) {
4411 float_raise( float_flag_invalid STATUS_VAR);
4413 return 0;
4415 aSign = extractFloatx80Sign( a );
4416 bSign = extractFloatx80Sign( b );
4417 if ( aSign != bSign ) {
4418 return
4419 aSign
4420 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4421 == 0 );
4423 return
4424 aSign ? le128( b.high, b.low, a.high, a.low )
4425 : le128( a.high, a.low, b.high, b.low );
4429 /*----------------------------------------------------------------------------
4430 | Returns 1 if the extended double-precision floating-point value `a' is less
4431 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4432 | an exception. Otherwise, the comparison is performed according to the
4433 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4434 *----------------------------------------------------------------------------*/
4436 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4438 flag aSign, bSign;
4440 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4441 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4442 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4443 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4445 if ( floatx80_is_signaling_nan( a )
4446 || floatx80_is_signaling_nan( b ) ) {
4447 float_raise( float_flag_invalid STATUS_VAR);
4449 return 0;
4451 aSign = extractFloatx80Sign( a );
4452 bSign = extractFloatx80Sign( b );
4453 if ( aSign != bSign ) {
4454 return
4455 aSign
4456 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4457 != 0 );
4459 return
4460 aSign ? lt128( b.high, b.low, a.high, a.low )
4461 : lt128( a.high, a.low, b.high, b.low );
4465 #endif
4467 #ifdef FLOAT128
4469 /*----------------------------------------------------------------------------
4470 | Returns the result of converting the quadruple-precision floating-point
4471 | value `a' to the 32-bit two's complement integer format. The conversion
4472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4473 | Arithmetic---which means in particular that the conversion is rounded
4474 | according to the current rounding mode. If `a' is a NaN, the largest
4475 | positive integer is returned. Otherwise, if the conversion overflows, the
4476 | largest integer with the same sign as `a' is returned.
4477 *----------------------------------------------------------------------------*/
4479 int32 float128_to_int32( float128 a STATUS_PARAM )
4481 flag aSign;
4482 int32 aExp, shiftCount;
4483 bits64 aSig0, aSig1;
4485 aSig1 = extractFloat128Frac1( a );
4486 aSig0 = extractFloat128Frac0( a );
4487 aExp = extractFloat128Exp( a );
4488 aSign = extractFloat128Sign( a );
4489 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4490 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4491 aSig0 |= ( aSig1 != 0 );
4492 shiftCount = 0x4028 - aExp;
4493 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4494 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4498 /*----------------------------------------------------------------------------
4499 | Returns the result of converting the quadruple-precision floating-point
4500 | value `a' to the 32-bit two's complement integer format. The conversion
4501 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4502 | Arithmetic, except that the conversion is always rounded toward zero. If
4503 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4504 | conversion overflows, the largest integer with the same sign as `a' is
4505 | returned.
4506 *----------------------------------------------------------------------------*/
4508 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4510 flag aSign;
4511 int32 aExp, shiftCount;
4512 bits64 aSig0, aSig1, savedASig;
4513 int32 z;
4515 aSig1 = extractFloat128Frac1( a );
4516 aSig0 = extractFloat128Frac0( a );
4517 aExp = extractFloat128Exp( a );
4518 aSign = extractFloat128Sign( a );
4519 aSig0 |= ( aSig1 != 0 );
4520 if ( 0x401E < aExp ) {
4521 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4522 goto invalid;
4524 else if ( aExp < 0x3FFF ) {
4525 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4526 return 0;
4528 aSig0 |= LIT64( 0x0001000000000000 );
4529 shiftCount = 0x402F - aExp;
4530 savedASig = aSig0;
4531 aSig0 >>= shiftCount;
4532 z = aSig0;
4533 if ( aSign ) z = - z;
4534 if ( ( z < 0 ) ^ aSign ) {
4535 invalid:
4536 float_raise( float_flag_invalid STATUS_VAR);
4537 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4539 if ( ( aSig0<<shiftCount ) != savedASig ) {
4540 STATUS(float_exception_flags) |= float_flag_inexact;
4542 return z;
4546 /*----------------------------------------------------------------------------
4547 | Returns the result of converting the quadruple-precision floating-point
4548 | value `a' to the 64-bit two's complement integer format. The conversion
4549 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4550 | Arithmetic---which means in particular that the conversion is rounded
4551 | according to the current rounding mode. If `a' is a NaN, the largest
4552 | positive integer is returned. Otherwise, if the conversion overflows, the
4553 | largest integer with the same sign as `a' is returned.
4554 *----------------------------------------------------------------------------*/
4556 int64 float128_to_int64( float128 a STATUS_PARAM )
4558 flag aSign;
4559 int32 aExp, shiftCount;
4560 bits64 aSig0, aSig1;
4562 aSig1 = extractFloat128Frac1( a );
4563 aSig0 = extractFloat128Frac0( a );
4564 aExp = extractFloat128Exp( a );
4565 aSign = extractFloat128Sign( a );
4566 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4567 shiftCount = 0x402F - aExp;
4568 if ( shiftCount <= 0 ) {
4569 if ( 0x403E < aExp ) {
4570 float_raise( float_flag_invalid STATUS_VAR);
4571 if ( ! aSign
4572 || ( ( aExp == 0x7FFF )
4573 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4576 return LIT64( 0x7FFFFFFFFFFFFFFF );
4578 return (sbits64) LIT64( 0x8000000000000000 );
4580 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4582 else {
4583 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4585 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4589 /*----------------------------------------------------------------------------
4590 | Returns the result of converting the quadruple-precision floating-point
4591 | value `a' to the 64-bit two's complement integer format. The conversion
4592 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4593 | Arithmetic, except that the conversion is always rounded toward zero.
4594 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4595 | the conversion overflows, the largest integer with the same sign as `a' is
4596 | returned.
4597 *----------------------------------------------------------------------------*/
4599 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4601 flag aSign;
4602 int32 aExp, shiftCount;
4603 bits64 aSig0, aSig1;
4604 int64 z;
4606 aSig1 = extractFloat128Frac1( a );
4607 aSig0 = extractFloat128Frac0( a );
4608 aExp = extractFloat128Exp( a );
4609 aSign = extractFloat128Sign( a );
4610 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4611 shiftCount = aExp - 0x402F;
4612 if ( 0 < shiftCount ) {
4613 if ( 0x403E <= aExp ) {
4614 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4615 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4616 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4617 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4619 else {
4620 float_raise( float_flag_invalid STATUS_VAR);
4621 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4622 return LIT64( 0x7FFFFFFFFFFFFFFF );
4625 return (sbits64) LIT64( 0x8000000000000000 );
4627 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4628 if ( (bits64) ( aSig1<<shiftCount ) ) {
4629 STATUS(float_exception_flags) |= float_flag_inexact;
4632 else {
4633 if ( aExp < 0x3FFF ) {
4634 if ( aExp | aSig0 | aSig1 ) {
4635 STATUS(float_exception_flags) |= float_flag_inexact;
4637 return 0;
4639 z = aSig0>>( - shiftCount );
4640 if ( aSig1
4641 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4642 STATUS(float_exception_flags) |= float_flag_inexact;
4645 if ( aSign ) z = - z;
4646 return z;
4650 /*----------------------------------------------------------------------------
4651 | Returns the result of converting the quadruple-precision floating-point
4652 | value `a' to the single-precision floating-point format. The conversion
4653 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4654 | Arithmetic.
4655 *----------------------------------------------------------------------------*/
4657 float32 float128_to_float32( float128 a STATUS_PARAM )
4659 flag aSign;
4660 int32 aExp;
4661 bits64 aSig0, aSig1;
4662 bits32 zSig;
4664 aSig1 = extractFloat128Frac1( a );
4665 aSig0 = extractFloat128Frac0( a );
4666 aExp = extractFloat128Exp( a );
4667 aSign = extractFloat128Sign( a );
4668 if ( aExp == 0x7FFF ) {
4669 if ( aSig0 | aSig1 ) {
4670 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4672 return packFloat32( aSign, 0xFF, 0 );
4674 aSig0 |= ( aSig1 != 0 );
4675 shift64RightJamming( aSig0, 18, &aSig0 );
4676 zSig = aSig0;
4677 if ( aExp || zSig ) {
4678 zSig |= 0x40000000;
4679 aExp -= 0x3F81;
4681 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4685 /*----------------------------------------------------------------------------
4686 | Returns the result of converting the quadruple-precision floating-point
4687 | value `a' to the double-precision floating-point format. The conversion
4688 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4689 | Arithmetic.
4690 *----------------------------------------------------------------------------*/
4692 float64 float128_to_float64( float128 a STATUS_PARAM )
4694 flag aSign;
4695 int32 aExp;
4696 bits64 aSig0, aSig1;
4698 aSig1 = extractFloat128Frac1( a );
4699 aSig0 = extractFloat128Frac0( a );
4700 aExp = extractFloat128Exp( a );
4701 aSign = extractFloat128Sign( a );
4702 if ( aExp == 0x7FFF ) {
4703 if ( aSig0 | aSig1 ) {
4704 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4706 return packFloat64( aSign, 0x7FF, 0 );
4708 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4709 aSig0 |= ( aSig1 != 0 );
4710 if ( aExp || aSig0 ) {
4711 aSig0 |= LIT64( 0x4000000000000000 );
4712 aExp -= 0x3C01;
4714 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4718 #ifdef FLOATX80
4720 /*----------------------------------------------------------------------------
4721 | Returns the result of converting the quadruple-precision floating-point
4722 | value `a' to the extended double-precision floating-point format. The
4723 | conversion is performed according to the IEC/IEEE Standard for Binary
4724 | Floating-Point Arithmetic.
4725 *----------------------------------------------------------------------------*/
4727 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4729 flag aSign;
4730 int32 aExp;
4731 bits64 aSig0, aSig1;
4733 aSig1 = extractFloat128Frac1( a );
4734 aSig0 = extractFloat128Frac0( a );
4735 aExp = extractFloat128Exp( a );
4736 aSign = extractFloat128Sign( a );
4737 if ( aExp == 0x7FFF ) {
4738 if ( aSig0 | aSig1 ) {
4739 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4741 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4743 if ( aExp == 0 ) {
4744 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4745 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4747 else {
4748 aSig0 |= LIT64( 0x0001000000000000 );
4750 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4751 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4755 #endif
4757 /*----------------------------------------------------------------------------
4758 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4759 | returns the result as a quadruple-precision floating-point value. The
4760 | operation is performed according to the IEC/IEEE Standard for Binary
4761 | Floating-Point Arithmetic.
4762 *----------------------------------------------------------------------------*/
4764 float128 float128_round_to_int( float128 a STATUS_PARAM )
4766 flag aSign;
4767 int32 aExp;
4768 bits64 lastBitMask, roundBitsMask;
4769 int8 roundingMode;
4770 float128 z;
4772 aExp = extractFloat128Exp( a );
4773 if ( 0x402F <= aExp ) {
4774 if ( 0x406F <= aExp ) {
4775 if ( ( aExp == 0x7FFF )
4776 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4778 return propagateFloat128NaN( a, a STATUS_VAR );
4780 return a;
4782 lastBitMask = 1;
4783 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4784 roundBitsMask = lastBitMask - 1;
4785 z = a;
4786 roundingMode = STATUS(float_rounding_mode);
4787 if ( roundingMode == float_round_nearest_even ) {
4788 if ( lastBitMask ) {
4789 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4790 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4792 else {
4793 if ( (sbits64) z.low < 0 ) {
4794 ++z.high;
4795 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4799 else if ( roundingMode != float_round_to_zero ) {
4800 if ( extractFloat128Sign( z )
4801 ^ ( roundingMode == float_round_up ) ) {
4802 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4805 z.low &= ~ roundBitsMask;
4807 else {
4808 if ( aExp < 0x3FFF ) {
4809 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4810 STATUS(float_exception_flags) |= float_flag_inexact;
4811 aSign = extractFloat128Sign( a );
4812 switch ( STATUS(float_rounding_mode) ) {
4813 case float_round_nearest_even:
4814 if ( ( aExp == 0x3FFE )
4815 && ( extractFloat128Frac0( a )
4816 | extractFloat128Frac1( a ) )
4818 return packFloat128( aSign, 0x3FFF, 0, 0 );
4820 break;
4821 case float_round_down:
4822 return
4823 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4824 : packFloat128( 0, 0, 0, 0 );
4825 case float_round_up:
4826 return
4827 aSign ? packFloat128( 1, 0, 0, 0 )
4828 : packFloat128( 0, 0x3FFF, 0, 0 );
4830 return packFloat128( aSign, 0, 0, 0 );
4832 lastBitMask = 1;
4833 lastBitMask <<= 0x402F - aExp;
4834 roundBitsMask = lastBitMask - 1;
4835 z.low = 0;
4836 z.high = a.high;
4837 roundingMode = STATUS(float_rounding_mode);
4838 if ( roundingMode == float_round_nearest_even ) {
4839 z.high += lastBitMask>>1;
4840 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4841 z.high &= ~ lastBitMask;
4844 else if ( roundingMode != float_round_to_zero ) {
4845 if ( extractFloat128Sign( z )
4846 ^ ( roundingMode == float_round_up ) ) {
4847 z.high |= ( a.low != 0 );
4848 z.high += roundBitsMask;
4851 z.high &= ~ roundBitsMask;
4853 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4854 STATUS(float_exception_flags) |= float_flag_inexact;
4856 return z;
4860 /*----------------------------------------------------------------------------
4861 | Returns the result of adding the absolute values of the quadruple-precision
4862 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4863 | before being returned. `zSign' is ignored if the result is a NaN.
4864 | The addition is performed according to the IEC/IEEE Standard for Binary
4865 | Floating-Point Arithmetic.
4866 *----------------------------------------------------------------------------*/
4868 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4870 int32 aExp, bExp, zExp;
4871 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4872 int32 expDiff;
4874 aSig1 = extractFloat128Frac1( a );
4875 aSig0 = extractFloat128Frac0( a );
4876 aExp = extractFloat128Exp( a );
4877 bSig1 = extractFloat128Frac1( b );
4878 bSig0 = extractFloat128Frac0( b );
4879 bExp = extractFloat128Exp( b );
4880 expDiff = aExp - bExp;
4881 if ( 0 < expDiff ) {
4882 if ( aExp == 0x7FFF ) {
4883 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4884 return a;
4886 if ( bExp == 0 ) {
4887 --expDiff;
4889 else {
4890 bSig0 |= LIT64( 0x0001000000000000 );
4892 shift128ExtraRightJamming(
4893 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4894 zExp = aExp;
4896 else if ( expDiff < 0 ) {
4897 if ( bExp == 0x7FFF ) {
4898 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4899 return packFloat128( zSign, 0x7FFF, 0, 0 );
4901 if ( aExp == 0 ) {
4902 ++expDiff;
4904 else {
4905 aSig0 |= LIT64( 0x0001000000000000 );
4907 shift128ExtraRightJamming(
4908 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4909 zExp = bExp;
4911 else {
4912 if ( aExp == 0x7FFF ) {
4913 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4914 return propagateFloat128NaN( a, b STATUS_VAR );
4916 return a;
4918 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4919 if ( aExp == 0 ) {
4920 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
4921 return packFloat128( zSign, 0, zSig0, zSig1 );
4923 zSig2 = 0;
4924 zSig0 |= LIT64( 0x0002000000000000 );
4925 zExp = aExp;
4926 goto shiftRight1;
4928 aSig0 |= LIT64( 0x0001000000000000 );
4929 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4930 --zExp;
4931 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4932 ++zExp;
4933 shiftRight1:
4934 shift128ExtraRightJamming(
4935 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4936 roundAndPack:
4937 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4941 /*----------------------------------------------------------------------------
4942 | Returns the result of subtracting the absolute values of the quadruple-
4943 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4944 | difference is negated before being returned. `zSign' is ignored if the
4945 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4946 | Standard for Binary Floating-Point Arithmetic.
4947 *----------------------------------------------------------------------------*/
4949 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4951 int32 aExp, bExp, zExp;
4952 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4953 int32 expDiff;
4954 float128 z;
4956 aSig1 = extractFloat128Frac1( a );
4957 aSig0 = extractFloat128Frac0( a );
4958 aExp = extractFloat128Exp( a );
4959 bSig1 = extractFloat128Frac1( b );
4960 bSig0 = extractFloat128Frac0( b );
4961 bExp = extractFloat128Exp( b );
4962 expDiff = aExp - bExp;
4963 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4964 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4965 if ( 0 < expDiff ) goto aExpBigger;
4966 if ( expDiff < 0 ) goto bExpBigger;
4967 if ( aExp == 0x7FFF ) {
4968 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4969 return propagateFloat128NaN( a, b STATUS_VAR );
4971 float_raise( float_flag_invalid STATUS_VAR);
4972 z.low = float128_default_nan_low;
4973 z.high = float128_default_nan_high;
4974 return z;
4976 if ( aExp == 0 ) {
4977 aExp = 1;
4978 bExp = 1;
4980 if ( bSig0 < aSig0 ) goto aBigger;
4981 if ( aSig0 < bSig0 ) goto bBigger;
4982 if ( bSig1 < aSig1 ) goto aBigger;
4983 if ( aSig1 < bSig1 ) goto bBigger;
4984 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4985 bExpBigger:
4986 if ( bExp == 0x7FFF ) {
4987 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4988 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4990 if ( aExp == 0 ) {
4991 ++expDiff;
4993 else {
4994 aSig0 |= LIT64( 0x4000000000000000 );
4996 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4997 bSig0 |= LIT64( 0x4000000000000000 );
4998 bBigger:
4999 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5000 zExp = bExp;
5001 zSign ^= 1;
5002 goto normalizeRoundAndPack;
5003 aExpBigger:
5004 if ( aExp == 0x7FFF ) {
5005 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5006 return a;
5008 if ( bExp == 0 ) {
5009 --expDiff;
5011 else {
5012 bSig0 |= LIT64( 0x4000000000000000 );
5014 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5015 aSig0 |= LIT64( 0x4000000000000000 );
5016 aBigger:
5017 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5018 zExp = aExp;
5019 normalizeRoundAndPack:
5020 --zExp;
5021 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5025 /*----------------------------------------------------------------------------
5026 | Returns the result of adding the quadruple-precision floating-point values
5027 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5028 | for Binary Floating-Point Arithmetic.
5029 *----------------------------------------------------------------------------*/
5031 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5033 flag aSign, bSign;
5035 aSign = extractFloat128Sign( a );
5036 bSign = extractFloat128Sign( b );
5037 if ( aSign == bSign ) {
5038 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5040 else {
5041 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5046 /*----------------------------------------------------------------------------
5047 | Returns the result of subtracting the quadruple-precision floating-point
5048 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5049 | Standard for Binary Floating-Point Arithmetic.
5050 *----------------------------------------------------------------------------*/
5052 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5054 flag aSign, bSign;
5056 aSign = extractFloat128Sign( a );
5057 bSign = extractFloat128Sign( b );
5058 if ( aSign == bSign ) {
5059 return subFloat128Sigs( a, b, aSign STATUS_VAR );
5061 else {
5062 return addFloat128Sigs( a, b, aSign STATUS_VAR );
5067 /*----------------------------------------------------------------------------
5068 | Returns the result of multiplying the quadruple-precision floating-point
5069 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5070 | Standard for Binary Floating-Point Arithmetic.
5071 *----------------------------------------------------------------------------*/
5073 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5075 flag aSign, bSign, zSign;
5076 int32 aExp, bExp, zExp;
5077 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5078 float128 z;
5080 aSig1 = extractFloat128Frac1( a );
5081 aSig0 = extractFloat128Frac0( a );
5082 aExp = extractFloat128Exp( a );
5083 aSign = extractFloat128Sign( a );
5084 bSig1 = extractFloat128Frac1( b );
5085 bSig0 = extractFloat128Frac0( b );
5086 bExp = extractFloat128Exp( b );
5087 bSign = extractFloat128Sign( b );
5088 zSign = aSign ^ bSign;
5089 if ( aExp == 0x7FFF ) {
5090 if ( ( aSig0 | aSig1 )
5091 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5092 return propagateFloat128NaN( a, b STATUS_VAR );
5094 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5095 return packFloat128( zSign, 0x7FFF, 0, 0 );
5097 if ( bExp == 0x7FFF ) {
5098 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5099 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5100 invalid:
5101 float_raise( float_flag_invalid STATUS_VAR);
5102 z.low = float128_default_nan_low;
5103 z.high = float128_default_nan_high;
5104 return z;
5106 return packFloat128( zSign, 0x7FFF, 0, 0 );
5108 if ( aExp == 0 ) {
5109 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5110 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5112 if ( bExp == 0 ) {
5113 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5114 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5116 zExp = aExp + bExp - 0x4000;
5117 aSig0 |= LIT64( 0x0001000000000000 );
5118 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5119 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5120 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5121 zSig2 |= ( zSig3 != 0 );
5122 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5123 shift128ExtraRightJamming(
5124 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5125 ++zExp;
5127 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5131 /*----------------------------------------------------------------------------
5132 | Returns the result of dividing the quadruple-precision floating-point value
5133 | `a' by the corresponding value `b'. The operation is performed according to
5134 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5135 *----------------------------------------------------------------------------*/
5137 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5139 flag aSign, bSign, zSign;
5140 int32 aExp, bExp, zExp;
5141 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5142 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5143 float128 z;
5145 aSig1 = extractFloat128Frac1( a );
5146 aSig0 = extractFloat128Frac0( a );
5147 aExp = extractFloat128Exp( a );
5148 aSign = extractFloat128Sign( a );
5149 bSig1 = extractFloat128Frac1( b );
5150 bSig0 = extractFloat128Frac0( b );
5151 bExp = extractFloat128Exp( b );
5152 bSign = extractFloat128Sign( b );
5153 zSign = aSign ^ bSign;
5154 if ( aExp == 0x7FFF ) {
5155 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5156 if ( bExp == 0x7FFF ) {
5157 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5158 goto invalid;
5160 return packFloat128( zSign, 0x7FFF, 0, 0 );
5162 if ( bExp == 0x7FFF ) {
5163 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5164 return packFloat128( zSign, 0, 0, 0 );
5166 if ( bExp == 0 ) {
5167 if ( ( bSig0 | bSig1 ) == 0 ) {
5168 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5169 invalid:
5170 float_raise( float_flag_invalid STATUS_VAR);
5171 z.low = float128_default_nan_low;
5172 z.high = float128_default_nan_high;
5173 return z;
5175 float_raise( float_flag_divbyzero STATUS_VAR);
5176 return packFloat128( zSign, 0x7FFF, 0, 0 );
5178 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5180 if ( aExp == 0 ) {
5181 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5182 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5184 zExp = aExp - bExp + 0x3FFD;
5185 shortShift128Left(
5186 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5187 shortShift128Left(
5188 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5189 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5190 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5191 ++zExp;
5193 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5194 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5195 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5196 while ( (sbits64) rem0 < 0 ) {
5197 --zSig0;
5198 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5200 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5201 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5202 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5203 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5204 while ( (sbits64) rem1 < 0 ) {
5205 --zSig1;
5206 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5208 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5210 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5211 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5215 /*----------------------------------------------------------------------------
5216 | Returns the remainder of the quadruple-precision floating-point value `a'
5217 | with respect to the corresponding value `b'. The operation is performed
5218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5219 *----------------------------------------------------------------------------*/
5221 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5223 flag aSign, zSign;
5224 int32 aExp, bExp, expDiff;
5225 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5226 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5227 sbits64 sigMean0;
5228 float128 z;
5230 aSig1 = extractFloat128Frac1( a );
5231 aSig0 = extractFloat128Frac0( a );
5232 aExp = extractFloat128Exp( a );
5233 aSign = extractFloat128Sign( a );
5234 bSig1 = extractFloat128Frac1( b );
5235 bSig0 = extractFloat128Frac0( b );
5236 bExp = extractFloat128Exp( b );
5237 if ( aExp == 0x7FFF ) {
5238 if ( ( aSig0 | aSig1 )
5239 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5240 return propagateFloat128NaN( a, b STATUS_VAR );
5242 goto invalid;
5244 if ( bExp == 0x7FFF ) {
5245 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5246 return a;
5248 if ( bExp == 0 ) {
5249 if ( ( bSig0 | bSig1 ) == 0 ) {
5250 invalid:
5251 float_raise( float_flag_invalid STATUS_VAR);
5252 z.low = float128_default_nan_low;
5253 z.high = float128_default_nan_high;
5254 return z;
5256 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5258 if ( aExp == 0 ) {
5259 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5260 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5262 expDiff = aExp - bExp;
5263 if ( expDiff < -1 ) return a;
5264 shortShift128Left(
5265 aSig0 | LIT64( 0x0001000000000000 ),
5266 aSig1,
5267 15 - ( expDiff < 0 ),
5268 &aSig0,
5269 &aSig1
5271 shortShift128Left(
5272 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5273 q = le128( bSig0, bSig1, aSig0, aSig1 );
5274 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5275 expDiff -= 64;
5276 while ( 0 < expDiff ) {
5277 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5278 q = ( 4 < q ) ? q - 4 : 0;
5279 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5280 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5281 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5282 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5283 expDiff -= 61;
5285 if ( -64 < expDiff ) {
5286 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5287 q = ( 4 < q ) ? q - 4 : 0;
5288 q >>= - expDiff;
5289 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5290 expDiff += 52;
5291 if ( expDiff < 0 ) {
5292 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5294 else {
5295 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5297 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5298 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5300 else {
5301 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5302 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5304 do {
5305 alternateASig0 = aSig0;
5306 alternateASig1 = aSig1;
5307 ++q;
5308 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5309 } while ( 0 <= (sbits64) aSig0 );
5310 add128(
5311 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5312 if ( ( sigMean0 < 0 )
5313 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5314 aSig0 = alternateASig0;
5315 aSig1 = alternateASig1;
5317 zSign = ( (sbits64) aSig0 < 0 );
5318 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5319 return
5320 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5324 /*----------------------------------------------------------------------------
5325 | Returns the square root of the quadruple-precision floating-point value `a'.
5326 | The operation is performed according to the IEC/IEEE Standard for Binary
5327 | Floating-Point Arithmetic.
5328 *----------------------------------------------------------------------------*/
5330 float128 float128_sqrt( float128 a STATUS_PARAM )
5332 flag aSign;
5333 int32 aExp, zExp;
5334 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5335 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5336 float128 z;
5338 aSig1 = extractFloat128Frac1( a );
5339 aSig0 = extractFloat128Frac0( a );
5340 aExp = extractFloat128Exp( a );
5341 aSign = extractFloat128Sign( a );
5342 if ( aExp == 0x7FFF ) {
5343 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5344 if ( ! aSign ) return a;
5345 goto invalid;
5347 if ( aSign ) {
5348 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5349 invalid:
5350 float_raise( float_flag_invalid STATUS_VAR);
5351 z.low = float128_default_nan_low;
5352 z.high = float128_default_nan_high;
5353 return z;
5355 if ( aExp == 0 ) {
5356 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5357 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5359 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5360 aSig0 |= LIT64( 0x0001000000000000 );
5361 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5362 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5363 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5364 doubleZSig0 = zSig0<<1;
5365 mul64To128( zSig0, zSig0, &term0, &term1 );
5366 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5367 while ( (sbits64) rem0 < 0 ) {
5368 --zSig0;
5369 doubleZSig0 -= 2;
5370 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5372 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5373 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5374 if ( zSig1 == 0 ) zSig1 = 1;
5375 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5376 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5377 mul64To128( zSig1, zSig1, &term2, &term3 );
5378 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5379 while ( (sbits64) rem1 < 0 ) {
5380 --zSig1;
5381 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5382 term3 |= 1;
5383 term2 |= doubleZSig0;
5384 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5386 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5388 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5389 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5393 /*----------------------------------------------------------------------------
5394 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5395 | the corresponding value `b', and 0 otherwise. The comparison is performed
5396 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5397 *----------------------------------------------------------------------------*/
5399 int float128_eq( float128 a, float128 b STATUS_PARAM )
5402 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5403 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5404 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5405 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5407 if ( float128_is_signaling_nan( a )
5408 || float128_is_signaling_nan( b ) ) {
5409 float_raise( float_flag_invalid STATUS_VAR);
5411 return 0;
5413 return
5414 ( a.low == b.low )
5415 && ( ( a.high == b.high )
5416 || ( ( a.low == 0 )
5417 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5422 /*----------------------------------------------------------------------------
5423 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5424 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5425 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5426 | Arithmetic.
5427 *----------------------------------------------------------------------------*/
5429 int float128_le( float128 a, float128 b STATUS_PARAM )
5431 flag aSign, bSign;
5433 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5434 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5435 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5436 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5438 float_raise( float_flag_invalid STATUS_VAR);
5439 return 0;
5441 aSign = extractFloat128Sign( a );
5442 bSign = extractFloat128Sign( b );
5443 if ( aSign != bSign ) {
5444 return
5445 aSign
5446 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5447 == 0 );
5449 return
5450 aSign ? le128( b.high, b.low, a.high, a.low )
5451 : le128( a.high, a.low, b.high, b.low );
5455 /*----------------------------------------------------------------------------
5456 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5457 | the corresponding value `b', and 0 otherwise. The comparison is performed
5458 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5459 *----------------------------------------------------------------------------*/
5461 int float128_lt( float128 a, float128 b STATUS_PARAM )
5463 flag aSign, bSign;
5465 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5466 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5467 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5468 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5470 float_raise( float_flag_invalid STATUS_VAR);
5471 return 0;
5473 aSign = extractFloat128Sign( a );
5474 bSign = extractFloat128Sign( b );
5475 if ( aSign != bSign ) {
5476 return
5477 aSign
5478 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5479 != 0 );
5481 return
5482 aSign ? lt128( b.high, b.low, a.high, a.low )
5483 : lt128( a.high, a.low, b.high, b.low );
5487 /*----------------------------------------------------------------------------
5488 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5489 | the corresponding value `b', and 0 otherwise. The invalid exception is
5490 | raised if either operand is a NaN. Otherwise, the comparison is performed
5491 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5492 *----------------------------------------------------------------------------*/
5494 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5497 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5498 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5499 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5500 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5502 float_raise( float_flag_invalid STATUS_VAR);
5503 return 0;
5505 return
5506 ( a.low == b.low )
5507 && ( ( a.high == b.high )
5508 || ( ( a.low == 0 )
5509 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5514 /*----------------------------------------------------------------------------
5515 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5516 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5517 | cause an exception. Otherwise, the comparison is performed according to the
5518 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5519 *----------------------------------------------------------------------------*/
5521 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5523 flag aSign, bSign;
5525 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5526 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5527 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5528 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5530 if ( float128_is_signaling_nan( a )
5531 || float128_is_signaling_nan( b ) ) {
5532 float_raise( float_flag_invalid STATUS_VAR);
5534 return 0;
5536 aSign = extractFloat128Sign( a );
5537 bSign = extractFloat128Sign( b );
5538 if ( aSign != bSign ) {
5539 return
5540 aSign
5541 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5542 == 0 );
5544 return
5545 aSign ? le128( b.high, b.low, a.high, a.low )
5546 : le128( a.high, a.low, b.high, b.low );
5550 /*----------------------------------------------------------------------------
5551 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5552 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5553 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5554 | Standard for Binary Floating-Point Arithmetic.
5555 *----------------------------------------------------------------------------*/
5557 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5559 flag aSign, bSign;
5561 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5562 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5563 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5564 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5566 if ( float128_is_signaling_nan( a )
5567 || float128_is_signaling_nan( b ) ) {
5568 float_raise( float_flag_invalid STATUS_VAR);
5570 return 0;
5572 aSign = extractFloat128Sign( a );
5573 bSign = extractFloat128Sign( b );
5574 if ( aSign != bSign ) {
5575 return
5576 aSign
5577 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5578 != 0 );
5580 return
5581 aSign ? lt128( b.high, b.low, a.high, a.low )
5582 : lt128( a.high, a.low, b.high, b.low );
5586 #endif
5588 /* misc functions */
5589 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5591 return int64_to_float32(a STATUS_VAR);
5594 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5596 return int64_to_float64(a STATUS_VAR);
5599 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5601 int64_t v;
5602 unsigned int res;
5604 v = float32_to_int64(a STATUS_VAR);
5605 if (v < 0) {
5606 res = 0;
5607 float_raise( float_flag_invalid STATUS_VAR);
5608 } else if (v > 0xffffffff) {
5609 res = 0xffffffff;
5610 float_raise( float_flag_invalid STATUS_VAR);
5611 } else {
5612 res = v;
5614 return res;
5617 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5619 int64_t v;
5620 unsigned int res;
5622 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5623 if (v < 0) {
5624 res = 0;
5625 float_raise( float_flag_invalid STATUS_VAR);
5626 } else if (v > 0xffffffff) {
5627 res = 0xffffffff;
5628 float_raise( float_flag_invalid STATUS_VAR);
5629 } else {
5630 res = v;
5632 return res;
5635 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5637 int64_t v;
5638 unsigned int res;
5640 v = float64_to_int64(a STATUS_VAR);
5641 if (v < 0) {
5642 res = 0;
5643 float_raise( float_flag_invalid STATUS_VAR);
5644 } else if (v > 0xffffffff) {
5645 res = 0xffffffff;
5646 float_raise( float_flag_invalid STATUS_VAR);
5647 } else {
5648 res = v;
5650 return res;
5653 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5655 int64_t v;
5656 unsigned int res;
5658 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5659 if (v < 0) {
5660 res = 0;
5661 float_raise( float_flag_invalid STATUS_VAR);
5662 } else if (v > 0xffffffff) {
5663 res = 0xffffffff;
5664 float_raise( float_flag_invalid STATUS_VAR);
5665 } else {
5666 res = v;
5668 return res;
5671 /* FIXME: This looks broken. */
5672 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5674 int64_t v;
5676 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5677 v += float64_val(a);
5678 v = float64_to_int64(make_float64(v) STATUS_VAR);
5680 return v - INT64_MIN;
5683 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5685 int64_t v;
5687 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5688 v += float64_val(a);
5689 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5691 return v - INT64_MIN;
5694 #define COMPARE(s, nan_exp) \
5695 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5696 int is_quiet STATUS_PARAM ) \
5698 flag aSign, bSign; \
5699 bits ## s av, bv; \
5701 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5702 extractFloat ## s ## Frac( a ) ) || \
5703 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5704 extractFloat ## s ## Frac( b ) )) { \
5705 if (!is_quiet || \
5706 float ## s ## _is_signaling_nan( a ) || \
5707 float ## s ## _is_signaling_nan( b ) ) { \
5708 float_raise( float_flag_invalid STATUS_VAR); \
5710 return float_relation_unordered; \
5712 aSign = extractFloat ## s ## Sign( a ); \
5713 bSign = extractFloat ## s ## Sign( b ); \
5714 av = float ## s ## _val(a); \
5715 bv = float ## s ## _val(b); \
5716 if ( aSign != bSign ) { \
5717 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5718 /* zero case */ \
5719 return float_relation_equal; \
5720 } else { \
5721 return 1 - (2 * aSign); \
5723 } else { \
5724 if (av == bv) { \
5725 return float_relation_equal; \
5726 } else { \
5727 return 1 - 2 * (aSign ^ ( av < bv )); \
5732 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5734 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5737 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5739 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5742 COMPARE(32, 0xff)
5743 COMPARE(64, 0x7ff)
5745 INLINE int float128_compare_internal( float128 a, float128 b,
5746 int is_quiet STATUS_PARAM )
5748 flag aSign, bSign;
5750 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5751 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5752 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5753 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5754 if (!is_quiet ||
5755 float128_is_signaling_nan( a ) ||
5756 float128_is_signaling_nan( b ) ) {
5757 float_raise( float_flag_invalid STATUS_VAR);
5759 return float_relation_unordered;
5761 aSign = extractFloat128Sign( a );
5762 bSign = extractFloat128Sign( b );
5763 if ( aSign != bSign ) {
5764 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5765 /* zero case */
5766 return float_relation_equal;
5767 } else {
5768 return 1 - (2 * aSign);
5770 } else {
5771 if (a.low == b.low && a.high == b.high) {
5772 return float_relation_equal;
5773 } else {
5774 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5779 int float128_compare( float128 a, float128 b STATUS_PARAM )
5781 return float128_compare_internal(a, b, 0 STATUS_VAR);
5784 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5786 return float128_compare_internal(a, b, 1 STATUS_VAR);
5789 /* Multiply A by 2 raised to the power N. */
5790 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5792 flag aSign;
5793 int16 aExp;
5794 bits32 aSig;
5796 aSig = extractFloat32Frac( a );
5797 aExp = extractFloat32Exp( a );
5798 aSign = extractFloat32Sign( a );
5800 if ( aExp == 0xFF ) {
5801 return a;
5803 if ( aExp != 0 )
5804 aSig |= 0x00800000;
5805 else if ( aSig == 0 )
5806 return a;
5808 aExp += n - 1;
5809 aSig <<= 7;
5810 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5813 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5815 flag aSign;
5816 int16 aExp;
5817 bits64 aSig;
5819 aSig = extractFloat64Frac( a );
5820 aExp = extractFloat64Exp( a );
5821 aSign = extractFloat64Sign( a );
5823 if ( aExp == 0x7FF ) {
5824 return a;
5826 if ( aExp != 0 )
5827 aSig |= LIT64( 0x0010000000000000 );
5828 else if ( aSig == 0 )
5829 return a;
5831 aExp += n - 1;
5832 aSig <<= 10;
5833 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5836 #ifdef FLOATX80
5837 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5839 flag aSign;
5840 int16 aExp;
5841 bits64 aSig;
5843 aSig = extractFloatx80Frac( a );
5844 aExp = extractFloatx80Exp( a );
5845 aSign = extractFloatx80Sign( a );
5847 if ( aExp == 0x7FF ) {
5848 return a;
5850 if (aExp == 0 && aSig == 0)
5851 return a;
5853 aExp += n;
5854 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5855 aSign, aExp, aSig, 0 STATUS_VAR );
5857 #endif
5859 #ifdef FLOAT128
5860 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5862 flag aSign;
5863 int32 aExp;
5864 bits64 aSig0, aSig1;
5866 aSig1 = extractFloat128Frac1( a );
5867 aSig0 = extractFloat128Frac0( a );
5868 aExp = extractFloat128Exp( a );
5869 aSign = extractFloat128Sign( a );
5870 if ( aExp == 0x7FFF ) {
5871 return a;
5873 if ( aExp != 0 )
5874 aSig0 |= LIT64( 0x0001000000000000 );
5875 else if ( aSig0 == 0 && aSig1 == 0 )
5876 return a;
5878 aExp += n - 1;
5879 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
5880 STATUS_VAR );
5883 #endif