fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * Derived from SoftFloat.
   5  */
   6
   7 /*============================================================================
   8
   9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  10 Package, Release 2b.
  11
  12 Written by John R. Hauser.  This work was made possible in part by the
  13 International Computer Science Institute, located at Suite 600, 1947 Center
  14 Street, Berkeley, California 94704.  Funding was partially provided by the
  15 National Science Foundation under grant MIP-9311980.  The original version
  16 of this code was written as part of a project to build a fixed-point vector
  17 processor in collaboration with the University of California at Berkeley,
  18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  20 arithmetic/SoftFloat.html'.
  21
  22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  30
  31 Derivative works are acceptable, even for commercial purposes, so long as
  32 (1) the source code for the derivative work includes prominent notice that
  33 the work is derivative, and (2) the source code includes prominent notice with
  34 these four paragraphs for those parts of this code that are retained.
  35
  36 =============================================================================*/
  37
  38 #include "softfloat.h"
  39
  40 /*----------------------------------------------------------------------------
  41 | Primitive arithmetic functions, including multi-word arithmetic, and
  42 | division and square root approximations.  (Can be specialized to target if
  43 | desired.)
  44 *----------------------------------------------------------------------------*/
  45 #include "softfloat-macros.h"
  46
  47 /*----------------------------------------------------------------------------
  48 | Functions and definitions to determine:  (1) whether tininess for underflow
  49 | is detected before or after rounding by default, (2) what (if anything)
  50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  52 | are propagated from function inputs to output.  These details are target-
  53 | specific.
  54 *----------------------------------------------------------------------------*/
  55 #include "softfloat-specialize.h"
  56
  57 void set_float_rounding_mode(int val STATUS_PARAM)
  58 {
  59     STATUS(float_rounding_mode) = val;
  60 }
  61
  62 void set_float_exception_flags(int val STATUS_PARAM)
  63 {
  64     STATUS(float_exception_flags) = val;
  65 }
  66
  67 #ifdef FLOATX80
  68 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  69 {
  70     STATUS(floatx80_rounding_precision) = val;
  71 }
  72 #endif
  73
  74 /*----------------------------------------------------------------------------
  75 | Returns the fraction bits of the half-precision floating-point value `a'.
  76 *----------------------------------------------------------------------------*/
  77
  78 INLINE uint32_t extractFloat16Frac(float16 a)
  79 {
  80     return float16_val(a) & 0x3ff;
  81 }
  82
  83 /*----------------------------------------------------------------------------
  84 | Returns the exponent bits of the half-precision floating-point value `a'.
  85 *----------------------------------------------------------------------------*/
  86
  87 INLINE int16 extractFloat16Exp(float16 a)
  88 {
  89     return (float16_val(a) >> 10) & 0x1f;
  90 }
  91
  92 /*----------------------------------------------------------------------------
  93 | Returns the sign bit of the single-precision floating-point value `a'.
  94 *----------------------------------------------------------------------------*/
  95
  96 INLINE flag extractFloat16Sign(float16 a)
  97 {
  98     return float16_val(a)>>15;
  99 }
 100
 101 /*----------------------------------------------------------------------------
 102 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 103 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 104 | input.  If `zSign' is 1, the input is negated before being converted to an
 105 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 106 | is simply rounded to an integer, with the inexact exception raised if the
 107 | input cannot be represented exactly as an integer.  However, if the fixed-
 108 | point input is too large, the invalid exception is raised and the largest
 109 | positive or negative integer is returned.
 110 *----------------------------------------------------------------------------*/
 111
 112 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
 113 {
 114     int8 roundingMode;
 115     flag roundNearestEven;
 116     int8 roundIncrement, roundBits;
 117     int32 z;
 118
 119     roundingMode = STATUS(float_rounding_mode);
 120     roundNearestEven = ( roundingMode == float_round_nearest_even );
 121     roundIncrement = 0x40;
 122     if ( ! roundNearestEven ) {
 123         if ( roundingMode == float_round_to_zero ) {
 124             roundIncrement = 0;
 125         }
 126         else {
 127             roundIncrement = 0x7F;
 128             if ( zSign ) {
 129                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 130             }
 131             else {
 132                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 133             }
 134         }
 135     }
 136     roundBits = absZ & 0x7F;
 137     absZ = ( absZ + roundIncrement )>>7;
 138     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 139     z = absZ;
 140     if ( zSign ) z = - z;
 141     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 142         float_raise( float_flag_invalid STATUS_VAR);
 143         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 144     }
 145     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 146     return z;
 147
 148 }
 149
 150 /*----------------------------------------------------------------------------
 151 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 152 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 153 | and returns the properly rounded 64-bit integer corresponding to the input.
 154 | If `zSign' is 1, the input is negated before being converted to an integer.
 155 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 156 | the inexact exception raised if the input cannot be represented exactly as
 157 | an integer.  However, if the fixed-point input is too large, the invalid
 158 | exception is raised and the largest positive or negative integer is
 159 | returned.
 160 *----------------------------------------------------------------------------*/
 161
 162 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
 163 {
 164     int8 roundingMode;
 165     flag roundNearestEven, increment;
 166     int64 z;
 167
 168     roundingMode = STATUS(float_rounding_mode);
 169     roundNearestEven = ( roundingMode == float_round_nearest_even );
 170     increment = ( (int64_t) absZ1 < 0 );
 171     if ( ! roundNearestEven ) {
 172         if ( roundingMode == float_round_to_zero ) {
 173             increment = 0;
 174         }
 175         else {
 176             if ( zSign ) {
 177                 increment = ( roundingMode == float_round_down ) && absZ1;
 178             }
 179             else {
 180                 increment = ( roundingMode == float_round_up ) && absZ1;
 181             }
 182         }
 183     }
 184     if ( increment ) {
 185         ++absZ0;
 186         if ( absZ0 == 0 ) goto overflow;
 187         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 188     }
 189     z = absZ0;
 190     if ( zSign ) z = - z;
 191     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 192  overflow:
 193         float_raise( float_flag_invalid STATUS_VAR);
 194         return
 195               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 196             : LIT64( 0x7FFFFFFFFFFFFFFF );
 197     }
 198     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 199     return z;
 200
 201 }
 202
 203 /*----------------------------------------------------------------------------
 204 | Returns the fraction bits of the single-precision floating-point value `a'.
 205 *----------------------------------------------------------------------------*/
 206
 207 INLINE uint32_t extractFloat32Frac( float32 a )
 208 {
 209
 210     return float32_val(a) & 0x007FFFFF;
 211
 212 }
 213
 214 /*----------------------------------------------------------------------------
 215 | Returns the exponent bits of the single-precision floating-point value `a'.
 216 *----------------------------------------------------------------------------*/
 217
 218 INLINE int16 extractFloat32Exp( float32 a )
 219 {
 220
 221     return ( float32_val(a)>>23 ) & 0xFF;
 222
 223 }
 224
 225 /*----------------------------------------------------------------------------
 226 | Returns the sign bit of the single-precision floating-point value `a'.
 227 *----------------------------------------------------------------------------*/
 228
 229 INLINE flag extractFloat32Sign( float32 a )
 230 {
 231
 232     return float32_val(a)>>31;
 233
 234 }
 235
 236 /*----------------------------------------------------------------------------
 237 | If `a' is denormal and we are in flush-to-zero mode then set the
 238 | input-denormal exception and return zero. Otherwise just return the value.
 239 *----------------------------------------------------------------------------*/
 240 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 241 {
 242     if (STATUS(flush_inputs_to_zero)) {
 243         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 244             float_raise(float_flag_input_denormal STATUS_VAR);
 245             return make_float32(float32_val(a) & 0x80000000);
 246         }
 247     }
 248     return a;
 249 }
 250
 251 /*----------------------------------------------------------------------------
 252 | Normalizes the subnormal single-precision floating-point value represented
 253 | by the denormalized significand `aSig'.  The normalized exponent and
 254 | significand are stored at the locations pointed to by `zExpPtr' and
 255 | `zSigPtr', respectively.
 256 *----------------------------------------------------------------------------*/
 257
 258 static void
 259  normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
 260 {
 261     int8 shiftCount;
 262
 263     shiftCount = countLeadingZeros32( aSig ) - 8;
 264     *zSigPtr = aSig<<shiftCount;
 265     *zExpPtr = 1 - shiftCount;
 266
 267 }
 268
 269 /*----------------------------------------------------------------------------
 270 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 271 | single-precision floating-point value, returning the result.  After being
 272 | shifted into the proper positions, the three fields are simply added
 273 | together to form the result.  This means that any integer portion of `zSig'
 274 | will be added into the exponent.  Since a properly normalized significand
 275 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 276 | than the desired result exponent whenever `zSig' is a complete, normalized
 277 | significand.
 278 *----------------------------------------------------------------------------*/
 279
 280 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
 281 {
 282
 283     return make_float32(
 284           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 285
 286 }
 287
 288 /*----------------------------------------------------------------------------
 289 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 290 | and significand `zSig', and returns the proper single-precision floating-
 291 | point value corresponding to the abstract input.  Ordinarily, the abstract
 292 | value is simply rounded and packed into the single-precision format, with
 293 | the inexact exception raised if the abstract input cannot be represented
 294 | exactly.  However, if the abstract value is too large, the overflow and
 295 | inexact exceptions are raised and an infinity or maximal finite value is
 296 | returned.  If the abstract value is too small, the input value is rounded to
 297 | a subnormal number, and the underflow and inexact exceptions are raised if
 298 | the abstract input cannot be represented exactly as a subnormal single-
 299 | precision floating-point number.
 300 |     The input significand `zSig' has its binary point between bits 30
 301 | and 29, which is 7 bits to the left of the usual location.  This shifted
 302 | significand must be normalized or smaller.  If `zSig' is not normalized,
 303 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 304 | and it must not require rounding.  In the usual case that `zSig' is
 305 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 306 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 307 | Binary Floating-Point Arithmetic.
 308 *----------------------------------------------------------------------------*/
 309
 310 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
 311 {
 312     int8 roundingMode;
 313     flag roundNearestEven;
 314     int8 roundIncrement, roundBits;
 315     flag isTiny;
 316
 317     roundingMode = STATUS(float_rounding_mode);
 318     roundNearestEven = ( roundingMode == float_round_nearest_even );
 319     roundIncrement = 0x40;
 320     if ( ! roundNearestEven ) {
 321         if ( roundingMode == float_round_to_zero ) {
 322             roundIncrement = 0;
 323         }
 324         else {
 325             roundIncrement = 0x7F;
 326             if ( zSign ) {
 327                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 328             }
 329             else {
 330                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 331             }
 332         }
 333     }
 334     roundBits = zSig & 0x7F;
 335     if ( 0xFD <= (uint16_t) zExp ) {
 336         if (    ( 0xFD < zExp )
 337              || (    ( zExp == 0xFD )
 338                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 339            ) {
 340             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 341             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 342         }
 343         if ( zExp < 0 ) {
 344             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 345             isTiny =
 346                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 347                 || ( zExp < -1 )
 348                 || ( zSig + roundIncrement < 0x80000000 );
 349             shift32RightJamming( zSig, - zExp, &zSig );
 350             zExp = 0;
 351             roundBits = zSig & 0x7F;
 352             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 353         }
 354     }
 355     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 356     zSig = ( zSig + roundIncrement )>>7;
 357     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 358     if ( zSig == 0 ) zExp = 0;
 359     return packFloat32( zSign, zExp, zSig );
 360
 361 }
 362
 363 /*----------------------------------------------------------------------------
 364 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 365 | and significand `zSig', and returns the proper single-precision floating-
 366 | point value corresponding to the abstract input.  This routine is just like
 367 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 368 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 369 | floating-point exponent.
 370 *----------------------------------------------------------------------------*/
 371
 372 static float32
 373  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
 374 {
 375     int8 shiftCount;
 376
 377     shiftCount = countLeadingZeros32( zSig ) - 1;
 378     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 379
 380 }
 381
 382 /*----------------------------------------------------------------------------
 383 | Returns the fraction bits of the double-precision floating-point value `a'.
 384 *----------------------------------------------------------------------------*/
 385
 386 INLINE uint64_t extractFloat64Frac( float64 a )
 387 {
 388
 389     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 390
 391 }
 392
 393 /*----------------------------------------------------------------------------
 394 | Returns the exponent bits of the double-precision floating-point value `a'.
 395 *----------------------------------------------------------------------------*/
 396
 397 INLINE int16 extractFloat64Exp( float64 a )
 398 {
 399
 400     return ( float64_val(a)>>52 ) & 0x7FF;
 401
 402 }
 403
 404 /*----------------------------------------------------------------------------
 405 | Returns the sign bit of the double-precision floating-point value `a'.
 406 *----------------------------------------------------------------------------*/
 407
 408 INLINE flag extractFloat64Sign( float64 a )
 409 {
 410
 411     return float64_val(a)>>63;
 412
 413 }
 414
 415 /*----------------------------------------------------------------------------
 416 | If `a' is denormal and we are in flush-to-zero mode then set the
 417 | input-denormal exception and return zero. Otherwise just return the value.
 418 *----------------------------------------------------------------------------*/
 419 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 420 {
 421     if (STATUS(flush_inputs_to_zero)) {
 422         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 423             float_raise(float_flag_input_denormal STATUS_VAR);
 424             return make_float64(float64_val(a) & (1ULL << 63));
 425         }
 426     }
 427     return a;
 428 }
 429
 430 /*----------------------------------------------------------------------------
 431 | Normalizes the subnormal double-precision floating-point value represented
 432 | by the denormalized significand `aSig'.  The normalized exponent and
 433 | significand are stored at the locations pointed to by `zExpPtr' and
 434 | `zSigPtr', respectively.
 435 *----------------------------------------------------------------------------*/
 436
 437 static void
 438  normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
 439 {
 440     int8 shiftCount;
 441
 442     shiftCount = countLeadingZeros64( aSig ) - 11;
 443     *zSigPtr = aSig<<shiftCount;
 444     *zExpPtr = 1 - shiftCount;
 445
 446 }
 447
 448 /*----------------------------------------------------------------------------
 449 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 450 | double-precision floating-point value, returning the result.  After being
 451 | shifted into the proper positions, the three fields are simply added
 452 | together to form the result.  This means that any integer portion of `zSig'
 453 | will be added into the exponent.  Since a properly normalized significand
 454 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 455 | than the desired result exponent whenever `zSig' is a complete, normalized
 456 | significand.
 457 *----------------------------------------------------------------------------*/
 458
 459 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
 460 {
 461
 462     return make_float64(
 463         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 464
 465 }
 466
 467 /*----------------------------------------------------------------------------
 468 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 469 | and significand `zSig', and returns the proper double-precision floating-
 470 | point value corresponding to the abstract input.  Ordinarily, the abstract
 471 | value is simply rounded and packed into the double-precision format, with
 472 | the inexact exception raised if the abstract input cannot be represented
 473 | exactly.  However, if the abstract value is too large, the overflow and
 474 | inexact exceptions are raised and an infinity or maximal finite value is
 475 | returned.  If the abstract value is too small, the input value is rounded
 476 | to a subnormal number, and the underflow and inexact exceptions are raised
 477 | if the abstract input cannot be represented exactly as a subnormal double-
 478 | precision floating-point number.
 479 |     The input significand `zSig' has its binary point between bits 62
 480 | and 61, which is 10 bits to the left of the usual location.  This shifted
 481 | significand must be normalized or smaller.  If `zSig' is not normalized,
 482 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 483 | and it must not require rounding.  In the usual case that `zSig' is
 484 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 485 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 486 | Binary Floating-Point Arithmetic.
 487 *----------------------------------------------------------------------------*/
 488
 489 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
 490 {
 491     int8 roundingMode;
 492     flag roundNearestEven;
 493     int16 roundIncrement, roundBits;
 494     flag isTiny;
 495
 496     roundingMode = STATUS(float_rounding_mode);
 497     roundNearestEven = ( roundingMode == float_round_nearest_even );
 498     roundIncrement = 0x200;
 499     if ( ! roundNearestEven ) {
 500         if ( roundingMode == float_round_to_zero ) {
 501             roundIncrement = 0;
 502         }
 503         else {
 504             roundIncrement = 0x3FF;
 505             if ( zSign ) {
 506                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 507             }
 508             else {
 509                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 510             }
 511         }
 512     }
 513     roundBits = zSig & 0x3FF;
 514     if ( 0x7FD <= (uint16_t) zExp ) {
 515         if (    ( 0x7FD < zExp )
 516              || (    ( zExp == 0x7FD )
 517                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 518            ) {
 519             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 520             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 521         }
 522         if ( zExp < 0 ) {
 523             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 524             isTiny =
 525                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 526                 || ( zExp < -1 )
 527                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 528             shift64RightJamming( zSig, - zExp, &zSig );
 529             zExp = 0;
 530             roundBits = zSig & 0x3FF;
 531             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 532         }
 533     }
 534     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 535     zSig = ( zSig + roundIncrement )>>10;
 536     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 537     if ( zSig == 0 ) zExp = 0;
 538     return packFloat64( zSign, zExp, zSig );
 539
 540 }
 541
 542 /*----------------------------------------------------------------------------
 543 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 544 | and significand `zSig', and returns the proper double-precision floating-
 545 | point value corresponding to the abstract input.  This routine is just like
 546 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 547 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 548 | floating-point exponent.
 549 *----------------------------------------------------------------------------*/
 550
 551 static float64
 552  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
 553 {
 554     int8 shiftCount;
 555
 556     shiftCount = countLeadingZeros64( zSig ) - 1;
 557     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 558
 559 }
 560
 561 #ifdef FLOATX80
 562
 563 /*----------------------------------------------------------------------------
 564 | Returns the fraction bits of the extended double-precision floating-point
 565 | value `a'.
 566 *----------------------------------------------------------------------------*/
 567
 568 INLINE uint64_t extractFloatx80Frac( floatx80 a )
 569 {
 570
 571     return a.low;
 572
 573 }
 574
 575 /*----------------------------------------------------------------------------
 576 | Returns the exponent bits of the extended double-precision floating-point
 577 | value `a'.
 578 *----------------------------------------------------------------------------*/
 579
 580 INLINE int32 extractFloatx80Exp( floatx80 a )
 581 {
 582
 583     return a.high & 0x7FFF;
 584
 585 }
 586
 587 /*----------------------------------------------------------------------------
 588 | Returns the sign bit of the extended double-precision floating-point value
 589 | `a'.
 590 *----------------------------------------------------------------------------*/
 591
 592 INLINE flag extractFloatx80Sign( floatx80 a )
 593 {
 594
 595     return a.high>>15;
 596
 597 }
 598
 599 /*----------------------------------------------------------------------------
 600 | Normalizes the subnormal extended double-precision floating-point value
 601 | represented by the denormalized significand `aSig'.  The normalized exponent
 602 | and significand are stored at the locations pointed to by `zExpPtr' and
 603 | `zSigPtr', respectively.
 604 *----------------------------------------------------------------------------*/
 605
 606 static void
 607  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 608 {
 609     int8 shiftCount;
 610
 611     shiftCount = countLeadingZeros64( aSig );
 612     *zSigPtr = aSig<<shiftCount;
 613     *zExpPtr = 1 - shiftCount;
 614
 615 }
 616
 617 /*----------------------------------------------------------------------------
 618 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 619 | extended double-precision floating-point value, returning the result.
 620 *----------------------------------------------------------------------------*/
 621
 622 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 623 {
 624     floatx80 z;
 625
 626     z.low = zSig;
 627     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 628     return z;
 629
 630 }
 631
 632 /*----------------------------------------------------------------------------
 633 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 634 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 635 | and returns the proper extended double-precision floating-point value
 636 | corresponding to the abstract input.  Ordinarily, the abstract value is
 637 | rounded and packed into the extended double-precision format, with the
 638 | inexact exception raised if the abstract input cannot be represented
 639 | exactly.  However, if the abstract value is too large, the overflow and
 640 | inexact exceptions are raised and an infinity or maximal finite value is
 641 | returned.  If the abstract value is too small, the input value is rounded to
 642 | a subnormal number, and the underflow and inexact exceptions are raised if
 643 | the abstract input cannot be represented exactly as a subnormal extended
 644 | double-precision floating-point number.
 645 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 646 | number of bits as single or double precision, respectively.  Otherwise, the
 647 | result is rounded to the full precision of the extended double-precision
 648 | format.
 649 |     The input significand must be normalized or smaller.  If the input
 650 | significand is not normalized, `zExp' must be 0; in that case, the result
 651 | returned is a subnormal number, and it must not require rounding.  The
 652 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 653 | Floating-Point Arithmetic.
 654 *----------------------------------------------------------------------------*/
 655
 656 static floatx80
 657  roundAndPackFloatx80(
 658      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 659  STATUS_PARAM)
 660 {
 661     int8 roundingMode;
 662     flag roundNearestEven, increment, isTiny;
 663     int64 roundIncrement, roundMask, roundBits;
 664
 665     roundingMode = STATUS(float_rounding_mode);
 666     roundNearestEven = ( roundingMode == float_round_nearest_even );
 667     if ( roundingPrecision == 80 ) goto precision80;
 668     if ( roundingPrecision == 64 ) {
 669         roundIncrement = LIT64( 0x0000000000000400 );
 670         roundMask = LIT64( 0x00000000000007FF );
 671     }
 672     else if ( roundingPrecision == 32 ) {
 673         roundIncrement = LIT64( 0x0000008000000000 );
 674         roundMask = LIT64( 0x000000FFFFFFFFFF );
 675     }
 676     else {
 677         goto precision80;
 678     }
 679     zSig0 |= ( zSig1 != 0 );
 680     if ( ! roundNearestEven ) {
 681         if ( roundingMode == float_round_to_zero ) {
 682             roundIncrement = 0;
 683         }
 684         else {
 685             roundIncrement = roundMask;
 686             if ( zSign ) {
 687                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 688             }
 689             else {
 690                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 691             }
 692         }
 693     }
 694     roundBits = zSig0 & roundMask;
 695     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 696         if (    ( 0x7FFE < zExp )
 697              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 698            ) {
 699             goto overflow;
 700         }
 701         if ( zExp <= 0 ) {
 702             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
 703             isTiny =
 704                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 705                 || ( zExp < 0 )
 706                 || ( zSig0 <= zSig0 + roundIncrement );
 707             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 708             zExp = 0;
 709             roundBits = zSig0 & roundMask;
 710             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 711             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 712             zSig0 += roundIncrement;
 713             if ( (int64_t) zSig0 < 0 ) zExp = 1;
 714             roundIncrement = roundMask + 1;
 715             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 716                 roundMask |= roundIncrement;
 717             }
 718             zSig0 &= ~ roundMask;
 719             return packFloatx80( zSign, zExp, zSig0 );
 720         }
 721     }
 722     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 723     zSig0 += roundIncrement;
 724     if ( zSig0 < roundIncrement ) {
 725         ++zExp;
 726         zSig0 = LIT64( 0x8000000000000000 );
 727     }
 728     roundIncrement = roundMask + 1;
 729     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 730         roundMask |= roundIncrement;
 731     }
 732     zSig0 &= ~ roundMask;
 733     if ( zSig0 == 0 ) zExp = 0;
 734     return packFloatx80( zSign, zExp, zSig0 );
 735  precision80:
 736     increment = ( (int64_t) zSig1 < 0 );
 737     if ( ! roundNearestEven ) {
 738         if ( roundingMode == float_round_to_zero ) {
 739             increment = 0;
 740         }
 741         else {
 742             if ( zSign ) {
 743                 increment = ( roundingMode == float_round_down ) && zSig1;
 744             }
 745             else {
 746                 increment = ( roundingMode == float_round_up ) && zSig1;
 747             }
 748         }
 749     }
 750     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 751         if (    ( 0x7FFE < zExp )
 752              || (    ( zExp == 0x7FFE )
 753                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 754                   && increment
 755                 )
 756            ) {
 757             roundMask = 0;
 758  overflow:
 759             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 760             if (    ( roundingMode == float_round_to_zero )
 761                  || ( zSign && ( roundingMode == float_round_up ) )
 762                  || ( ! zSign && ( roundingMode == float_round_down ) )
 763                ) {
 764                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 765             }
 766             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 767         }
 768         if ( zExp <= 0 ) {
 769             isTiny =
 770                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 771                 || ( zExp < 0 )
 772                 || ! increment
 773                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 774             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 775             zExp = 0;
 776             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 777             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 778             if ( roundNearestEven ) {
 779                 increment = ( (int64_t) zSig1 < 0 );
 780             }
 781             else {
 782                 if ( zSign ) {
 783                     increment = ( roundingMode == float_round_down ) && zSig1;
 784                 }
 785                 else {
 786                     increment = ( roundingMode == float_round_up ) && zSig1;
 787                 }
 788             }
 789             if ( increment ) {
 790                 ++zSig0;
 791                 zSig0 &=
 792                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 793                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
 794             }
 795             return packFloatx80( zSign, zExp, zSig0 );
 796         }
 797     }
 798     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 799     if ( increment ) {
 800         ++zSig0;
 801         if ( zSig0 == 0 ) {
 802             ++zExp;
 803             zSig0 = LIT64( 0x8000000000000000 );
 804         }
 805         else {
 806             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 807         }
 808     }
 809     else {
 810         if ( zSig0 == 0 ) zExp = 0;
 811     }
 812     return packFloatx80( zSign, zExp, zSig0 );
 813
 814 }
 815
 816 /*----------------------------------------------------------------------------
 817 | Takes an abstract floating-point value having sign `zSign', exponent
 818 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 819 | and returns the proper extended double-precision floating-point value
 820 | corresponding to the abstract input.  This routine is just like
 821 | `roundAndPackFloatx80' except that the input significand does not have to be
 822 | normalized.
 823 *----------------------------------------------------------------------------*/
 824
 825 static floatx80
 826  normalizeRoundAndPackFloatx80(
 827      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 828  STATUS_PARAM)
 829 {
 830     int8 shiftCount;
 831
 832     if ( zSig0 == 0 ) {
 833         zSig0 = zSig1;
 834         zSig1 = 0;
 835         zExp -= 64;
 836     }
 837     shiftCount = countLeadingZeros64( zSig0 );
 838     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 839     zExp -= shiftCount;
 840     return
 841         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 842
 843 }
 844
 845 #endif
 846
 847 #ifdef FLOAT128
 848
 849 /*----------------------------------------------------------------------------
 850 | Returns the least-significant 64 fraction bits of the quadruple-precision
 851 | floating-point value `a'.
 852 *----------------------------------------------------------------------------*/
 853
 854 INLINE uint64_t extractFloat128Frac1( float128 a )
 855 {
 856
 857     return a.low;
 858
 859 }
 860
 861 /*----------------------------------------------------------------------------
 862 | Returns the most-significant 48 fraction bits of the quadruple-precision
 863 | floating-point value `a'.
 864 *----------------------------------------------------------------------------*/
 865
 866 INLINE uint64_t extractFloat128Frac0( float128 a )
 867 {
 868
 869     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 870
 871 }
 872
 873 /*----------------------------------------------------------------------------
 874 | Returns the exponent bits of the quadruple-precision floating-point value
 875 | `a'.
 876 *----------------------------------------------------------------------------*/
 877
 878 INLINE int32 extractFloat128Exp( float128 a )
 879 {
 880
 881     return ( a.high>>48 ) & 0x7FFF;
 882
 883 }
 884
 885 /*----------------------------------------------------------------------------
 886 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 887 *----------------------------------------------------------------------------*/
 888
 889 INLINE flag extractFloat128Sign( float128 a )
 890 {
 891
 892     return a.high>>63;
 893
 894 }
 895
 896 /*----------------------------------------------------------------------------
 897 | Normalizes the subnormal quadruple-precision floating-point value
 898 | represented by the denormalized significand formed by the concatenation of
 899 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 900 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 901 | significand are stored at the location pointed to by `zSig0Ptr', and the
 902 | least significant 64 bits of the normalized significand are stored at the
 903 | location pointed to by `zSig1Ptr'.
 904 *----------------------------------------------------------------------------*/
 905
 906 static void
 907  normalizeFloat128Subnormal(
 908      uint64_t aSig0,
 909      uint64_t aSig1,
 910      int32 *zExpPtr,
 911      uint64_t *zSig0Ptr,
 912      uint64_t *zSig1Ptr
 913  )
 914 {
 915     int8 shiftCount;
 916
 917     if ( aSig0 == 0 ) {
 918         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 919         if ( shiftCount < 0 ) {
 920             *zSig0Ptr = aSig1>>( - shiftCount );
 921             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 922         }
 923         else {
 924             *zSig0Ptr = aSig1<<shiftCount;
 925             *zSig1Ptr = 0;
 926         }
 927         *zExpPtr = - shiftCount - 63;
 928     }
 929     else {
 930         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 931         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 932         *zExpPtr = 1 - shiftCount;
 933     }
 934
 935 }
 936
 937 /*----------------------------------------------------------------------------
 938 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 939 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 940 | floating-point value, returning the result.  After being shifted into the
 941 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 942 | added together to form the most significant 32 bits of the result.  This
 943 | means that any integer portion of `zSig0' will be added into the exponent.
 944 | Since a properly normalized significand will have an integer portion equal
 945 | to 1, the `zExp' input should be 1 less than the desired result exponent
 946 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 947 | significand.
 948 *----------------------------------------------------------------------------*/
 949
 950 INLINE float128
 951  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
 952 {
 953     float128 z;
 954
 955     z.low = zSig1;
 956     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
 957     return z;
 958
 959 }
 960
 961 /*----------------------------------------------------------------------------
 962 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 963 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 964 | and `zSig2', and returns the proper quadruple-precision floating-point value
 965 | corresponding to the abstract input.  Ordinarily, the abstract value is
 966 | simply rounded and packed into the quadruple-precision format, with the
 967 | inexact exception raised if the abstract input cannot be represented
 968 | exactly.  However, if the abstract value is too large, the overflow and
 969 | inexact exceptions are raised and an infinity or maximal finite value is
 970 | returned.  If the abstract value is too small, the input value is rounded to
 971 | a subnormal number, and the underflow and inexact exceptions are raised if
 972 | the abstract input cannot be represented exactly as a subnormal quadruple-
 973 | precision floating-point number.
 974 |     The input significand must be normalized or smaller.  If the input
 975 | significand is not normalized, `zExp' must be 0; in that case, the result
 976 | returned is a subnormal number, and it must not require rounding.  In the
 977 | usual case that the input significand is normalized, `zExp' must be 1 less
 978 | than the ``true'' floating-point exponent.  The handling of underflow and
 979 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 980 *----------------------------------------------------------------------------*/
 981
 982 static float128
 983  roundAndPackFloat128(
 984      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
 985 {
 986     int8 roundingMode;
 987     flag roundNearestEven, increment, isTiny;
 988
 989     roundingMode = STATUS(float_rounding_mode);
 990     roundNearestEven = ( roundingMode == float_round_nearest_even );
 991     increment = ( (int64_t) zSig2 < 0 );
 992     if ( ! roundNearestEven ) {
 993         if ( roundingMode == float_round_to_zero ) {
 994             increment = 0;
 995         }
 996         else {
 997             if ( zSign ) {
 998                 increment = ( roundingMode == float_round_down ) && zSig2;
 999             }
1000             else {
1001                 increment = ( roundingMode == float_round_up ) && zSig2;
1002             }
1003         }
1004     }
1005     if ( 0x7FFD <= (uint32_t) zExp ) {
1006         if (    ( 0x7FFD < zExp )
1007              || (    ( zExp == 0x7FFD )
1008                   && eq128(
1009                          LIT64( 0x0001FFFFFFFFFFFF ),
1010                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1011                          zSig0,
1012                          zSig1
1013                      )
1014                   && increment
1015                 )
1016            ) {
1017             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1018             if (    ( roundingMode == float_round_to_zero )
1019                  || ( zSign && ( roundingMode == float_round_up ) )
1020                  || ( ! zSign && ( roundingMode == float_round_down ) )
1021                ) {
1022                 return
1023                     packFloat128(
1024                         zSign,
1025                         0x7FFE,
1026                         LIT64( 0x0000FFFFFFFFFFFF ),
1027                         LIT64( 0xFFFFFFFFFFFFFFFF )
1028                     );
1029             }
1030             return packFloat128( zSign, 0x7FFF, 0, 0 );
1031         }
1032         if ( zExp < 0 ) {
1033             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1034             isTiny =
1035                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1036                 || ( zExp < -1 )
1037                 || ! increment
1038                 || lt128(
1039                        zSig0,
1040                        zSig1,
1041                        LIT64( 0x0001FFFFFFFFFFFF ),
1042                        LIT64( 0xFFFFFFFFFFFFFFFF )
1043                    );
1044             shift128ExtraRightJamming(
1045                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1046             zExp = 0;
1047             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1048             if ( roundNearestEven ) {
1049                 increment = ( (int64_t) zSig2 < 0 );
1050             }
1051             else {
1052                 if ( zSign ) {
1053                     increment = ( roundingMode == float_round_down ) && zSig2;
1054                 }
1055                 else {
1056                     increment = ( roundingMode == float_round_up ) && zSig2;
1057                 }
1058             }
1059         }
1060     }
1061     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1062     if ( increment ) {
1063         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1064         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1065     }
1066     else {
1067         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1068     }
1069     return packFloat128( zSign, zExp, zSig0, zSig1 );
1070
1071 }
1072
1073 /*----------------------------------------------------------------------------
1074 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1076 | returns the proper quadruple-precision floating-point value corresponding
1077 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1078 | except that the input significand has fewer bits and does not have to be
1079 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1080 | point exponent.
1081 *----------------------------------------------------------------------------*/
1082
1083 static float128
1084  normalizeRoundAndPackFloat128(
1085      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1086 {
1087     int8 shiftCount;
1088     uint64_t zSig2;
1089
1090     if ( zSig0 == 0 ) {
1091         zSig0 = zSig1;
1092         zSig1 = 0;
1093         zExp -= 64;
1094     }
1095     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096     if ( 0 <= shiftCount ) {
1097         zSig2 = 0;
1098         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1099     }
1100     else {
1101         shift128ExtraRightJamming(
1102             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1103     }
1104     zExp -= shiftCount;
1105     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1106
1107 }
1108
1109 #endif
1110
1111 /*----------------------------------------------------------------------------
1112 | Returns the result of converting the 32-bit two's complement integer `a'
1113 | to the single-precision floating-point format.  The conversion is performed
1114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 *----------------------------------------------------------------------------*/
1116
1117 float32 int32_to_float32( int32 a STATUS_PARAM )
1118 {
1119     flag zSign;
1120
1121     if ( a == 0 ) return float32_zero;
1122     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1123     zSign = ( a < 0 );
1124     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1125
1126 }
1127
1128 /*----------------------------------------------------------------------------
1129 | Returns the result of converting the 32-bit two's complement integer `a'
1130 | to the double-precision floating-point format.  The conversion is performed
1131 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1132 *----------------------------------------------------------------------------*/
1133
1134 float64 int32_to_float64( int32 a STATUS_PARAM )
1135 {
1136     flag zSign;
1137     uint32 absA;
1138     int8 shiftCount;
1139     uint64_t zSig;
1140
1141     if ( a == 0 ) return float64_zero;
1142     zSign = ( a < 0 );
1143     absA = zSign ? - a : a;
1144     shiftCount = countLeadingZeros32( absA ) + 21;
1145     zSig = absA;
1146     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1147
1148 }
1149
1150 #ifdef FLOATX80
1151
1152 /*----------------------------------------------------------------------------
1153 | Returns the result of converting the 32-bit two's complement integer `a'
1154 | to the extended double-precision floating-point format.  The conversion
1155 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156 | Arithmetic.
1157 *----------------------------------------------------------------------------*/
1158
1159 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1160 {
1161     flag zSign;
1162     uint32 absA;
1163     int8 shiftCount;
1164     uint64_t zSig;
1165
1166     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167     zSign = ( a < 0 );
1168     absA = zSign ? - a : a;
1169     shiftCount = countLeadingZeros32( absA ) + 32;
1170     zSig = absA;
1171     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1172
1173 }
1174
1175 #endif
1176
1177 #ifdef FLOAT128
1178
1179 /*----------------------------------------------------------------------------
1180 | Returns the result of converting the 32-bit two's complement integer `a' to
1181 | the quadruple-precision floating-point format.  The conversion is performed
1182 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1183 *----------------------------------------------------------------------------*/
1184
1185 float128 int32_to_float128( int32 a STATUS_PARAM )
1186 {
1187     flag zSign;
1188     uint32 absA;
1189     int8 shiftCount;
1190     uint64_t zSig0;
1191
1192     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1193     zSign = ( a < 0 );
1194     absA = zSign ? - a : a;
1195     shiftCount = countLeadingZeros32( absA ) + 17;
1196     zSig0 = absA;
1197     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1198
1199 }
1200
1201 #endif
1202
1203 /*----------------------------------------------------------------------------
1204 | Returns the result of converting the 64-bit two's complement integer `a'
1205 | to the single-precision floating-point format.  The conversion is performed
1206 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1207 *----------------------------------------------------------------------------*/
1208
1209 float32 int64_to_float32( int64 a STATUS_PARAM )
1210 {
1211     flag zSign;
1212     uint64 absA;
1213     int8 shiftCount;
1214
1215     if ( a == 0 ) return float32_zero;
1216     zSign = ( a < 0 );
1217     absA = zSign ? - a : a;
1218     shiftCount = countLeadingZeros64( absA ) - 40;
1219     if ( 0 <= shiftCount ) {
1220         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1221     }
1222     else {
1223         shiftCount += 7;
1224         if ( shiftCount < 0 ) {
1225             shift64RightJamming( absA, - shiftCount, &absA );
1226         }
1227         else {
1228             absA <<= shiftCount;
1229         }
1230         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1231     }
1232
1233 }
1234
1235 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1236 {
1237     int8 shiftCount;
1238
1239     if ( a == 0 ) return float32_zero;
1240     shiftCount = countLeadingZeros64( a ) - 40;
1241     if ( 0 <= shiftCount ) {
1242         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1243     }
1244     else {
1245         shiftCount += 7;
1246         if ( shiftCount < 0 ) {
1247             shift64RightJamming( a, - shiftCount, &a );
1248         }
1249         else {
1250             a <<= shiftCount;
1251         }
1252         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1253     }
1254 }
1255
1256 /*----------------------------------------------------------------------------
1257 | Returns the result of converting the 64-bit two's complement integer `a'
1258 | to the double-precision floating-point format.  The conversion is performed
1259 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1260 *----------------------------------------------------------------------------*/
1261
1262 float64 int64_to_float64( int64 a STATUS_PARAM )
1263 {
1264     flag zSign;
1265
1266     if ( a == 0 ) return float64_zero;
1267     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1268         return packFloat64( 1, 0x43E, 0 );
1269     }
1270     zSign = ( a < 0 );
1271     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1272
1273 }
1274
1275 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1276 {
1277     if ( a == 0 ) return float64_zero;
1278     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1279
1280 }
1281
1282 #ifdef FLOATX80
1283
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the 64-bit two's complement integer `a'
1286 | to the extended double-precision floating-point format.  The conversion
1287 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic.
1289 *----------------------------------------------------------------------------*/
1290
1291 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1292 {
1293     flag zSign;
1294     uint64 absA;
1295     int8 shiftCount;
1296
1297     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1298     zSign = ( a < 0 );
1299     absA = zSign ? - a : a;
1300     shiftCount = countLeadingZeros64( absA );
1301     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1302
1303 }
1304
1305 #endif
1306
1307 #ifdef FLOAT128
1308
1309 /*----------------------------------------------------------------------------
1310 | Returns the result of converting the 64-bit two's complement integer `a' to
1311 | the quadruple-precision floating-point format.  The conversion is performed
1312 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313 *----------------------------------------------------------------------------*/
1314
1315 float128 int64_to_float128( int64 a STATUS_PARAM )
1316 {
1317     flag zSign;
1318     uint64 absA;
1319     int8 shiftCount;
1320     int32 zExp;
1321     uint64_t zSig0, zSig1;
1322
1323     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324     zSign = ( a < 0 );
1325     absA = zSign ? - a : a;
1326     shiftCount = countLeadingZeros64( absA ) + 49;
1327     zExp = 0x406E - shiftCount;
1328     if ( 64 <= shiftCount ) {
1329         zSig1 = 0;
1330         zSig0 = absA;
1331         shiftCount -= 64;
1332     }
1333     else {
1334         zSig1 = absA;
1335         zSig0 = 0;
1336     }
1337     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338     return packFloat128( zSign, zExp, zSig0, zSig1 );
1339
1340 }
1341
1342 #endif
1343
1344 /*----------------------------------------------------------------------------
1345 | Returns the result of converting the single-precision floating-point value
1346 | `a' to the 32-bit two's complement integer format.  The conversion is
1347 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1348 | Arithmetic---which means in particular that the conversion is rounded
1349 | according to the current rounding mode.  If `a' is a NaN, the largest
1350 | positive integer is returned.  Otherwise, if the conversion overflows, the
1351 | largest integer with the same sign as `a' is returned.
1352 *----------------------------------------------------------------------------*/
1353
1354 int32 float32_to_int32( float32 a STATUS_PARAM )
1355 {
1356     flag aSign;
1357     int16 aExp, shiftCount;
1358     uint32_t aSig;
1359     uint64_t aSig64;
1360
1361     a = float32_squash_input_denormal(a STATUS_VAR);
1362     aSig = extractFloat32Frac( a );
1363     aExp = extractFloat32Exp( a );
1364     aSign = extractFloat32Sign( a );
1365     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1366     if ( aExp ) aSig |= 0x00800000;
1367     shiftCount = 0xAF - aExp;
1368     aSig64 = aSig;
1369     aSig64 <<= 32;
1370     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1371     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1372
1373 }
1374
1375 /*----------------------------------------------------------------------------
1376 | Returns the result of converting the single-precision floating-point value
1377 | `a' to the 32-bit two's complement integer format.  The conversion is
1378 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1379 | Arithmetic, except that the conversion is always rounded toward zero.
1380 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1381 | the conversion overflows, the largest integer with the same sign as `a' is
1382 | returned.
1383 *----------------------------------------------------------------------------*/
1384
1385 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1386 {
1387     flag aSign;
1388     int16 aExp, shiftCount;
1389     uint32_t aSig;
1390     int32 z;
1391     a = float32_squash_input_denormal(a STATUS_VAR);
1392
1393     aSig = extractFloat32Frac( a );
1394     aExp = extractFloat32Exp( a );
1395     aSign = extractFloat32Sign( a );
1396     shiftCount = aExp - 0x9E;
1397     if ( 0 <= shiftCount ) {
1398         if ( float32_val(a) != 0xCF000000 ) {
1399             float_raise( float_flag_invalid STATUS_VAR);
1400             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1401         }
1402         return (int32_t) 0x80000000;
1403     }
1404     else if ( aExp <= 0x7E ) {
1405         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1406         return 0;
1407     }
1408     aSig = ( aSig | 0x00800000 )<<8;
1409     z = aSig>>( - shiftCount );
1410     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1411         STATUS(float_exception_flags) |= float_flag_inexact;
1412     }
1413     if ( aSign ) z = - z;
1414     return z;
1415
1416 }
1417
1418 /*----------------------------------------------------------------------------
1419 | Returns the result of converting the single-precision floating-point value
1420 | `a' to the 16-bit two's complement integer format.  The conversion is
1421 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1422 | Arithmetic, except that the conversion is always rounded toward zero.
1423 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1424 | the conversion overflows, the largest integer with the same sign as `a' is
1425 | returned.
1426 *----------------------------------------------------------------------------*/
1427
1428 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1429 {
1430     flag aSign;
1431     int16 aExp, shiftCount;
1432     uint32_t aSig;
1433     int32 z;
1434
1435     aSig = extractFloat32Frac( a );
1436     aExp = extractFloat32Exp( a );
1437     aSign = extractFloat32Sign( a );
1438     shiftCount = aExp - 0x8E;
1439     if ( 0 <= shiftCount ) {
1440         if ( float32_val(a) != 0xC7000000 ) {
1441             float_raise( float_flag_invalid STATUS_VAR);
1442             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1443                 return 0x7FFF;
1444             }
1445         }
1446         return (int32_t) 0xffff8000;
1447     }
1448     else if ( aExp <= 0x7E ) {
1449         if ( aExp | aSig ) {
1450             STATUS(float_exception_flags) |= float_flag_inexact;
1451         }
1452         return 0;
1453     }
1454     shiftCount -= 0x10;
1455     aSig = ( aSig | 0x00800000 )<<8;
1456     z = aSig>>( - shiftCount );
1457     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1458         STATUS(float_exception_flags) |= float_flag_inexact;
1459     }
1460     if ( aSign ) {
1461         z = - z;
1462     }
1463     return z;
1464
1465 }
1466
1467 /*----------------------------------------------------------------------------
1468 | Returns the result of converting the single-precision floating-point value
1469 | `a' to the 64-bit two's complement integer format.  The conversion is
1470 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1471 | Arithmetic---which means in particular that the conversion is rounded
1472 | according to the current rounding mode.  If `a' is a NaN, the largest
1473 | positive integer is returned.  Otherwise, if the conversion overflows, the
1474 | largest integer with the same sign as `a' is returned.
1475 *----------------------------------------------------------------------------*/
1476
1477 int64 float32_to_int64( float32 a STATUS_PARAM )
1478 {
1479     flag aSign;
1480     int16 aExp, shiftCount;
1481     uint32_t aSig;
1482     uint64_t aSig64, aSigExtra;
1483     a = float32_squash_input_denormal(a STATUS_VAR);
1484
1485     aSig = extractFloat32Frac( a );
1486     aExp = extractFloat32Exp( a );
1487     aSign = extractFloat32Sign( a );
1488     shiftCount = 0xBE - aExp;
1489     if ( shiftCount < 0 ) {
1490         float_raise( float_flag_invalid STATUS_VAR);
1491         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1492             return LIT64( 0x7FFFFFFFFFFFFFFF );
1493         }
1494         return (int64_t) LIT64( 0x8000000000000000 );
1495     }
1496     if ( aExp ) aSig |= 0x00800000;
1497     aSig64 = aSig;
1498     aSig64 <<= 40;
1499     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1500     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1501
1502 }
1503
1504 /*----------------------------------------------------------------------------
1505 | Returns the result of converting the single-precision floating-point value
1506 | `a' to the 64-bit two's complement integer format.  The conversion is
1507 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1508 | Arithmetic, except that the conversion is always rounded toward zero.  If
1509 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1510 | conversion overflows, the largest integer with the same sign as `a' is
1511 | returned.
1512 *----------------------------------------------------------------------------*/
1513
1514 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1515 {
1516     flag aSign;
1517     int16 aExp, shiftCount;
1518     uint32_t aSig;
1519     uint64_t aSig64;
1520     int64 z;
1521     a = float32_squash_input_denormal(a STATUS_VAR);
1522
1523     aSig = extractFloat32Frac( a );
1524     aExp = extractFloat32Exp( a );
1525     aSign = extractFloat32Sign( a );
1526     shiftCount = aExp - 0xBE;
1527     if ( 0 <= shiftCount ) {
1528         if ( float32_val(a) != 0xDF000000 ) {
1529             float_raise( float_flag_invalid STATUS_VAR);
1530             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1531                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1532             }
1533         }
1534         return (int64_t) LIT64( 0x8000000000000000 );
1535     }
1536     else if ( aExp <= 0x7E ) {
1537         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1538         return 0;
1539     }
1540     aSig64 = aSig | 0x00800000;
1541     aSig64 <<= 40;
1542     z = aSig64>>( - shiftCount );
1543     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1544         STATUS(float_exception_flags) |= float_flag_inexact;
1545     }
1546     if ( aSign ) z = - z;
1547     return z;
1548
1549 }
1550
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of converting the single-precision floating-point value
1553 | `a' to the double-precision floating-point format.  The conversion is
1554 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1555 | Arithmetic.
1556 *----------------------------------------------------------------------------*/
1557
1558 float64 float32_to_float64( float32 a STATUS_PARAM )
1559 {
1560     flag aSign;
1561     int16 aExp;
1562     uint32_t aSig;
1563     a = float32_squash_input_denormal(a STATUS_VAR);
1564
1565     aSig = extractFloat32Frac( a );
1566     aExp = extractFloat32Exp( a );
1567     aSign = extractFloat32Sign( a );
1568     if ( aExp == 0xFF ) {
1569         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1570         return packFloat64( aSign, 0x7FF, 0 );
1571     }
1572     if ( aExp == 0 ) {
1573         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1574         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1575         --aExp;
1576     }
1577     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1578
1579 }
1580
1581 #ifdef FLOATX80
1582
1583 /*----------------------------------------------------------------------------
1584 | Returns the result of converting the single-precision floating-point value
1585 | `a' to the extended double-precision floating-point format.  The conversion
1586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1587 | Arithmetic.
1588 *----------------------------------------------------------------------------*/
1589
1590 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1591 {
1592     flag aSign;
1593     int16 aExp;
1594     uint32_t aSig;
1595
1596     a = float32_squash_input_denormal(a STATUS_VAR);
1597     aSig = extractFloat32Frac( a );
1598     aExp = extractFloat32Exp( a );
1599     aSign = extractFloat32Sign( a );
1600     if ( aExp == 0xFF ) {
1601         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1602         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1603     }
1604     if ( aExp == 0 ) {
1605         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1606         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1607     }
1608     aSig |= 0x00800000;
1609     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1610
1611 }
1612
1613 #endif
1614
1615 #ifdef FLOAT128
1616
1617 /*----------------------------------------------------------------------------
1618 | Returns the result of converting the single-precision floating-point value
1619 | `a' to the double-precision floating-point format.  The conversion is
1620 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1621 | Arithmetic.
1622 *----------------------------------------------------------------------------*/
1623
1624 float128 float32_to_float128( float32 a STATUS_PARAM )
1625 {
1626     flag aSign;
1627     int16 aExp;
1628     uint32_t aSig;
1629
1630     a = float32_squash_input_denormal(a STATUS_VAR);
1631     aSig = extractFloat32Frac( a );
1632     aExp = extractFloat32Exp( a );
1633     aSign = extractFloat32Sign( a );
1634     if ( aExp == 0xFF ) {
1635         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1636         return packFloat128( aSign, 0x7FFF, 0, 0 );
1637     }
1638     if ( aExp == 0 ) {
1639         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641         --aExp;
1642     }
1643     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1644
1645 }
1646
1647 #endif
1648
1649 /*----------------------------------------------------------------------------
1650 | Rounds the single-precision floating-point value `a' to an integer, and
1651 | returns the result as a single-precision floating-point value.  The
1652 | operation is performed according to the IEC/IEEE Standard for Binary
1653 | Floating-Point Arithmetic.
1654 *----------------------------------------------------------------------------*/
1655
1656 float32 float32_round_to_int( float32 a STATUS_PARAM)
1657 {
1658     flag aSign;
1659     int16 aExp;
1660     uint32_t lastBitMask, roundBitsMask;
1661     int8 roundingMode;
1662     uint32_t z;
1663     a = float32_squash_input_denormal(a STATUS_VAR);
1664
1665     aExp = extractFloat32Exp( a );
1666     if ( 0x96 <= aExp ) {
1667         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1668             return propagateFloat32NaN( a, a STATUS_VAR );
1669         }
1670         return a;
1671     }
1672     if ( aExp <= 0x7E ) {
1673         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1674         STATUS(float_exception_flags) |= float_flag_inexact;
1675         aSign = extractFloat32Sign( a );
1676         switch ( STATUS(float_rounding_mode) ) {
1677          case float_round_nearest_even:
1678             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1679                 return packFloat32( aSign, 0x7F, 0 );
1680             }
1681             break;
1682          case float_round_down:
1683             return make_float32(aSign ? 0xBF800000 : 0);
1684          case float_round_up:
1685             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1686         }
1687         return packFloat32( aSign, 0, 0 );
1688     }
1689     lastBitMask = 1;
1690     lastBitMask <<= 0x96 - aExp;
1691     roundBitsMask = lastBitMask - 1;
1692     z = float32_val(a);
1693     roundingMode = STATUS(float_rounding_mode);
1694     if ( roundingMode == float_round_nearest_even ) {
1695         z += lastBitMask>>1;
1696         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1697     }
1698     else if ( roundingMode != float_round_to_zero ) {
1699         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1700             z += roundBitsMask;
1701         }
1702     }
1703     z &= ~ roundBitsMask;
1704     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1705     return make_float32(z);
1706
1707 }
1708
1709 /*----------------------------------------------------------------------------
1710 | Returns the result of adding the absolute values of the single-precision
1711 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1712 | before being returned.  `zSign' is ignored if the result is a NaN.
1713 | The addition is performed according to the IEC/IEEE Standard for Binary
1714 | Floating-Point Arithmetic.
1715 *----------------------------------------------------------------------------*/
1716
1717 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1718 {
1719     int16 aExp, bExp, zExp;
1720     uint32_t aSig, bSig, zSig;
1721     int16 expDiff;
1722
1723     aSig = extractFloat32Frac( a );
1724     aExp = extractFloat32Exp( a );
1725     bSig = extractFloat32Frac( b );
1726     bExp = extractFloat32Exp( b );
1727     expDiff = aExp - bExp;
1728     aSig <<= 6;
1729     bSig <<= 6;
1730     if ( 0 < expDiff ) {
1731         if ( aExp == 0xFF ) {
1732             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1733             return a;
1734         }
1735         if ( bExp == 0 ) {
1736             --expDiff;
1737         }
1738         else {
1739             bSig |= 0x20000000;
1740         }
1741         shift32RightJamming( bSig, expDiff, &bSig );
1742         zExp = aExp;
1743     }
1744     else if ( expDiff < 0 ) {
1745         if ( bExp == 0xFF ) {
1746             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1747             return packFloat32( zSign, 0xFF, 0 );
1748         }
1749         if ( aExp == 0 ) {
1750             ++expDiff;
1751         }
1752         else {
1753             aSig |= 0x20000000;
1754         }
1755         shift32RightJamming( aSig, - expDiff, &aSig );
1756         zExp = bExp;
1757     }
1758     else {
1759         if ( aExp == 0xFF ) {
1760             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1761             return a;
1762         }
1763         if ( aExp == 0 ) {
1764             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1765             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1766         }
1767         zSig = 0x40000000 + aSig + bSig;
1768         zExp = aExp;
1769         goto roundAndPack;
1770     }
1771     aSig |= 0x20000000;
1772     zSig = ( aSig + bSig )<<1;
1773     --zExp;
1774     if ( (int32_t) zSig < 0 ) {
1775         zSig = aSig + bSig;
1776         ++zExp;
1777     }
1778  roundAndPack:
1779     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1780
1781 }
1782
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of subtracting the absolute values of the single-
1785 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1786 | difference is negated before being returned.  `zSign' is ignored if the
1787 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1788 | Standard for Binary Floating-Point Arithmetic.
1789 *----------------------------------------------------------------------------*/
1790
1791 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1792 {
1793     int16 aExp, bExp, zExp;
1794     uint32_t aSig, bSig, zSig;
1795     int16 expDiff;
1796
1797     aSig = extractFloat32Frac( a );
1798     aExp = extractFloat32Exp( a );
1799     bSig = extractFloat32Frac( b );
1800     bExp = extractFloat32Exp( b );
1801     expDiff = aExp - bExp;
1802     aSig <<= 7;
1803     bSig <<= 7;
1804     if ( 0 < expDiff ) goto aExpBigger;
1805     if ( expDiff < 0 ) goto bExpBigger;
1806     if ( aExp == 0xFF ) {
1807         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1808         float_raise( float_flag_invalid STATUS_VAR);
1809         return float32_default_nan;
1810     }
1811     if ( aExp == 0 ) {
1812         aExp = 1;
1813         bExp = 1;
1814     }
1815     if ( bSig < aSig ) goto aBigger;
1816     if ( aSig < bSig ) goto bBigger;
1817     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1818  bExpBigger:
1819     if ( bExp == 0xFF ) {
1820         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1821         return packFloat32( zSign ^ 1, 0xFF, 0 );
1822     }
1823     if ( aExp == 0 ) {
1824         ++expDiff;
1825     }
1826     else {
1827         aSig |= 0x40000000;
1828     }
1829     shift32RightJamming( aSig, - expDiff, &aSig );
1830     bSig |= 0x40000000;
1831  bBigger:
1832     zSig = bSig - aSig;
1833     zExp = bExp;
1834     zSign ^= 1;
1835     goto normalizeRoundAndPack;
1836  aExpBigger:
1837     if ( aExp == 0xFF ) {
1838         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1839         return a;
1840     }
1841     if ( bExp == 0 ) {
1842         --expDiff;
1843     }
1844     else {
1845         bSig |= 0x40000000;
1846     }
1847     shift32RightJamming( bSig, expDiff, &bSig );
1848     aSig |= 0x40000000;
1849  aBigger:
1850     zSig = aSig - bSig;
1851     zExp = aExp;
1852  normalizeRoundAndPack:
1853     --zExp;
1854     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1855
1856 }
1857
1858 /*----------------------------------------------------------------------------
1859 | Returns the result of adding the single-precision floating-point values `a'
1860 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1861 | Binary Floating-Point Arithmetic.
1862 *----------------------------------------------------------------------------*/
1863
1864 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1865 {
1866     flag aSign, bSign;
1867     a = float32_squash_input_denormal(a STATUS_VAR);
1868     b = float32_squash_input_denormal(b STATUS_VAR);
1869
1870     aSign = extractFloat32Sign( a );
1871     bSign = extractFloat32Sign( b );
1872     if ( aSign == bSign ) {
1873         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1874     }
1875     else {
1876         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1877     }
1878
1879 }
1880
1881 /*----------------------------------------------------------------------------
1882 | Returns the result of subtracting the single-precision floating-point values
1883 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1884 | for Binary Floating-Point Arithmetic.
1885 *----------------------------------------------------------------------------*/
1886
1887 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1888 {
1889     flag aSign, bSign;
1890     a = float32_squash_input_denormal(a STATUS_VAR);
1891     b = float32_squash_input_denormal(b STATUS_VAR);
1892
1893     aSign = extractFloat32Sign( a );
1894     bSign = extractFloat32Sign( b );
1895     if ( aSign == bSign ) {
1896         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1897     }
1898     else {
1899         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1900     }
1901
1902 }
1903
1904 /*----------------------------------------------------------------------------
1905 | Returns the result of multiplying the single-precision floating-point values
1906 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1907 | for Binary Floating-Point Arithmetic.
1908 *----------------------------------------------------------------------------*/
1909
1910 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1911 {
1912     flag aSign, bSign, zSign;
1913     int16 aExp, bExp, zExp;
1914     uint32_t aSig, bSig;
1915     uint64_t zSig64;
1916     uint32_t zSig;
1917
1918     a = float32_squash_input_denormal(a STATUS_VAR);
1919     b = float32_squash_input_denormal(b STATUS_VAR);
1920
1921     aSig = extractFloat32Frac( a );
1922     aExp = extractFloat32Exp( a );
1923     aSign = extractFloat32Sign( a );
1924     bSig = extractFloat32Frac( b );
1925     bExp = extractFloat32Exp( b );
1926     bSign = extractFloat32Sign( b );
1927     zSign = aSign ^ bSign;
1928     if ( aExp == 0xFF ) {
1929         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1930             return propagateFloat32NaN( a, b STATUS_VAR );
1931         }
1932         if ( ( bExp | bSig ) == 0 ) {
1933             float_raise( float_flag_invalid STATUS_VAR);
1934             return float32_default_nan;
1935         }
1936         return packFloat32( zSign, 0xFF, 0 );
1937     }
1938     if ( bExp == 0xFF ) {
1939         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1940         if ( ( aExp | aSig ) == 0 ) {
1941             float_raise( float_flag_invalid STATUS_VAR);
1942             return float32_default_nan;
1943         }
1944         return packFloat32( zSign, 0xFF, 0 );
1945     }
1946     if ( aExp == 0 ) {
1947         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1948         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1949     }
1950     if ( bExp == 0 ) {
1951         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1952         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1953     }
1954     zExp = aExp + bExp - 0x7F;
1955     aSig = ( aSig | 0x00800000 )<<7;
1956     bSig = ( bSig | 0x00800000 )<<8;
1957     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1958     zSig = zSig64;
1959     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1960         zSig <<= 1;
1961         --zExp;
1962     }
1963     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1964
1965 }
1966
1967 /*----------------------------------------------------------------------------
1968 | Returns the result of dividing the single-precision floating-point value `a'
1969 | by the corresponding value `b'.  The operation is performed according to the
1970 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1971 *----------------------------------------------------------------------------*/
1972
1973 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1974 {
1975     flag aSign, bSign, zSign;
1976     int16 aExp, bExp, zExp;
1977     uint32_t aSig, bSig, zSig;
1978     a = float32_squash_input_denormal(a STATUS_VAR);
1979     b = float32_squash_input_denormal(b STATUS_VAR);
1980
1981     aSig = extractFloat32Frac( a );
1982     aExp = extractFloat32Exp( a );
1983     aSign = extractFloat32Sign( a );
1984     bSig = extractFloat32Frac( b );
1985     bExp = extractFloat32Exp( b );
1986     bSign = extractFloat32Sign( b );
1987     zSign = aSign ^ bSign;
1988     if ( aExp == 0xFF ) {
1989         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1990         if ( bExp == 0xFF ) {
1991             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1992             float_raise( float_flag_invalid STATUS_VAR);
1993             return float32_default_nan;
1994         }
1995         return packFloat32( zSign, 0xFF, 0 );
1996     }
1997     if ( bExp == 0xFF ) {
1998         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1999         return packFloat32( zSign, 0, 0 );
2000     }
2001     if ( bExp == 0 ) {
2002         if ( bSig == 0 ) {
2003             if ( ( aExp | aSig ) == 0 ) {
2004                 float_raise( float_flag_invalid STATUS_VAR);
2005                 return float32_default_nan;
2006             }
2007             float_raise( float_flag_divbyzero STATUS_VAR);
2008             return packFloat32( zSign, 0xFF, 0 );
2009         }
2010         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2011     }
2012     if ( aExp == 0 ) {
2013         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2014         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2015     }
2016     zExp = aExp - bExp + 0x7D;
2017     aSig = ( aSig | 0x00800000 )<<7;
2018     bSig = ( bSig | 0x00800000 )<<8;
2019     if ( bSig <= ( aSig + aSig ) ) {
2020         aSig >>= 1;
2021         ++zExp;
2022     }
2023     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2024     if ( ( zSig & 0x3F ) == 0 ) {
2025         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2026     }
2027     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2028
2029 }
2030
2031 /*----------------------------------------------------------------------------
2032 | Returns the remainder of the single-precision floating-point value `a'
2033 | with respect to the corresponding value `b'.  The operation is performed
2034 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2035 *----------------------------------------------------------------------------*/
2036
2037 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2038 {
2039     flag aSign, zSign;
2040     int16 aExp, bExp, expDiff;
2041     uint32_t aSig, bSig;
2042     uint32_t q;
2043     uint64_t aSig64, bSig64, q64;
2044     uint32_t alternateASig;
2045     int32_t sigMean;
2046     a = float32_squash_input_denormal(a STATUS_VAR);
2047     b = float32_squash_input_denormal(b STATUS_VAR);
2048
2049     aSig = extractFloat32Frac( a );
2050     aExp = extractFloat32Exp( a );
2051     aSign = extractFloat32Sign( a );
2052     bSig = extractFloat32Frac( b );
2053     bExp = extractFloat32Exp( b );
2054     if ( aExp == 0xFF ) {
2055         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2056             return propagateFloat32NaN( a, b STATUS_VAR );
2057         }
2058         float_raise( float_flag_invalid STATUS_VAR);
2059         return float32_default_nan;
2060     }
2061     if ( bExp == 0xFF ) {
2062         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2063         return a;
2064     }
2065     if ( bExp == 0 ) {
2066         if ( bSig == 0 ) {
2067             float_raise( float_flag_invalid STATUS_VAR);
2068             return float32_default_nan;
2069         }
2070         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2071     }
2072     if ( aExp == 0 ) {
2073         if ( aSig == 0 ) return a;
2074         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2075     }
2076     expDiff = aExp - bExp;
2077     aSig |= 0x00800000;
2078     bSig |= 0x00800000;
2079     if ( expDiff < 32 ) {
2080         aSig <<= 8;
2081         bSig <<= 8;
2082         if ( expDiff < 0 ) {
2083             if ( expDiff < -1 ) return a;
2084             aSig >>= 1;
2085         }
2086         q = ( bSig <= aSig );
2087         if ( q ) aSig -= bSig;
2088         if ( 0 < expDiff ) {
2089             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2090             q >>= 32 - expDiff;
2091             bSig >>= 2;
2092             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2093         }
2094         else {
2095             aSig >>= 2;
2096             bSig >>= 2;
2097         }
2098     }
2099     else {
2100         if ( bSig <= aSig ) aSig -= bSig;
2101         aSig64 = ( (uint64_t) aSig )<<40;
2102         bSig64 = ( (uint64_t) bSig )<<40;
2103         expDiff -= 64;
2104         while ( 0 < expDiff ) {
2105             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2106             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2107             aSig64 = - ( ( bSig * q64 )<<38 );
2108             expDiff -= 62;
2109         }
2110         expDiff += 64;
2111         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2112         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2113         q = q64>>( 64 - expDiff );
2114         bSig <<= 6;
2115         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2116     }
2117     do {
2118         alternateASig = aSig;
2119         ++q;
2120         aSig -= bSig;
2121     } while ( 0 <= (int32_t) aSig );
2122     sigMean = aSig + alternateASig;
2123     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2124         aSig = alternateASig;
2125     }
2126     zSign = ( (int32_t) aSig < 0 );
2127     if ( zSign ) aSig = - aSig;
2128     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2129
2130 }
2131
2132 /*----------------------------------------------------------------------------
2133 | Returns the square root of the single-precision floating-point value `a'.
2134 | The operation is performed according to the IEC/IEEE Standard for Binary
2135 | Floating-Point Arithmetic.
2136 *----------------------------------------------------------------------------*/
2137
2138 float32 float32_sqrt( float32 a STATUS_PARAM )
2139 {
2140     flag aSign;
2141     int16 aExp, zExp;
2142     uint32_t aSig, zSig;
2143     uint64_t rem, term;
2144     a = float32_squash_input_denormal(a STATUS_VAR);
2145
2146     aSig = extractFloat32Frac( a );
2147     aExp = extractFloat32Exp( a );
2148     aSign = extractFloat32Sign( a );
2149     if ( aExp == 0xFF ) {
2150         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2151         if ( ! aSign ) return a;
2152         float_raise( float_flag_invalid STATUS_VAR);
2153         return float32_default_nan;
2154     }
2155     if ( aSign ) {
2156         if ( ( aExp | aSig ) == 0 ) return a;
2157         float_raise( float_flag_invalid STATUS_VAR);
2158         return float32_default_nan;
2159     }
2160     if ( aExp == 0 ) {
2161         if ( aSig == 0 ) return float32_zero;
2162         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2163     }
2164     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2165     aSig = ( aSig | 0x00800000 )<<8;
2166     zSig = estimateSqrt32( aExp, aSig ) + 2;
2167     if ( ( zSig & 0x7F ) <= 5 ) {
2168         if ( zSig < 2 ) {
2169             zSig = 0x7FFFFFFF;
2170             goto roundAndPack;
2171         }
2172         aSig >>= aExp & 1;
2173         term = ( (uint64_t) zSig ) * zSig;
2174         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2175         while ( (int64_t) rem < 0 ) {
2176             --zSig;
2177             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2178         }
2179         zSig |= ( rem != 0 );
2180     }
2181     shift32RightJamming( zSig, 1, &zSig );
2182  roundAndPack:
2183     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2184
2185 }
2186
2187 /*----------------------------------------------------------------------------
2188 | Returns the binary exponential of the single-precision floating-point value
2189 | `a'. The operation is performed according to the IEC/IEEE Standard for
2190 | Binary Floating-Point Arithmetic.
2191 |
2192 | Uses the following identities:
2193 |
2194 | 1. -------------------------------------------------------------------------
2195 |      x    x*ln(2)
2196 |     2  = e
2197 |
2198 | 2. -------------------------------------------------------------------------
2199 |                      2     3     4     5           n
2200 |      x        x     x     x     x     x           x
2201 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2202 |               1!    2!    3!    4!    5!          n!
2203 *----------------------------------------------------------------------------*/
2204
2205 static const float64 float32_exp2_coefficients[15] =
2206 {
2207     const_float64( 0x3ff0000000000000ll ), /*  1 */
2208     const_float64( 0x3fe0000000000000ll ), /*  2 */
2209     const_float64( 0x3fc5555555555555ll ), /*  3 */
2210     const_float64( 0x3fa5555555555555ll ), /*  4 */
2211     const_float64( 0x3f81111111111111ll ), /*  5 */
2212     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2213     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2214     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2215     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2216     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2217     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2218     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2219     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2220     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2221     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2222 };
2223
2224 float32 float32_exp2( float32 a STATUS_PARAM )
2225 {
2226     flag aSign;
2227     int16 aExp;
2228     uint32_t aSig;
2229     float64 r, x, xn;
2230     int i;
2231     a = float32_squash_input_denormal(a STATUS_VAR);
2232
2233     aSig = extractFloat32Frac( a );
2234     aExp = extractFloat32Exp( a );
2235     aSign = extractFloat32Sign( a );
2236
2237     if ( aExp == 0xFF) {
2238         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2239         return (aSign) ? float32_zero : a;
2240     }
2241     if (aExp == 0) {
2242         if (aSig == 0) return float32_one;
2243     }
2244
2245     float_raise( float_flag_inexact STATUS_VAR);
2246
2247     /* ******************************* */
2248     /* using float64 for approximation */
2249     /* ******************************* */
2250     x = float32_to_float64(a STATUS_VAR);
2251     x = float64_mul(x, float64_ln2 STATUS_VAR);
2252
2253     xn = x;
2254     r = float64_one;
2255     for (i = 0 ; i < 15 ; i++) {
2256         float64 f;
2257
2258         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2259         r = float64_add(r, f STATUS_VAR);
2260
2261         xn = float64_mul(xn, x STATUS_VAR);
2262     }
2263
2264     return float64_to_float32(r, status);
2265 }
2266
2267 /*----------------------------------------------------------------------------
2268 | Returns the binary log of the single-precision floating-point value `a'.
2269 | The operation is performed according to the IEC/IEEE Standard for Binary
2270 | Floating-Point Arithmetic.
2271 *----------------------------------------------------------------------------*/
2272 float32 float32_log2( float32 a STATUS_PARAM )
2273 {
2274     flag aSign, zSign;
2275     int16 aExp;
2276     uint32_t aSig, zSig, i;
2277
2278     a = float32_squash_input_denormal(a STATUS_VAR);
2279     aSig = extractFloat32Frac( a );
2280     aExp = extractFloat32Exp( a );
2281     aSign = extractFloat32Sign( a );
2282
2283     if ( aExp == 0 ) {
2284         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2285         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2286     }
2287     if ( aSign ) {
2288         float_raise( float_flag_invalid STATUS_VAR);
2289         return float32_default_nan;
2290     }
2291     if ( aExp == 0xFF ) {
2292         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2293         return a;
2294     }
2295
2296     aExp -= 0x7F;
2297     aSig |= 0x00800000;
2298     zSign = aExp < 0;
2299     zSig = aExp << 23;
2300
2301     for (i = 1 << 22; i > 0; i >>= 1) {
2302         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2303         if ( aSig & 0x01000000 ) {
2304             aSig >>= 1;
2305             zSig |= i;
2306         }
2307     }
2308
2309     if ( zSign )
2310         zSig = -zSig;
2311
2312     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2313 }
2314
2315 /*----------------------------------------------------------------------------
2316 | Returns 1 if the single-precision floating-point value `a' is equal to
2317 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2319 *----------------------------------------------------------------------------*/
2320
2321 int float32_eq( float32 a, float32 b STATUS_PARAM )
2322 {
2323     a = float32_squash_input_denormal(a STATUS_VAR);
2324     b = float32_squash_input_denormal(b STATUS_VAR);
2325
2326     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2327          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2328        ) {
2329         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2330             float_raise( float_flag_invalid STATUS_VAR);
2331         }
2332         return 0;
2333     }
2334     return ( float32_val(a) == float32_val(b) ) ||
2335             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2336
2337 }
2338
2339 /*----------------------------------------------------------------------------
2340 | Returns 1 if the single-precision floating-point value `a' is less than
2341 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2342 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2343 | Arithmetic.
2344 *----------------------------------------------------------------------------*/
2345
2346 int float32_le( float32 a, float32 b STATUS_PARAM )
2347 {
2348     flag aSign, bSign;
2349     uint32_t av, bv;
2350     a = float32_squash_input_denormal(a STATUS_VAR);
2351     b = float32_squash_input_denormal(b STATUS_VAR);
2352
2353     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2354          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2355        ) {
2356         float_raise( float_flag_invalid STATUS_VAR);
2357         return 0;
2358     }
2359     aSign = extractFloat32Sign( a );
2360     bSign = extractFloat32Sign( b );
2361     av = float32_val(a);
2362     bv = float32_val(b);
2363     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2364     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2365
2366 }
2367
2368 /*----------------------------------------------------------------------------
2369 | Returns 1 if the single-precision floating-point value `a' is less than
2370 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2371 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2372 *----------------------------------------------------------------------------*/
2373
2374 int float32_lt( float32 a, float32 b STATUS_PARAM )
2375 {
2376     flag aSign, bSign;
2377     uint32_t av, bv;
2378     a = float32_squash_input_denormal(a STATUS_VAR);
2379     b = float32_squash_input_denormal(b STATUS_VAR);
2380
2381     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2382          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2383        ) {
2384         float_raise( float_flag_invalid STATUS_VAR);
2385         return 0;
2386     }
2387     aSign = extractFloat32Sign( a );
2388     bSign = extractFloat32Sign( b );
2389     av = float32_val(a);
2390     bv = float32_val(b);
2391     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2392     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2393
2394 }
2395
2396 /*----------------------------------------------------------------------------
2397 | Returns 1 if the single-precision floating-point value `a' is equal to
2398 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2399 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2400 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2401 *----------------------------------------------------------------------------*/
2402
2403 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2404 {
2405     uint32_t av, bv;
2406     a = float32_squash_input_denormal(a STATUS_VAR);
2407     b = float32_squash_input_denormal(b STATUS_VAR);
2408
2409     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2410          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2411        ) {
2412         float_raise( float_flag_invalid STATUS_VAR);
2413         return 0;
2414     }
2415     av = float32_val(a);
2416     bv = float32_val(b);
2417     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2418
2419 }
2420
2421 /*----------------------------------------------------------------------------
2422 | Returns 1 if the single-precision floating-point value `a' is less than or
2423 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2424 | cause an exception.  Otherwise, the comparison is performed according to the
2425 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2426 *----------------------------------------------------------------------------*/
2427
2428 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2429 {
2430     flag aSign, bSign;
2431     uint32_t av, bv;
2432     a = float32_squash_input_denormal(a STATUS_VAR);
2433     b = float32_squash_input_denormal(b STATUS_VAR);
2434
2435     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2436          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2437        ) {
2438         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2439             float_raise( float_flag_invalid STATUS_VAR);
2440         }
2441         return 0;
2442     }
2443     aSign = extractFloat32Sign( a );
2444     bSign = extractFloat32Sign( b );
2445     av = float32_val(a);
2446     bv = float32_val(b);
2447     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2448     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2449
2450 }
2451
2452 /*----------------------------------------------------------------------------
2453 | Returns 1 if the single-precision floating-point value `a' is less than
2454 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2455 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2456 | Standard for Binary Floating-Point Arithmetic.
2457 *----------------------------------------------------------------------------*/
2458
2459 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2460 {
2461     flag aSign, bSign;
2462     uint32_t av, bv;
2463     a = float32_squash_input_denormal(a STATUS_VAR);
2464     b = float32_squash_input_denormal(b STATUS_VAR);
2465
2466     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2467          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2468        ) {
2469         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2470             float_raise( float_flag_invalid STATUS_VAR);
2471         }
2472         return 0;
2473     }
2474     aSign = extractFloat32Sign( a );
2475     bSign = extractFloat32Sign( b );
2476     av = float32_val(a);
2477     bv = float32_val(b);
2478     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2479     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2480
2481 }
2482
2483 /*----------------------------------------------------------------------------
2484 | Returns the result of converting the double-precision floating-point value
2485 | `a' to the 32-bit two's complement integer format.  The conversion is
2486 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2487 | Arithmetic---which means in particular that the conversion is rounded
2488 | according to the current rounding mode.  If `a' is a NaN, the largest
2489 | positive integer is returned.  Otherwise, if the conversion overflows, the
2490 | largest integer with the same sign as `a' is returned.
2491 *----------------------------------------------------------------------------*/
2492
2493 int32 float64_to_int32( float64 a STATUS_PARAM )
2494 {
2495     flag aSign;
2496     int16 aExp, shiftCount;
2497     uint64_t aSig;
2498     a = float64_squash_input_denormal(a STATUS_VAR);
2499
2500     aSig = extractFloat64Frac( a );
2501     aExp = extractFloat64Exp( a );
2502     aSign = extractFloat64Sign( a );
2503     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2504     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2505     shiftCount = 0x42C - aExp;
2506     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2507     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2508
2509 }
2510
2511 /*----------------------------------------------------------------------------
2512 | Returns the result of converting the double-precision floating-point value
2513 | `a' to the 32-bit two's complement integer format.  The conversion is
2514 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2515 | Arithmetic, except that the conversion is always rounded toward zero.
2516 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2517 | the conversion overflows, the largest integer with the same sign as `a' is
2518 | returned.
2519 *----------------------------------------------------------------------------*/
2520
2521 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2522 {
2523     flag aSign;
2524     int16 aExp, shiftCount;
2525     uint64_t aSig, savedASig;
2526     int32 z;
2527     a = float64_squash_input_denormal(a STATUS_VAR);
2528
2529     aSig = extractFloat64Frac( a );
2530     aExp = extractFloat64Exp( a );
2531     aSign = extractFloat64Sign( a );
2532     if ( 0x41E < aExp ) {
2533         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2534         goto invalid;
2535     }
2536     else if ( aExp < 0x3FF ) {
2537         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2538         return 0;
2539     }
2540     aSig |= LIT64( 0x0010000000000000 );
2541     shiftCount = 0x433 - aExp;
2542     savedASig = aSig;
2543     aSig >>= shiftCount;
2544     z = aSig;
2545     if ( aSign ) z = - z;
2546     if ( ( z < 0 ) ^ aSign ) {
2547  invalid:
2548         float_raise( float_flag_invalid STATUS_VAR);
2549         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2550     }
2551     if ( ( aSig<<shiftCount ) != savedASig ) {
2552         STATUS(float_exception_flags) |= float_flag_inexact;
2553     }
2554     return z;
2555
2556 }
2557
2558 /*----------------------------------------------------------------------------
2559 | Returns the result of converting the double-precision floating-point value
2560 | `a' to the 16-bit two's complement integer format.  The conversion is
2561 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2562 | Arithmetic, except that the conversion is always rounded toward zero.
2563 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2564 | the conversion overflows, the largest integer with the same sign as `a' is
2565 | returned.
2566 *----------------------------------------------------------------------------*/
2567
2568 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2569 {
2570     flag aSign;
2571     int16 aExp, shiftCount;
2572     uint64_t aSig, savedASig;
2573     int32 z;
2574
2575     aSig = extractFloat64Frac( a );
2576     aExp = extractFloat64Exp( a );
2577     aSign = extractFloat64Sign( a );
2578     if ( 0x40E < aExp ) {
2579         if ( ( aExp == 0x7FF ) && aSig ) {
2580             aSign = 0;
2581         }
2582         goto invalid;
2583     }
2584     else if ( aExp < 0x3FF ) {
2585         if ( aExp || aSig ) {
2586             STATUS(float_exception_flags) |= float_flag_inexact;
2587         }
2588         return 0;
2589     }
2590     aSig |= LIT64( 0x0010000000000000 );
2591     shiftCount = 0x433 - aExp;
2592     savedASig = aSig;
2593     aSig >>= shiftCount;
2594     z = aSig;
2595     if ( aSign ) {
2596         z = - z;
2597     }
2598     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2599  invalid:
2600         float_raise( float_flag_invalid STATUS_VAR);
2601         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2602     }
2603     if ( ( aSig<<shiftCount ) != savedASig ) {
2604         STATUS(float_exception_flags) |= float_flag_inexact;
2605     }
2606     return z;
2607 }
2608
2609 /*----------------------------------------------------------------------------
2610 | Returns the result of converting the double-precision floating-point value
2611 | `a' to the 64-bit two's complement integer format.  The conversion is
2612 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2613 | Arithmetic---which means in particular that the conversion is rounded
2614 | according to the current rounding mode.  If `a' is a NaN, the largest
2615 | positive integer is returned.  Otherwise, if the conversion overflows, the
2616 | largest integer with the same sign as `a' is returned.
2617 *----------------------------------------------------------------------------*/
2618
2619 int64 float64_to_int64( float64 a STATUS_PARAM )
2620 {
2621     flag aSign;
2622     int16 aExp, shiftCount;
2623     uint64_t aSig, aSigExtra;
2624     a = float64_squash_input_denormal(a STATUS_VAR);
2625
2626     aSig = extractFloat64Frac( a );
2627     aExp = extractFloat64Exp( a );
2628     aSign = extractFloat64Sign( a );
2629     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2630     shiftCount = 0x433 - aExp;
2631     if ( shiftCount <= 0 ) {
2632         if ( 0x43E < aExp ) {
2633             float_raise( float_flag_invalid STATUS_VAR);
2634             if (    ! aSign
2635                  || (    ( aExp == 0x7FF )
2636                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2637                ) {
2638                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2639             }
2640             return (int64_t) LIT64( 0x8000000000000000 );
2641         }
2642         aSigExtra = 0;
2643         aSig <<= - shiftCount;
2644     }
2645     else {
2646         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2647     }
2648     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2649
2650 }
2651
2652 /*----------------------------------------------------------------------------
2653 | Returns the result of converting the double-precision floating-point value
2654 | `a' to the 64-bit two's complement integer format.  The conversion is
2655 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2656 | Arithmetic, except that the conversion is always rounded toward zero.
2657 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2658 | the conversion overflows, the largest integer with the same sign as `a' is
2659 | returned.
2660 *----------------------------------------------------------------------------*/
2661
2662 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2663 {
2664     flag aSign;
2665     int16 aExp, shiftCount;
2666     uint64_t aSig;
2667     int64 z;
2668     a = float64_squash_input_denormal(a STATUS_VAR);
2669
2670     aSig = extractFloat64Frac( a );
2671     aExp = extractFloat64Exp( a );
2672     aSign = extractFloat64Sign( a );
2673     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2674     shiftCount = aExp - 0x433;
2675     if ( 0 <= shiftCount ) {
2676         if ( 0x43E <= aExp ) {
2677             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2678                 float_raise( float_flag_invalid STATUS_VAR);
2679                 if (    ! aSign
2680                      || (    ( aExp == 0x7FF )
2681                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2682                    ) {
2683                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2684                 }
2685             }
2686             return (int64_t) LIT64( 0x8000000000000000 );
2687         }
2688         z = aSig<<shiftCount;
2689     }
2690     else {
2691         if ( aExp < 0x3FE ) {
2692             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2693             return 0;
2694         }
2695         z = aSig>>( - shiftCount );
2696         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2697             STATUS(float_exception_flags) |= float_flag_inexact;
2698         }
2699     }
2700     if ( aSign ) z = - z;
2701     return z;
2702
2703 }
2704
2705 /*----------------------------------------------------------------------------
2706 | Returns the result of converting the double-precision floating-point value
2707 | `a' to the single-precision floating-point format.  The conversion is
2708 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2709 | Arithmetic.
2710 *----------------------------------------------------------------------------*/
2711
2712 float32 float64_to_float32( float64 a STATUS_PARAM )
2713 {
2714     flag aSign;
2715     int16 aExp;
2716     uint64_t aSig;
2717     uint32_t zSig;
2718     a = float64_squash_input_denormal(a STATUS_VAR);
2719
2720     aSig = extractFloat64Frac( a );
2721     aExp = extractFloat64Exp( a );
2722     aSign = extractFloat64Sign( a );
2723     if ( aExp == 0x7FF ) {
2724         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2725         return packFloat32( aSign, 0xFF, 0 );
2726     }
2727     shift64RightJamming( aSig, 22, &aSig );
2728     zSig = aSig;
2729     if ( aExp || zSig ) {
2730         zSig |= 0x40000000;
2731         aExp -= 0x381;
2732     }
2733     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2734
2735 }
2736
2737
2738 /*----------------------------------------------------------------------------
2739 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2740 | half-precision floating-point value, returning the result.  After being
2741 | shifted into the proper positions, the three fields are simply added
2742 | together to form the result.  This means that any integer portion of `zSig'
2743 | will be added into the exponent.  Since a properly normalized significand
2744 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2745 | than the desired result exponent whenever `zSig' is a complete, normalized
2746 | significand.
2747 *----------------------------------------------------------------------------*/
2748 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2749 {
2750     return make_float16(
2751         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2752 }
2753
2754 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2755    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2756
2757 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2758 {
2759     flag aSign;
2760     int16 aExp;
2761     uint32_t aSig;
2762
2763     aSign = extractFloat16Sign(a);
2764     aExp = extractFloat16Exp(a);
2765     aSig = extractFloat16Frac(a);
2766
2767     if (aExp == 0x1f && ieee) {
2768         if (aSig) {
2769             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2770         }
2771         return packFloat32(aSign, 0xff, aSig << 13);
2772     }
2773     if (aExp == 0) {
2774         int8 shiftCount;
2775
2776         if (aSig == 0) {
2777             return packFloat32(aSign, 0, 0);
2778         }
2779
2780         shiftCount = countLeadingZeros32( aSig ) - 21;
2781         aSig = aSig << shiftCount;
2782         aExp = -shiftCount;
2783     }
2784     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2785 }
2786
2787 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2788 {
2789     flag aSign;
2790     int16 aExp;
2791     uint32_t aSig;
2792     uint32_t mask;
2793     uint32_t increment;
2794     int8 roundingMode;
2795     a = float32_squash_input_denormal(a STATUS_VAR);
2796
2797     aSig = extractFloat32Frac( a );
2798     aExp = extractFloat32Exp( a );
2799     aSign = extractFloat32Sign( a );
2800     if ( aExp == 0xFF ) {
2801         if (aSig) {
2802             /* Input is a NaN */
2803             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2804             if (!ieee) {
2805                 return packFloat16(aSign, 0, 0);
2806             }
2807             return r;
2808         }
2809         /* Infinity */
2810         if (!ieee) {
2811             float_raise(float_flag_invalid STATUS_VAR);
2812             return packFloat16(aSign, 0x1f, 0x3ff);
2813         }
2814         return packFloat16(aSign, 0x1f, 0);
2815     }
2816     if (aExp == 0 && aSig == 0) {
2817         return packFloat16(aSign, 0, 0);
2818     }
2819     /* Decimal point between bits 22 and 23.  */
2820     aSig |= 0x00800000;
2821     aExp -= 0x7f;
2822     if (aExp < -14) {
2823         mask = 0x00ffffff;
2824         if (aExp >= -24) {
2825             mask >>= 25 + aExp;
2826         }
2827     } else {
2828         mask = 0x00001fff;
2829     }
2830     if (aSig & mask) {
2831         float_raise( float_flag_underflow STATUS_VAR );
2832         roundingMode = STATUS(float_rounding_mode);
2833         switch (roundingMode) {
2834         case float_round_nearest_even:
2835             increment = (mask + 1) >> 1;
2836             if ((aSig & mask) == increment) {
2837                 increment = aSig & (increment << 1);
2838             }
2839             break;
2840         case float_round_up:
2841             increment = aSign ? 0 : mask;
2842             break;
2843         case float_round_down:
2844             increment = aSign ? mask : 0;
2845             break;
2846         default: /* round_to_zero */
2847             increment = 0;
2848             break;
2849         }
2850         aSig += increment;
2851         if (aSig >= 0x01000000) {
2852             aSig >>= 1;
2853             aExp++;
2854         }
2855     } else if (aExp < -14
2856           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2857         float_raise( float_flag_underflow STATUS_VAR);
2858     }
2859
2860     if (ieee) {
2861         if (aExp > 15) {
2862             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2863             return packFloat16(aSign, 0x1f, 0);
2864         }
2865     } else {
2866         if (aExp > 16) {
2867             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2868             return packFloat16(aSign, 0x1f, 0x3ff);
2869         }
2870     }
2871     if (aExp < -24) {
2872         return packFloat16(aSign, 0, 0);
2873     }
2874     if (aExp < -14) {
2875         aSig >>= -14 - aExp;
2876         aExp = -14;
2877     }
2878     return packFloat16(aSign, aExp + 14, aSig >> 13);
2879 }
2880
2881 #ifdef FLOATX80
2882
2883 /*----------------------------------------------------------------------------
2884 | Returns the result of converting the double-precision floating-point value
2885 | `a' to the extended double-precision floating-point format.  The conversion
2886 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2887 | Arithmetic.
2888 *----------------------------------------------------------------------------*/
2889
2890 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2891 {
2892     flag aSign;
2893     int16 aExp;
2894     uint64_t aSig;
2895
2896     a = float64_squash_input_denormal(a STATUS_VAR);
2897     aSig = extractFloat64Frac( a );
2898     aExp = extractFloat64Exp( a );
2899     aSign = extractFloat64Sign( a );
2900     if ( aExp == 0x7FF ) {
2901         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2902         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2903     }
2904     if ( aExp == 0 ) {
2905         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2906         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2907     }
2908     return
2909         packFloatx80(
2910             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2911
2912 }
2913
2914 #endif
2915
2916 #ifdef FLOAT128
2917
2918 /*----------------------------------------------------------------------------
2919 | Returns the result of converting the double-precision floating-point value
2920 | `a' to the quadruple-precision floating-point format.  The conversion is
2921 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2922 | Arithmetic.
2923 *----------------------------------------------------------------------------*/
2924
2925 float128 float64_to_float128( float64 a STATUS_PARAM )
2926 {
2927     flag aSign;
2928     int16 aExp;
2929     uint64_t aSig, zSig0, zSig1;
2930
2931     a = float64_squash_input_denormal(a STATUS_VAR);
2932     aSig = extractFloat64Frac( a );
2933     aExp = extractFloat64Exp( a );
2934     aSign = extractFloat64Sign( a );
2935     if ( aExp == 0x7FF ) {
2936         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2937         return packFloat128( aSign, 0x7FFF, 0, 0 );
2938     }
2939     if ( aExp == 0 ) {
2940         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2941         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2942         --aExp;
2943     }
2944     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2945     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2946
2947 }
2948
2949 #endif
2950
2951 /*----------------------------------------------------------------------------
2952 | Rounds the double-precision floating-point value `a' to an integer, and
2953 | returns the result as a double-precision floating-point value.  The
2954 | operation is performed according to the IEC/IEEE Standard for Binary
2955 | Floating-Point Arithmetic.
2956 *----------------------------------------------------------------------------*/
2957
2958 float64 float64_round_to_int( float64 a STATUS_PARAM )
2959 {
2960     flag aSign;
2961     int16 aExp;
2962     uint64_t lastBitMask, roundBitsMask;
2963     int8 roundingMode;
2964     uint64_t z;
2965     a = float64_squash_input_denormal(a STATUS_VAR);
2966
2967     aExp = extractFloat64Exp( a );
2968     if ( 0x433 <= aExp ) {
2969         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2970             return propagateFloat64NaN( a, a STATUS_VAR );
2971         }
2972         return a;
2973     }
2974     if ( aExp < 0x3FF ) {
2975         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
2976         STATUS(float_exception_flags) |= float_flag_inexact;
2977         aSign = extractFloat64Sign( a );
2978         switch ( STATUS(float_rounding_mode) ) {
2979          case float_round_nearest_even:
2980             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2981                 return packFloat64( aSign, 0x3FF, 0 );
2982             }
2983             break;
2984          case float_round_down:
2985             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2986          case float_round_up:
2987             return make_float64(
2988             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2989         }
2990         return packFloat64( aSign, 0, 0 );
2991     }
2992     lastBitMask = 1;
2993     lastBitMask <<= 0x433 - aExp;
2994     roundBitsMask = lastBitMask - 1;
2995     z = float64_val(a);
2996     roundingMode = STATUS(float_rounding_mode);
2997     if ( roundingMode == float_round_nearest_even ) {
2998         z += lastBitMask>>1;
2999         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3000     }
3001     else if ( roundingMode != float_round_to_zero ) {
3002         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3003             z += roundBitsMask;
3004         }
3005     }
3006     z &= ~ roundBitsMask;
3007     if ( z != float64_val(a) )
3008         STATUS(float_exception_flags) |= float_flag_inexact;
3009     return make_float64(z);
3010
3011 }
3012
3013 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3014 {
3015     int oldmode;
3016     float64 res;
3017     oldmode = STATUS(float_rounding_mode);
3018     STATUS(float_rounding_mode) = float_round_to_zero;
3019     res = float64_round_to_int(a STATUS_VAR);
3020     STATUS(float_rounding_mode) = oldmode;
3021     return res;
3022 }
3023
3024 /*----------------------------------------------------------------------------
3025 | Returns the result of adding the absolute values of the double-precision
3026 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3027 | before being returned.  `zSign' is ignored if the result is a NaN.
3028 | The addition is performed according to the IEC/IEEE Standard for Binary
3029 | Floating-Point Arithmetic.
3030 *----------------------------------------------------------------------------*/
3031
3032 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3033 {
3034     int16 aExp, bExp, zExp;
3035     uint64_t aSig, bSig, zSig;
3036     int16 expDiff;
3037
3038     aSig = extractFloat64Frac( a );
3039     aExp = extractFloat64Exp( a );
3040     bSig = extractFloat64Frac( b );
3041     bExp = extractFloat64Exp( b );
3042     expDiff = aExp - bExp;
3043     aSig <<= 9;
3044     bSig <<= 9;
3045     if ( 0 < expDiff ) {
3046         if ( aExp == 0x7FF ) {
3047             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3048             return a;
3049         }
3050         if ( bExp == 0 ) {
3051             --expDiff;
3052         }
3053         else {
3054             bSig |= LIT64( 0x2000000000000000 );
3055         }
3056         shift64RightJamming( bSig, expDiff, &bSig );
3057         zExp = aExp;
3058     }
3059     else if ( expDiff < 0 ) {
3060         if ( bExp == 0x7FF ) {
3061             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3062             return packFloat64( zSign, 0x7FF, 0 );
3063         }
3064         if ( aExp == 0 ) {
3065             ++expDiff;
3066         }
3067         else {
3068             aSig |= LIT64( 0x2000000000000000 );
3069         }
3070         shift64RightJamming( aSig, - expDiff, &aSig );
3071         zExp = bExp;
3072     }
3073     else {
3074         if ( aExp == 0x7FF ) {
3075             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3076             return a;
3077         }
3078         if ( aExp == 0 ) {
3079             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3080             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3081         }
3082         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3083         zExp = aExp;
3084         goto roundAndPack;
3085     }
3086     aSig |= LIT64( 0x2000000000000000 );
3087     zSig = ( aSig + bSig )<<1;
3088     --zExp;
3089     if ( (int64_t) zSig < 0 ) {
3090         zSig = aSig + bSig;
3091         ++zExp;
3092     }
3093  roundAndPack:
3094     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3095
3096 }
3097
3098 /*----------------------------------------------------------------------------
3099 | Returns the result of subtracting the absolute values of the double-
3100 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3101 | difference is negated before being returned.  `zSign' is ignored if the
3102 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3103 | Standard for Binary Floating-Point Arithmetic.
3104 *----------------------------------------------------------------------------*/
3105
3106 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3107 {
3108     int16 aExp, bExp, zExp;
3109     uint64_t aSig, bSig, zSig;
3110     int16 expDiff;
3111
3112     aSig = extractFloat64Frac( a );
3113     aExp = extractFloat64Exp( a );
3114     bSig = extractFloat64Frac( b );
3115     bExp = extractFloat64Exp( b );
3116     expDiff = aExp - bExp;
3117     aSig <<= 10;
3118     bSig <<= 10;
3119     if ( 0 < expDiff ) goto aExpBigger;
3120     if ( expDiff < 0 ) goto bExpBigger;
3121     if ( aExp == 0x7FF ) {
3122         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3123         float_raise( float_flag_invalid STATUS_VAR);
3124         return float64_default_nan;
3125     }
3126     if ( aExp == 0 ) {
3127         aExp = 1;
3128         bExp = 1;
3129     }
3130     if ( bSig < aSig ) goto aBigger;
3131     if ( aSig < bSig ) goto bBigger;
3132     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3133  bExpBigger:
3134     if ( bExp == 0x7FF ) {
3135         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3136         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3137     }
3138     if ( aExp == 0 ) {
3139         ++expDiff;
3140     }
3141     else {
3142         aSig |= LIT64( 0x4000000000000000 );
3143     }
3144     shift64RightJamming( aSig, - expDiff, &aSig );
3145     bSig |= LIT64( 0x4000000000000000 );
3146  bBigger:
3147     zSig = bSig - aSig;
3148     zExp = bExp;
3149     zSign ^= 1;
3150     goto normalizeRoundAndPack;
3151  aExpBigger:
3152     if ( aExp == 0x7FF ) {
3153         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3154         return a;
3155     }
3156     if ( bExp == 0 ) {
3157         --expDiff;
3158     }
3159     else {
3160         bSig |= LIT64( 0x4000000000000000 );
3161     }
3162     shift64RightJamming( bSig, expDiff, &bSig );
3163     aSig |= LIT64( 0x4000000000000000 );
3164  aBigger:
3165     zSig = aSig - bSig;
3166     zExp = aExp;
3167  normalizeRoundAndPack:
3168     --zExp;
3169     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3170
3171 }
3172
3173 /*----------------------------------------------------------------------------
3174 | Returns the result of adding the double-precision floating-point values `a'
3175 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3176 | Binary Floating-Point Arithmetic.
3177 *----------------------------------------------------------------------------*/
3178
3179 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3180 {
3181     flag aSign, bSign;
3182     a = float64_squash_input_denormal(a STATUS_VAR);
3183     b = float64_squash_input_denormal(b STATUS_VAR);
3184
3185     aSign = extractFloat64Sign( a );
3186     bSign = extractFloat64Sign( b );
3187     if ( aSign == bSign ) {
3188         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3189     }
3190     else {
3191         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3192     }
3193
3194 }
3195
3196 /*----------------------------------------------------------------------------
3197 | Returns the result of subtracting the double-precision floating-point values
3198 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3199 | for Binary Floating-Point Arithmetic.
3200 *----------------------------------------------------------------------------*/
3201
3202 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3203 {
3204     flag aSign, bSign;
3205     a = float64_squash_input_denormal(a STATUS_VAR);
3206     b = float64_squash_input_denormal(b STATUS_VAR);
3207
3208     aSign = extractFloat64Sign( a );
3209     bSign = extractFloat64Sign( b );
3210     if ( aSign == bSign ) {
3211         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3212     }
3213     else {
3214         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3215     }
3216
3217 }
3218
3219 /*----------------------------------------------------------------------------
3220 | Returns the result of multiplying the double-precision floating-point values
3221 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3222 | for Binary Floating-Point Arithmetic.
3223 *----------------------------------------------------------------------------*/
3224
3225 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3226 {
3227     flag aSign, bSign, zSign;
3228     int16 aExp, bExp, zExp;
3229     uint64_t aSig, bSig, zSig0, zSig1;
3230
3231     a = float64_squash_input_denormal(a STATUS_VAR);
3232     b = float64_squash_input_denormal(b STATUS_VAR);
3233
3234     aSig = extractFloat64Frac( a );
3235     aExp = extractFloat64Exp( a );
3236     aSign = extractFloat64Sign( a );
3237     bSig = extractFloat64Frac( b );
3238     bExp = extractFloat64Exp( b );
3239     bSign = extractFloat64Sign( b );
3240     zSign = aSign ^ bSign;
3241     if ( aExp == 0x7FF ) {
3242         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3243             return propagateFloat64NaN( a, b STATUS_VAR );
3244         }
3245         if ( ( bExp | bSig ) == 0 ) {
3246             float_raise( float_flag_invalid STATUS_VAR);
3247             return float64_default_nan;
3248         }
3249         return packFloat64( zSign, 0x7FF, 0 );
3250     }
3251     if ( bExp == 0x7FF ) {
3252         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3253         if ( ( aExp | aSig ) == 0 ) {
3254             float_raise( float_flag_invalid STATUS_VAR);
3255             return float64_default_nan;
3256         }
3257         return packFloat64( zSign, 0x7FF, 0 );
3258     }
3259     if ( aExp == 0 ) {
3260         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3261         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3262     }
3263     if ( bExp == 0 ) {
3264         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3265         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3266     }
3267     zExp = aExp + bExp - 0x3FF;
3268     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3269     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3270     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3271     zSig0 |= ( zSig1 != 0 );
3272     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3273         zSig0 <<= 1;
3274         --zExp;
3275     }
3276     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3277
3278 }
3279
3280 /*----------------------------------------------------------------------------
3281 | Returns the result of dividing the double-precision floating-point value `a'
3282 | by the corresponding value `b'.  The operation is performed according to
3283 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3284 *----------------------------------------------------------------------------*/
3285
3286 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3287 {
3288     flag aSign, bSign, zSign;
3289     int16 aExp, bExp, zExp;
3290     uint64_t aSig, bSig, zSig;
3291     uint64_t rem0, rem1;
3292     uint64_t term0, term1;
3293     a = float64_squash_input_denormal(a STATUS_VAR);
3294     b = float64_squash_input_denormal(b STATUS_VAR);
3295
3296     aSig = extractFloat64Frac( a );
3297     aExp = extractFloat64Exp( a );
3298     aSign = extractFloat64Sign( a );
3299     bSig = extractFloat64Frac( b );
3300     bExp = extractFloat64Exp( b );
3301     bSign = extractFloat64Sign( b );
3302     zSign = aSign ^ bSign;
3303     if ( aExp == 0x7FF ) {
3304         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3305         if ( bExp == 0x7FF ) {
3306             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3307             float_raise( float_flag_invalid STATUS_VAR);
3308             return float64_default_nan;
3309         }
3310         return packFloat64( zSign, 0x7FF, 0 );
3311     }
3312     if ( bExp == 0x7FF ) {
3313         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3314         return packFloat64( zSign, 0, 0 );
3315     }
3316     if ( bExp == 0 ) {
3317         if ( bSig == 0 ) {
3318             if ( ( aExp | aSig ) == 0 ) {
3319                 float_raise( float_flag_invalid STATUS_VAR);
3320                 return float64_default_nan;
3321             }
3322             float_raise( float_flag_divbyzero STATUS_VAR);
3323             return packFloat64( zSign, 0x7FF, 0 );
3324         }
3325         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3326     }
3327     if ( aExp == 0 ) {
3328         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3329         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3330     }
3331     zExp = aExp - bExp + 0x3FD;
3332     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3333     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3334     if ( bSig <= ( aSig + aSig ) ) {
3335         aSig >>= 1;
3336         ++zExp;
3337     }
3338     zSig = estimateDiv128To64( aSig, 0, bSig );
3339     if ( ( zSig & 0x1FF ) <= 2 ) {
3340         mul64To128( bSig, zSig, &term0, &term1 );
3341         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3342         while ( (int64_t) rem0 < 0 ) {
3343             --zSig;
3344             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3345         }
3346         zSig |= ( rem1 != 0 );
3347     }
3348     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3349
3350 }
3351
3352 /*----------------------------------------------------------------------------
3353 | Returns the remainder of the double-precision floating-point value `a'
3354 | with respect to the corresponding value `b'.  The operation is performed
3355 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3356 *----------------------------------------------------------------------------*/
3357
3358 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3359 {
3360     flag aSign, zSign;
3361     int16 aExp, bExp, expDiff;
3362     uint64_t aSig, bSig;
3363     uint64_t q, alternateASig;
3364     int64_t sigMean;
3365
3366     a = float64_squash_input_denormal(a STATUS_VAR);
3367     b = float64_squash_input_denormal(b STATUS_VAR);
3368     aSig = extractFloat64Frac( a );
3369     aExp = extractFloat64Exp( a );
3370     aSign = extractFloat64Sign( a );
3371     bSig = extractFloat64Frac( b );
3372     bExp = extractFloat64Exp( b );
3373     if ( aExp == 0x7FF ) {
3374         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3375             return propagateFloat64NaN( a, b STATUS_VAR );
3376         }
3377         float_raise( float_flag_invalid STATUS_VAR);
3378         return float64_default_nan;
3379     }
3380     if ( bExp == 0x7FF ) {
3381         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3382         return a;
3383     }
3384     if ( bExp == 0 ) {
3385         if ( bSig == 0 ) {
3386             float_raise( float_flag_invalid STATUS_VAR);
3387             return float64_default_nan;
3388         }
3389         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3390     }
3391     if ( aExp == 0 ) {
3392         if ( aSig == 0 ) return a;
3393         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3394     }
3395     expDiff = aExp - bExp;
3396     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3397     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3398     if ( expDiff < 0 ) {
3399         if ( expDiff < -1 ) return a;
3400         aSig >>= 1;
3401     }
3402     q = ( bSig <= aSig );
3403     if ( q ) aSig -= bSig;
3404     expDiff -= 64;
3405     while ( 0 < expDiff ) {
3406         q = estimateDiv128To64( aSig, 0, bSig );
3407         q = ( 2 < q ) ? q - 2 : 0;
3408         aSig = - ( ( bSig>>2 ) * q );
3409         expDiff -= 62;
3410     }
3411     expDiff += 64;
3412     if ( 0 < expDiff ) {
3413         q = estimateDiv128To64( aSig, 0, bSig );
3414         q = ( 2 < q ) ? q - 2 : 0;
3415         q >>= 64 - expDiff;
3416         bSig >>= 2;
3417         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3418     }
3419     else {
3420         aSig >>= 2;
3421         bSig >>= 2;
3422     }
3423     do {
3424         alternateASig = aSig;
3425         ++q;
3426         aSig -= bSig;
3427     } while ( 0 <= (int64_t) aSig );
3428     sigMean = aSig + alternateASig;
3429     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3430         aSig = alternateASig;
3431     }
3432     zSign = ( (int64_t) aSig < 0 );
3433     if ( zSign ) aSig = - aSig;
3434     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3435
3436 }
3437
3438 /*----------------------------------------------------------------------------
3439 | Returns the square root of the double-precision floating-point value `a'.
3440 | The operation is performed according to the IEC/IEEE Standard for Binary
3441 | Floating-Point Arithmetic.
3442 *----------------------------------------------------------------------------*/
3443
3444 float64 float64_sqrt( float64 a STATUS_PARAM )
3445 {
3446     flag aSign;
3447     int16 aExp, zExp;
3448     uint64_t aSig, zSig, doubleZSig;
3449     uint64_t rem0, rem1, term0, term1;
3450     a = float64_squash_input_denormal(a STATUS_VAR);
3451
3452     aSig = extractFloat64Frac( a );
3453     aExp = extractFloat64Exp( a );
3454     aSign = extractFloat64Sign( a );
3455     if ( aExp == 0x7FF ) {
3456         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3457         if ( ! aSign ) return a;
3458         float_raise( float_flag_invalid STATUS_VAR);
3459         return float64_default_nan;
3460     }
3461     if ( aSign ) {
3462         if ( ( aExp | aSig ) == 0 ) return a;
3463         float_raise( float_flag_invalid STATUS_VAR);
3464         return float64_default_nan;
3465     }
3466     if ( aExp == 0 ) {
3467         if ( aSig == 0 ) return float64_zero;
3468         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3469     }
3470     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3471     aSig |= LIT64( 0x0010000000000000 );
3472     zSig = estimateSqrt32( aExp, aSig>>21 );
3473     aSig <<= 9 - ( aExp & 1 );
3474     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3475     if ( ( zSig & 0x1FF ) <= 5 ) {
3476         doubleZSig = zSig<<1;
3477         mul64To128( zSig, zSig, &term0, &term1 );
3478         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3479         while ( (int64_t) rem0 < 0 ) {
3480             --zSig;
3481             doubleZSig -= 2;
3482             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3483         }
3484         zSig |= ( ( rem0 | rem1 ) != 0 );
3485     }
3486     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3487
3488 }
3489
3490 /*----------------------------------------------------------------------------
3491 | Returns the binary log of the double-precision floating-point value `a'.
3492 | The operation is performed according to the IEC/IEEE Standard for Binary
3493 | Floating-Point Arithmetic.
3494 *----------------------------------------------------------------------------*/
3495 float64 float64_log2( float64 a STATUS_PARAM )
3496 {
3497     flag aSign, zSign;
3498     int16 aExp;
3499     uint64_t aSig, aSig0, aSig1, zSig, i;
3500     a = float64_squash_input_denormal(a STATUS_VAR);
3501
3502     aSig = extractFloat64Frac( a );
3503     aExp = extractFloat64Exp( a );
3504     aSign = extractFloat64Sign( a );
3505
3506     if ( aExp == 0 ) {
3507         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3508         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3509     }
3510     if ( aSign ) {
3511         float_raise( float_flag_invalid STATUS_VAR);
3512         return float64_default_nan;
3513     }
3514     if ( aExp == 0x7FF ) {
3515         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3516         return a;
3517     }
3518
3519     aExp -= 0x3FF;
3520     aSig |= LIT64( 0x0010000000000000 );
3521     zSign = aExp < 0;
3522     zSig = (uint64_t)aExp << 52;
3523     for (i = 1LL << 51; i > 0; i >>= 1) {
3524         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3525         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3526         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3527             aSig >>= 1;
3528             zSig |= i;
3529         }
3530     }
3531
3532     if ( zSign )
3533         zSig = -zSig;
3534     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3535 }
3536
3537 /*----------------------------------------------------------------------------
3538 | Returns 1 if the double-precision floating-point value `a' is equal to the
3539 | corresponding value `b', and 0 otherwise.  The comparison is performed
3540 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3541 *----------------------------------------------------------------------------*/
3542
3543 int float64_eq( float64 a, float64 b STATUS_PARAM )
3544 {
3545     uint64_t av, bv;
3546     a = float64_squash_input_denormal(a STATUS_VAR);
3547     b = float64_squash_input_denormal(b STATUS_VAR);
3548
3549     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3550          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3551        ) {
3552         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3553             float_raise( float_flag_invalid STATUS_VAR);
3554         }
3555         return 0;
3556     }
3557     av = float64_val(a);
3558     bv = float64_val(b);
3559     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3560
3561 }
3562
3563 /*----------------------------------------------------------------------------
3564 | Returns 1 if the double-precision floating-point value `a' is less than or
3565 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3566 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3567 | Arithmetic.
3568 *----------------------------------------------------------------------------*/
3569
3570 int float64_le( float64 a, float64 b STATUS_PARAM )
3571 {
3572     flag aSign, bSign;
3573     uint64_t av, bv;
3574     a = float64_squash_input_denormal(a STATUS_VAR);
3575     b = float64_squash_input_denormal(b STATUS_VAR);
3576
3577     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3578          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3579        ) {
3580         float_raise( float_flag_invalid STATUS_VAR);
3581         return 0;
3582     }
3583     aSign = extractFloat64Sign( a );
3584     bSign = extractFloat64Sign( b );
3585     av = float64_val(a);
3586     bv = float64_val(b);
3587     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3588     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3589
3590 }
3591
3592 /*----------------------------------------------------------------------------
3593 | Returns 1 if the double-precision floating-point value `a' is less than
3594 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3595 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3596 *----------------------------------------------------------------------------*/
3597
3598 int float64_lt( float64 a, float64 b STATUS_PARAM )
3599 {
3600     flag aSign, bSign;
3601     uint64_t av, bv;
3602
3603     a = float64_squash_input_denormal(a STATUS_VAR);
3604     b = float64_squash_input_denormal(b STATUS_VAR);
3605     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3606          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3607        ) {
3608         float_raise( float_flag_invalid STATUS_VAR);
3609         return 0;
3610     }
3611     aSign = extractFloat64Sign( a );
3612     bSign = extractFloat64Sign( b );
3613     av = float64_val(a);
3614     bv = float64_val(b);
3615     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3616     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3617
3618 }
3619
3620 /*----------------------------------------------------------------------------
3621 | Returns 1 if the double-precision floating-point value `a' is equal to the
3622 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3623 | if either operand is a NaN.  Otherwise, the comparison is performed
3624 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3625 *----------------------------------------------------------------------------*/
3626
3627 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3628 {
3629     uint64_t av, bv;
3630     a = float64_squash_input_denormal(a STATUS_VAR);
3631     b = float64_squash_input_denormal(b STATUS_VAR);
3632
3633     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3634          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3635        ) {
3636         float_raise( float_flag_invalid STATUS_VAR);
3637         return 0;
3638     }
3639     av = float64_val(a);
3640     bv = float64_val(b);
3641     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3642
3643 }
3644
3645 /*----------------------------------------------------------------------------
3646 | Returns 1 if the double-precision floating-point value `a' is less than or
3647 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3648 | cause an exception.  Otherwise, the comparison is performed according to the
3649 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3650 *----------------------------------------------------------------------------*/
3651
3652 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3653 {
3654     flag aSign, bSign;
3655     uint64_t av, bv;
3656     a = float64_squash_input_denormal(a STATUS_VAR);
3657     b = float64_squash_input_denormal(b STATUS_VAR);
3658
3659     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3660          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3661        ) {
3662         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3663             float_raise( float_flag_invalid STATUS_VAR);
3664         }
3665         return 0;
3666     }
3667     aSign = extractFloat64Sign( a );
3668     bSign = extractFloat64Sign( b );
3669     av = float64_val(a);
3670     bv = float64_val(b);
3671     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3672     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3673
3674 }
3675
3676 /*----------------------------------------------------------------------------
3677 | Returns 1 if the double-precision floating-point value `a' is less than
3678 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3679 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3680 | Standard for Binary Floating-Point Arithmetic.
3681 *----------------------------------------------------------------------------*/
3682
3683 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3684 {
3685     flag aSign, bSign;
3686     uint64_t av, bv;
3687     a = float64_squash_input_denormal(a STATUS_VAR);
3688     b = float64_squash_input_denormal(b STATUS_VAR);
3689
3690     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3691          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3692        ) {
3693         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3694             float_raise( float_flag_invalid STATUS_VAR);
3695         }
3696         return 0;
3697     }
3698     aSign = extractFloat64Sign( a );
3699     bSign = extractFloat64Sign( b );
3700     av = float64_val(a);
3701     bv = float64_val(b);
3702     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3703     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3704
3705 }
3706
3707 #ifdef FLOATX80
3708
3709 /*----------------------------------------------------------------------------
3710 | Returns the result of converting the extended double-precision floating-
3711 | point value `a' to the 32-bit two's complement integer format.  The
3712 | conversion is performed according to the IEC/IEEE Standard for Binary
3713 | Floating-Point Arithmetic---which means in particular that the conversion
3714 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3715 | largest positive integer is returned.  Otherwise, if the conversion
3716 | overflows, the largest integer with the same sign as `a' is returned.
3717 *----------------------------------------------------------------------------*/
3718
3719 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3720 {
3721     flag aSign;
3722     int32 aExp, shiftCount;
3723     uint64_t aSig;
3724
3725     aSig = extractFloatx80Frac( a );
3726     aExp = extractFloatx80Exp( a );
3727     aSign = extractFloatx80Sign( a );
3728     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3729     shiftCount = 0x4037 - aExp;
3730     if ( shiftCount <= 0 ) shiftCount = 1;
3731     shift64RightJamming( aSig, shiftCount, &aSig );
3732     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3733
3734 }
3735
3736 /*----------------------------------------------------------------------------
3737 | Returns the result of converting the extended double-precision floating-
3738 | point value `a' to the 32-bit two's complement integer format.  The
3739 | conversion is performed according to the IEC/IEEE Standard for Binary
3740 | Floating-Point Arithmetic, except that the conversion is always rounded
3741 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3742 | Otherwise, if the conversion overflows, the largest integer with the same
3743 | sign as `a' is returned.
3744 *----------------------------------------------------------------------------*/
3745
3746 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3747 {
3748     flag aSign;
3749     int32 aExp, shiftCount;
3750     uint64_t aSig, savedASig;
3751     int32 z;
3752
3753     aSig = extractFloatx80Frac( a );
3754     aExp = extractFloatx80Exp( a );
3755     aSign = extractFloatx80Sign( a );
3756     if ( 0x401E < aExp ) {
3757         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3758         goto invalid;
3759     }
3760     else if ( aExp < 0x3FFF ) {
3761         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3762         return 0;
3763     }
3764     shiftCount = 0x403E - aExp;
3765     savedASig = aSig;
3766     aSig >>= shiftCount;
3767     z = aSig;
3768     if ( aSign ) z = - z;
3769     if ( ( z < 0 ) ^ aSign ) {
3770  invalid:
3771         float_raise( float_flag_invalid STATUS_VAR);
3772         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3773     }
3774     if ( ( aSig<<shiftCount ) != savedASig ) {
3775         STATUS(float_exception_flags) |= float_flag_inexact;
3776     }
3777     return z;
3778
3779 }
3780
3781 /*----------------------------------------------------------------------------
3782 | Returns the result of converting the extended double-precision floating-
3783 | point value `a' to the 64-bit two's complement integer format.  The
3784 | conversion is performed according to the IEC/IEEE Standard for Binary
3785 | Floating-Point Arithmetic---which means in particular that the conversion
3786 | is rounded according to the current rounding mode.  If `a' is a NaN,
3787 | the largest positive integer is returned.  Otherwise, if the conversion
3788 | overflows, the largest integer with the same sign as `a' is returned.
3789 *----------------------------------------------------------------------------*/
3790
3791 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3792 {
3793     flag aSign;
3794     int32 aExp, shiftCount;
3795     uint64_t aSig, aSigExtra;
3796
3797     aSig = extractFloatx80Frac( a );
3798     aExp = extractFloatx80Exp( a );
3799     aSign = extractFloatx80Sign( a );
3800     shiftCount = 0x403E - aExp;
3801     if ( shiftCount <= 0 ) {
3802         if ( shiftCount ) {
3803             float_raise( float_flag_invalid STATUS_VAR);
3804             if (    ! aSign
3805                  || (    ( aExp == 0x7FFF )
3806                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3807                ) {
3808                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3809             }
3810             return (int64_t) LIT64( 0x8000000000000000 );
3811         }
3812         aSigExtra = 0;
3813     }
3814     else {
3815         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3816     }
3817     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3818
3819 }
3820
3821 /*----------------------------------------------------------------------------
3822 | Returns the result of converting the extended double-precision floating-
3823 | point value `a' to the 64-bit two's complement integer format.  The
3824 | conversion is performed according to the IEC/IEEE Standard for Binary
3825 | Floating-Point Arithmetic, except that the conversion is always rounded
3826 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3827 | Otherwise, if the conversion overflows, the largest integer with the same
3828 | sign as `a' is returned.
3829 *----------------------------------------------------------------------------*/
3830
3831 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3832 {
3833     flag aSign;
3834     int32 aExp, shiftCount;
3835     uint64_t aSig;
3836     int64 z;
3837
3838     aSig = extractFloatx80Frac( a );
3839     aExp = extractFloatx80Exp( a );
3840     aSign = extractFloatx80Sign( a );
3841     shiftCount = aExp - 0x403E;
3842     if ( 0 <= shiftCount ) {
3843         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3844         if ( ( a.high != 0xC03E ) || aSig ) {
3845             float_raise( float_flag_invalid STATUS_VAR);
3846             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3847                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3848             }
3849         }
3850         return (int64_t) LIT64( 0x8000000000000000 );
3851     }
3852     else if ( aExp < 0x3FFF ) {
3853         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3854         return 0;
3855     }
3856     z = aSig>>( - shiftCount );
3857     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3858         STATUS(float_exception_flags) |= float_flag_inexact;
3859     }
3860     if ( aSign ) z = - z;
3861     return z;
3862
3863 }
3864
3865 /*----------------------------------------------------------------------------
3866 | Returns the result of converting the extended double-precision floating-
3867 | point value `a' to the single-precision floating-point format.  The
3868 | conversion is performed according to the IEC/IEEE Standard for Binary
3869 | Floating-Point Arithmetic.
3870 *----------------------------------------------------------------------------*/
3871
3872 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3873 {
3874     flag aSign;
3875     int32 aExp;
3876     uint64_t aSig;
3877
3878     aSig = extractFloatx80Frac( a );
3879     aExp = extractFloatx80Exp( a );
3880     aSign = extractFloatx80Sign( a );
3881     if ( aExp == 0x7FFF ) {
3882         if ( (uint64_t) ( aSig<<1 ) ) {
3883             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3884         }
3885         return packFloat32( aSign, 0xFF, 0 );
3886     }
3887     shift64RightJamming( aSig, 33, &aSig );
3888     if ( aExp || aSig ) aExp -= 0x3F81;
3889     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3890
3891 }
3892
3893 /*----------------------------------------------------------------------------
3894 | Returns the result of converting the extended double-precision floating-
3895 | point value `a' to the double-precision floating-point format.  The
3896 | conversion is performed according to the IEC/IEEE Standard for Binary
3897 | Floating-Point Arithmetic.
3898 *----------------------------------------------------------------------------*/
3899
3900 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3901 {
3902     flag aSign;
3903     int32 aExp;
3904     uint64_t aSig, zSig;
3905
3906     aSig = extractFloatx80Frac( a );
3907     aExp = extractFloatx80Exp( a );
3908     aSign = extractFloatx80Sign( a );
3909     if ( aExp == 0x7FFF ) {
3910         if ( (uint64_t) ( aSig<<1 ) ) {
3911             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3912         }
3913         return packFloat64( aSign, 0x7FF, 0 );
3914     }
3915     shift64RightJamming( aSig, 1, &zSig );
3916     if ( aExp || aSig ) aExp -= 0x3C01;
3917     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3918
3919 }
3920
3921 #ifdef FLOAT128
3922
3923 /*----------------------------------------------------------------------------
3924 | Returns the result of converting the extended double-precision floating-
3925 | point value `a' to the quadruple-precision floating-point format.  The
3926 | conversion is performed according to the IEC/IEEE Standard for Binary
3927 | Floating-Point Arithmetic.
3928 *----------------------------------------------------------------------------*/
3929
3930 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3931 {
3932     flag aSign;
3933     int16 aExp;
3934     uint64_t aSig, zSig0, zSig1;
3935
3936     aSig = extractFloatx80Frac( a );
3937     aExp = extractFloatx80Exp( a );
3938     aSign = extractFloatx80Sign( a );
3939     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
3940         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3941     }
3942     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3943     return packFloat128( aSign, aExp, zSig0, zSig1 );
3944
3945 }
3946
3947 #endif
3948
3949 /*----------------------------------------------------------------------------
3950 | Rounds the extended double-precision floating-point value `a' to an integer,
3951 | and returns the result as an extended quadruple-precision floating-point
3952 | value.  The operation is performed according to the IEC/IEEE Standard for
3953 | Binary Floating-Point Arithmetic.
3954 *----------------------------------------------------------------------------*/
3955
3956 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3957 {
3958     flag aSign;
3959     int32 aExp;
3960     uint64_t lastBitMask, roundBitsMask;
3961     int8 roundingMode;
3962     floatx80 z;
3963
3964     aExp = extractFloatx80Exp( a );
3965     if ( 0x403E <= aExp ) {
3966         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
3967             return propagateFloatx80NaN( a, a STATUS_VAR );
3968         }
3969         return a;
3970     }
3971     if ( aExp < 0x3FFF ) {
3972         if (    ( aExp == 0 )
3973              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3974             return a;
3975         }
3976         STATUS(float_exception_flags) |= float_flag_inexact;
3977         aSign = extractFloatx80Sign( a );
3978         switch ( STATUS(float_rounding_mode) ) {
3979          case float_round_nearest_even:
3980             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
3981                ) {
3982                 return
3983                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3984             }
3985             break;
3986          case float_round_down:
3987             return
3988                   aSign ?
3989                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3990                 : packFloatx80( 0, 0, 0 );
3991          case float_round_up:
3992             return
3993                   aSign ? packFloatx80( 1, 0, 0 )
3994                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3995         }
3996         return packFloatx80( aSign, 0, 0 );
3997     }
3998     lastBitMask = 1;
3999     lastBitMask <<= 0x403E - aExp;
4000     roundBitsMask = lastBitMask - 1;
4001     z = a;
4002     roundingMode = STATUS(float_rounding_mode);
4003     if ( roundingMode == float_round_nearest_even ) {
4004         z.low += lastBitMask>>1;
4005         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4006     }
4007     else if ( roundingMode != float_round_to_zero ) {
4008         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4009             z.low += roundBitsMask;
4010         }
4011     }
4012     z.low &= ~ roundBitsMask;
4013     if ( z.low == 0 ) {
4014         ++z.high;
4015         z.low = LIT64( 0x8000000000000000 );
4016     }
4017     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4018     return z;
4019
4020 }
4021
4022 /*----------------------------------------------------------------------------
4023 | Returns the result of adding the absolute values of the extended double-
4024 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4025 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4026 | The addition is performed according to the IEC/IEEE Standard for Binary
4027 | Floating-Point Arithmetic.
4028 *----------------------------------------------------------------------------*/
4029
4030 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4031 {
4032     int32 aExp, bExp, zExp;
4033     uint64_t aSig, bSig, zSig0, zSig1;
4034     int32 expDiff;
4035
4036     aSig = extractFloatx80Frac( a );
4037     aExp = extractFloatx80Exp( a );
4038     bSig = extractFloatx80Frac( b );
4039     bExp = extractFloatx80Exp( b );
4040     expDiff = aExp - bExp;
4041     if ( 0 < expDiff ) {
4042         if ( aExp == 0x7FFF ) {
4043             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4044             return a;
4045         }
4046         if ( bExp == 0 ) --expDiff;
4047         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4048         zExp = aExp;
4049     }
4050     else if ( expDiff < 0 ) {
4051         if ( bExp == 0x7FFF ) {
4052             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4053             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4054         }
4055         if ( aExp == 0 ) ++expDiff;
4056         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4057         zExp = bExp;
4058     }
4059     else {
4060         if ( aExp == 0x7FFF ) {
4061             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4062                 return propagateFloatx80NaN( a, b STATUS_VAR );
4063             }
4064             return a;
4065         }
4066         zSig1 = 0;
4067         zSig0 = aSig + bSig;
4068         if ( aExp == 0 ) {
4069             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4070             goto roundAndPack;
4071         }
4072         zExp = aExp;
4073         goto shiftRight1;
4074     }
4075     zSig0 = aSig + bSig;
4076     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4077  shiftRight1:
4078     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4079     zSig0 |= LIT64( 0x8000000000000000 );
4080     ++zExp;
4081  roundAndPack:
4082     return
4083         roundAndPackFloatx80(
4084             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4085
4086 }
4087
4088 /*----------------------------------------------------------------------------
4089 | Returns the result of subtracting the absolute values of the extended
4090 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4091 | difference is negated before being returned.  `zSign' is ignored if the
4092 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4093 | Standard for Binary Floating-Point Arithmetic.
4094 *----------------------------------------------------------------------------*/
4095
4096 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4097 {
4098     int32 aExp, bExp, zExp;
4099     uint64_t aSig, bSig, zSig0, zSig1;
4100     int32 expDiff;
4101     floatx80 z;
4102
4103     aSig = extractFloatx80Frac( a );
4104     aExp = extractFloatx80Exp( a );
4105     bSig = extractFloatx80Frac( b );
4106     bExp = extractFloatx80Exp( b );
4107     expDiff = aExp - bExp;
4108     if ( 0 < expDiff ) goto aExpBigger;
4109     if ( expDiff < 0 ) goto bExpBigger;
4110     if ( aExp == 0x7FFF ) {
4111         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4112             return propagateFloatx80NaN( a, b STATUS_VAR );
4113         }
4114         float_raise( float_flag_invalid STATUS_VAR);
4115         z.low = floatx80_default_nan_low;
4116         z.high = floatx80_default_nan_high;
4117         return z;
4118     }
4119     if ( aExp == 0 ) {
4120         aExp = 1;
4121         bExp = 1;
4122     }
4123     zSig1 = 0;
4124     if ( bSig < aSig ) goto aBigger;
4125     if ( aSig < bSig ) goto bBigger;
4126     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4127  bExpBigger:
4128     if ( bExp == 0x7FFF ) {
4129         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4130         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4131     }
4132     if ( aExp == 0 ) ++expDiff;
4133     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4134  bBigger:
4135     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4136     zExp = bExp;
4137     zSign ^= 1;
4138     goto normalizeRoundAndPack;
4139  aExpBigger:
4140     if ( aExp == 0x7FFF ) {
4141         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4142         return a;
4143     }
4144     if ( bExp == 0 ) --expDiff;
4145     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4146  aBigger:
4147     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4148     zExp = aExp;
4149  normalizeRoundAndPack:
4150     return
4151         normalizeRoundAndPackFloatx80(
4152             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4153
4154 }
4155
4156 /*----------------------------------------------------------------------------
4157 | Returns the result of adding the extended double-precision floating-point
4158 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4159 | Standard for Binary Floating-Point Arithmetic.
4160 *----------------------------------------------------------------------------*/
4161
4162 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4163 {
4164     flag aSign, bSign;
4165
4166     aSign = extractFloatx80Sign( a );
4167     bSign = extractFloatx80Sign( b );
4168     if ( aSign == bSign ) {
4169         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4170     }
4171     else {
4172         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4173     }
4174
4175 }
4176
4177 /*----------------------------------------------------------------------------
4178 | Returns the result of subtracting the extended double-precision floating-
4179 | point values `a' and `b'.  The operation is performed according to the
4180 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4181 *----------------------------------------------------------------------------*/
4182
4183 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4184 {
4185     flag aSign, bSign;
4186
4187     aSign = extractFloatx80Sign( a );
4188     bSign = extractFloatx80Sign( b );
4189     if ( aSign == bSign ) {
4190         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4191     }
4192     else {
4193         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4194     }
4195
4196 }
4197
4198 /*----------------------------------------------------------------------------
4199 | Returns the result of multiplying the extended double-precision floating-
4200 | point values `a' and `b'.  The operation is performed according to the
4201 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4202 *----------------------------------------------------------------------------*/
4203
4204 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4205 {
4206     flag aSign, bSign, zSign;
4207     int32 aExp, bExp, zExp;
4208     uint64_t aSig, bSig, zSig0, zSig1;
4209     floatx80 z;
4210
4211     aSig = extractFloatx80Frac( a );
4212     aExp = extractFloatx80Exp( a );
4213     aSign = extractFloatx80Sign( a );
4214     bSig = extractFloatx80Frac( b );
4215     bExp = extractFloatx80Exp( b );
4216     bSign = extractFloatx80Sign( b );
4217     zSign = aSign ^ bSign;
4218     if ( aExp == 0x7FFF ) {
4219         if (    (uint64_t) ( aSig<<1 )
4220              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4221             return propagateFloatx80NaN( a, b STATUS_VAR );
4222         }
4223         if ( ( bExp | bSig ) == 0 ) goto invalid;
4224         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4225     }
4226     if ( bExp == 0x7FFF ) {
4227         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4228         if ( ( aExp | aSig ) == 0 ) {
4229  invalid:
4230             float_raise( float_flag_invalid STATUS_VAR);
4231             z.low = floatx80_default_nan_low;
4232             z.high = floatx80_default_nan_high;
4233             return z;
4234         }
4235         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4236     }
4237     if ( aExp == 0 ) {
4238         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4239         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4240     }
4241     if ( bExp == 0 ) {
4242         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4243         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4244     }
4245     zExp = aExp + bExp - 0x3FFE;
4246     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4247     if ( 0 < (int64_t) zSig0 ) {
4248         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4249         --zExp;
4250     }
4251     return
4252         roundAndPackFloatx80(
4253             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4254
4255 }
4256
4257 /*----------------------------------------------------------------------------
4258 | Returns the result of dividing the extended double-precision floating-point
4259 | value `a' by the corresponding value `b'.  The operation is performed
4260 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4261 *----------------------------------------------------------------------------*/
4262
4263 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4264 {
4265     flag aSign, bSign, zSign;
4266     int32 aExp, bExp, zExp;
4267     uint64_t aSig, bSig, zSig0, zSig1;
4268     uint64_t rem0, rem1, rem2, term0, term1, term2;
4269     floatx80 z;
4270
4271     aSig = extractFloatx80Frac( a );
4272     aExp = extractFloatx80Exp( a );
4273     aSign = extractFloatx80Sign( a );
4274     bSig = extractFloatx80Frac( b );
4275     bExp = extractFloatx80Exp( b );
4276     bSign = extractFloatx80Sign( b );
4277     zSign = aSign ^ bSign;
4278     if ( aExp == 0x7FFF ) {
4279         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4280         if ( bExp == 0x7FFF ) {
4281             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4282             goto invalid;
4283         }
4284         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4285     }
4286     if ( bExp == 0x7FFF ) {
4287         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4288         return packFloatx80( zSign, 0, 0 );
4289     }
4290     if ( bExp == 0 ) {
4291         if ( bSig == 0 ) {
4292             if ( ( aExp | aSig ) == 0 ) {
4293  invalid:
4294                 float_raise( float_flag_invalid STATUS_VAR);
4295                 z.low = floatx80_default_nan_low;
4296                 z.high = floatx80_default_nan_high;
4297                 return z;
4298             }
4299             float_raise( float_flag_divbyzero STATUS_VAR);
4300             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4301         }
4302         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4303     }
4304     if ( aExp == 0 ) {
4305         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4306         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4307     }
4308     zExp = aExp - bExp + 0x3FFE;
4309     rem1 = 0;
4310     if ( bSig <= aSig ) {
4311         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4312         ++zExp;
4313     }
4314     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4315     mul64To128( bSig, zSig0, &term0, &term1 );
4316     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4317     while ( (int64_t) rem0 < 0 ) {
4318         --zSig0;
4319         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4320     }
4321     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4322     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4323         mul64To128( bSig, zSig1, &term1, &term2 );
4324         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4325         while ( (int64_t) rem1 < 0 ) {
4326             --zSig1;
4327             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4328         }
4329         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4330     }
4331     return
4332         roundAndPackFloatx80(
4333             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4334
4335 }
4336
4337 /*----------------------------------------------------------------------------
4338 | Returns the remainder of the extended double-precision floating-point value
4339 | `a' with respect to the corresponding value `b'.  The operation is performed
4340 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4341 *----------------------------------------------------------------------------*/
4342
4343 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4344 {
4345     flag aSign, zSign;
4346     int32 aExp, bExp, expDiff;
4347     uint64_t aSig0, aSig1, bSig;
4348     uint64_t q, term0, term1, alternateASig0, alternateASig1;
4349     floatx80 z;
4350
4351     aSig0 = extractFloatx80Frac( a );
4352     aExp = extractFloatx80Exp( a );
4353     aSign = extractFloatx80Sign( a );
4354     bSig = extractFloatx80Frac( b );
4355     bExp = extractFloatx80Exp( b );
4356     if ( aExp == 0x7FFF ) {
4357         if (    (uint64_t) ( aSig0<<1 )
4358              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4359             return propagateFloatx80NaN( a, b STATUS_VAR );
4360         }
4361         goto invalid;
4362     }
4363     if ( bExp == 0x7FFF ) {
4364         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4365         return a;
4366     }
4367     if ( bExp == 0 ) {
4368         if ( bSig == 0 ) {
4369  invalid:
4370             float_raise( float_flag_invalid STATUS_VAR);
4371             z.low = floatx80_default_nan_low;
4372             z.high = floatx80_default_nan_high;
4373             return z;
4374         }
4375         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4376     }
4377     if ( aExp == 0 ) {
4378         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4379         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4380     }
4381     bSig |= LIT64( 0x8000000000000000 );
4382     zSign = aSign;
4383     expDiff = aExp - bExp;
4384     aSig1 = 0;
4385     if ( expDiff < 0 ) {
4386         if ( expDiff < -1 ) return a;
4387         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4388         expDiff = 0;
4389     }
4390     q = ( bSig <= aSig0 );
4391     if ( q ) aSig0 -= bSig;
4392     expDiff -= 64;
4393     while ( 0 < expDiff ) {
4394         q = estimateDiv128To64( aSig0, aSig1, bSig );
4395         q = ( 2 < q ) ? q - 2 : 0;
4396         mul64To128( bSig, q, &term0, &term1 );
4397         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4398         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4399         expDiff -= 62;
4400     }
4401     expDiff += 64;
4402     if ( 0 < expDiff ) {
4403         q = estimateDiv128To64( aSig0, aSig1, bSig );
4404         q = ( 2 < q ) ? q - 2 : 0;
4405         q >>= 64 - expDiff;
4406         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4407         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4408         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4409         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4410             ++q;
4411             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4412         }
4413     }
4414     else {
4415         term1 = 0;
4416         term0 = bSig;
4417     }
4418     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4419     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4420          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4421               && ( q & 1 ) )
4422        ) {
4423         aSig0 = alternateASig0;
4424         aSig1 = alternateASig1;
4425         zSign = ! zSign;
4426     }
4427     return
4428         normalizeRoundAndPackFloatx80(
4429             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4430
4431 }
4432
4433 /*----------------------------------------------------------------------------
4434 | Returns the square root of the extended double-precision floating-point
4435 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4436 | for Binary Floating-Point Arithmetic.
4437 *----------------------------------------------------------------------------*/
4438
4439 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4440 {
4441     flag aSign;
4442     int32 aExp, zExp;
4443     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4444     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4445     floatx80 z;
4446
4447     aSig0 = extractFloatx80Frac( a );
4448     aExp = extractFloatx80Exp( a );
4449     aSign = extractFloatx80Sign( a );
4450     if ( aExp == 0x7FFF ) {
4451         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4452         if ( ! aSign ) return a;
4453         goto invalid;
4454     }
4455     if ( aSign ) {
4456         if ( ( aExp | aSig0 ) == 0 ) return a;
4457  invalid:
4458         float_raise( float_flag_invalid STATUS_VAR);
4459         z.low = floatx80_default_nan_low;
4460         z.high = floatx80_default_nan_high;
4461         return z;
4462     }
4463     if ( aExp == 0 ) {
4464         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4465         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4466     }
4467     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4468     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4469     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4470     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4471     doubleZSig0 = zSig0<<1;
4472     mul64To128( zSig0, zSig0, &term0, &term1 );
4473     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4474     while ( (int64_t) rem0 < 0 ) {
4475         --zSig0;
4476         doubleZSig0 -= 2;
4477         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4478     }
4479     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4480     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4481         if ( zSig1 == 0 ) zSig1 = 1;
4482         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4483         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4484         mul64To128( zSig1, zSig1, &term2, &term3 );
4485         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4486         while ( (int64_t) rem1 < 0 ) {
4487             --zSig1;
4488             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4489             term3 |= 1;
4490             term2 |= doubleZSig0;
4491             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4492         }
4493         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4494     }
4495     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4496     zSig0 |= doubleZSig0;
4497     return
4498         roundAndPackFloatx80(
4499             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4500
4501 }
4502
4503 /*----------------------------------------------------------------------------
4504 | Returns 1 if the extended double-precision floating-point value `a' is
4505 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
4506 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4507 | Arithmetic.
4508 *----------------------------------------------------------------------------*/
4509
4510 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4511 {
4512
4513     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4514               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4515          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4516               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4517        ) {
4518         if (    floatx80_is_signaling_nan( a )
4519              || floatx80_is_signaling_nan( b ) ) {
4520             float_raise( float_flag_invalid STATUS_VAR);
4521         }
4522         return 0;
4523     }
4524     return
4525            ( a.low == b.low )
4526         && (    ( a.high == b.high )
4527              || (    ( a.low == 0 )
4528                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4529            );
4530
4531 }
4532
4533 /*----------------------------------------------------------------------------
4534 | Returns 1 if the extended double-precision floating-point value `a' is
4535 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4536 | comparison is performed according to the IEC/IEEE Standard for Binary
4537 | Floating-Point Arithmetic.
4538 *----------------------------------------------------------------------------*/
4539
4540 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4541 {
4542     flag aSign, bSign;
4543
4544     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4545               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4546          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4547               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4548        ) {
4549         float_raise( float_flag_invalid STATUS_VAR);
4550         return 0;
4551     }
4552     aSign = extractFloatx80Sign( a );
4553     bSign = extractFloatx80Sign( b );
4554     if ( aSign != bSign ) {
4555         return
4556                aSign
4557             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4558                  == 0 );
4559     }
4560     return
4561           aSign ? le128( b.high, b.low, a.high, a.low )
4562         : le128( a.high, a.low, b.high, b.low );
4563
4564 }
4565
4566 /*----------------------------------------------------------------------------
4567 | Returns 1 if the extended double-precision floating-point value `a' is
4568 | less than the corresponding value `b', and 0 otherwise.  The comparison
4569 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4570 | Arithmetic.
4571 *----------------------------------------------------------------------------*/
4572
4573 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4574 {
4575     flag aSign, bSign;
4576
4577     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4578               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4579          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4580               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4581        ) {
4582         float_raise( float_flag_invalid STATUS_VAR);
4583         return 0;
4584     }
4585     aSign = extractFloatx80Sign( a );
4586     bSign = extractFloatx80Sign( b );
4587     if ( aSign != bSign ) {
4588         return
4589                aSign
4590             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4591                  != 0 );
4592     }
4593     return
4594           aSign ? lt128( b.high, b.low, a.high, a.low )
4595         : lt128( a.high, a.low, b.high, b.low );
4596
4597 }
4598
4599 /*----------------------------------------------------------------------------
4600 | Returns 1 if the extended double-precision floating-point value `a' is equal
4601 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4602 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4603 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4604 *----------------------------------------------------------------------------*/
4605
4606 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4607 {
4608
4609     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4610               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4611          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4612               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4613        ) {
4614         float_raise( float_flag_invalid STATUS_VAR);
4615         return 0;
4616     }
4617     return
4618            ( a.low == b.low )
4619         && (    ( a.high == b.high )
4620              || (    ( a.low == 0 )
4621                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4622            );
4623
4624 }
4625
4626 /*----------------------------------------------------------------------------
4627 | Returns 1 if the extended double-precision floating-point value `a' is less
4628 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4629 | do not cause an exception.  Otherwise, the comparison is performed according
4630 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4631 *----------------------------------------------------------------------------*/
4632
4633 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4634 {
4635     flag aSign, bSign;
4636
4637     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4638               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4639          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4640               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4641        ) {
4642         if (    floatx80_is_signaling_nan( a )
4643              || floatx80_is_signaling_nan( b ) ) {
4644             float_raise( float_flag_invalid STATUS_VAR);
4645         }
4646         return 0;
4647     }
4648     aSign = extractFloatx80Sign( a );
4649     bSign = extractFloatx80Sign( b );
4650     if ( aSign != bSign ) {
4651         return
4652                aSign
4653             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4654                  == 0 );
4655     }
4656     return
4657           aSign ? le128( b.high, b.low, a.high, a.low )
4658         : le128( a.high, a.low, b.high, b.low );
4659
4660 }
4661
4662 /*----------------------------------------------------------------------------
4663 | Returns 1 if the extended double-precision floating-point value `a' is less
4664 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4665 | an exception.  Otherwise, the comparison is performed according to the
4666 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4667 *----------------------------------------------------------------------------*/
4668
4669 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4670 {
4671     flag aSign, bSign;
4672
4673     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4674               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4675          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4676               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4677        ) {
4678         if (    floatx80_is_signaling_nan( a )
4679              || floatx80_is_signaling_nan( b ) ) {
4680             float_raise( float_flag_invalid STATUS_VAR);
4681         }
4682         return 0;
4683     }
4684     aSign = extractFloatx80Sign( a );
4685     bSign = extractFloatx80Sign( b );
4686     if ( aSign != bSign ) {
4687         return
4688                aSign
4689             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4690                  != 0 );
4691     }
4692     return
4693           aSign ? lt128( b.high, b.low, a.high, a.low )
4694         : lt128( a.high, a.low, b.high, b.low );
4695
4696 }
4697
4698 #endif
4699
4700 #ifdef FLOAT128
4701
4702 /*----------------------------------------------------------------------------
4703 | Returns the result of converting the quadruple-precision floating-point
4704 | value `a' to the 32-bit two's complement integer format.  The conversion
4705 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4706 | Arithmetic---which means in particular that the conversion is rounded
4707 | according to the current rounding mode.  If `a' is a NaN, the largest
4708 | positive integer is returned.  Otherwise, if the conversion overflows, the
4709 | largest integer with the same sign as `a' is returned.
4710 *----------------------------------------------------------------------------*/
4711
4712 int32 float128_to_int32( float128 a STATUS_PARAM )
4713 {
4714     flag aSign;
4715     int32 aExp, shiftCount;
4716     uint64_t aSig0, aSig1;
4717
4718     aSig1 = extractFloat128Frac1( a );
4719     aSig0 = extractFloat128Frac0( a );
4720     aExp = extractFloat128Exp( a );
4721     aSign = extractFloat128Sign( a );
4722     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4723     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4724     aSig0 |= ( aSig1 != 0 );
4725     shiftCount = 0x4028 - aExp;
4726     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4727     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4728
4729 }
4730
4731 /*----------------------------------------------------------------------------
4732 | Returns the result of converting the quadruple-precision floating-point
4733 | value `a' to the 32-bit two's complement integer format.  The conversion
4734 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4735 | Arithmetic, except that the conversion is always rounded toward zero.  If
4736 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4737 | conversion overflows, the largest integer with the same sign as `a' is
4738 | returned.
4739 *----------------------------------------------------------------------------*/
4740
4741 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4742 {
4743     flag aSign;
4744     int32 aExp, shiftCount;
4745     uint64_t aSig0, aSig1, savedASig;
4746     int32 z;
4747
4748     aSig1 = extractFloat128Frac1( a );
4749     aSig0 = extractFloat128Frac0( a );
4750     aExp = extractFloat128Exp( a );
4751     aSign = extractFloat128Sign( a );
4752     aSig0 |= ( aSig1 != 0 );
4753     if ( 0x401E < aExp ) {
4754         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4755         goto invalid;
4756     }
4757     else if ( aExp < 0x3FFF ) {
4758         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4759         return 0;
4760     }
4761     aSig0 |= LIT64( 0x0001000000000000 );
4762     shiftCount = 0x402F - aExp;
4763     savedASig = aSig0;
4764     aSig0 >>= shiftCount;
4765     z = aSig0;
4766     if ( aSign ) z = - z;
4767     if ( ( z < 0 ) ^ aSign ) {
4768  invalid:
4769         float_raise( float_flag_invalid STATUS_VAR);
4770         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4771     }
4772     if ( ( aSig0<<shiftCount ) != savedASig ) {
4773         STATUS(float_exception_flags) |= float_flag_inexact;
4774     }
4775     return z;
4776
4777 }
4778
4779 /*----------------------------------------------------------------------------
4780 | Returns the result of converting the quadruple-precision floating-point
4781 | value `a' to the 64-bit two's complement integer format.  The conversion
4782 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4783 | Arithmetic---which means in particular that the conversion is rounded
4784 | according to the current rounding mode.  If `a' is a NaN, the largest
4785 | positive integer is returned.  Otherwise, if the conversion overflows, the
4786 | largest integer with the same sign as `a' is returned.
4787 *----------------------------------------------------------------------------*/
4788
4789 int64 float128_to_int64( float128 a STATUS_PARAM )
4790 {
4791     flag aSign;
4792     int32 aExp, shiftCount;
4793     uint64_t aSig0, aSig1;
4794
4795     aSig1 = extractFloat128Frac1( a );
4796     aSig0 = extractFloat128Frac0( a );
4797     aExp = extractFloat128Exp( a );
4798     aSign = extractFloat128Sign( a );
4799     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4800     shiftCount = 0x402F - aExp;
4801     if ( shiftCount <= 0 ) {
4802         if ( 0x403E < aExp ) {
4803             float_raise( float_flag_invalid STATUS_VAR);
4804             if (    ! aSign
4805                  || (    ( aExp == 0x7FFF )
4806                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4807                     )
4808                ) {
4809                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4810             }
4811             return (int64_t) LIT64( 0x8000000000000000 );
4812         }
4813         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4814     }
4815     else {
4816         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4817     }
4818     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4819
4820 }
4821
4822 /*----------------------------------------------------------------------------
4823 | Returns the result of converting the quadruple-precision floating-point
4824 | value `a' to the 64-bit two's complement integer format.  The conversion
4825 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4826 | Arithmetic, except that the conversion is always rounded toward zero.
4827 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4828 | the conversion overflows, the largest integer with the same sign as `a' is
4829 | returned.
4830 *----------------------------------------------------------------------------*/
4831
4832 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4833 {
4834     flag aSign;
4835     int32 aExp, shiftCount;
4836     uint64_t aSig0, aSig1;
4837     int64 z;
4838
4839     aSig1 = extractFloat128Frac1( a );
4840     aSig0 = extractFloat128Frac0( a );
4841     aExp = extractFloat128Exp( a );
4842     aSign = extractFloat128Sign( a );
4843     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4844     shiftCount = aExp - 0x402F;
4845     if ( 0 < shiftCount ) {
4846         if ( 0x403E <= aExp ) {
4847             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4848             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4849                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4850                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4851             }
4852             else {
4853                 float_raise( float_flag_invalid STATUS_VAR);
4854                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4855                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4856                 }
4857             }
4858             return (int64_t) LIT64( 0x8000000000000000 );
4859         }
4860         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4861         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4862             STATUS(float_exception_flags) |= float_flag_inexact;
4863         }
4864     }
4865     else {
4866         if ( aExp < 0x3FFF ) {
4867             if ( aExp | aSig0 | aSig1 ) {
4868                 STATUS(float_exception_flags) |= float_flag_inexact;
4869             }
4870             return 0;
4871         }
4872         z = aSig0>>( - shiftCount );
4873         if (    aSig1
4874              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4875             STATUS(float_exception_flags) |= float_flag_inexact;
4876         }
4877     }
4878     if ( aSign ) z = - z;
4879     return z;
4880
4881 }
4882
4883 /*----------------------------------------------------------------------------
4884 | Returns the result of converting the quadruple-precision floating-point
4885 | value `a' to the single-precision floating-point format.  The conversion
4886 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4887 | Arithmetic.
4888 *----------------------------------------------------------------------------*/
4889
4890 float32 float128_to_float32( float128 a STATUS_PARAM )
4891 {
4892     flag aSign;
4893     int32 aExp;
4894     uint64_t aSig0, aSig1;
4895     uint32_t zSig;
4896
4897     aSig1 = extractFloat128Frac1( a );
4898     aSig0 = extractFloat128Frac0( a );
4899     aExp = extractFloat128Exp( a );
4900     aSign = extractFloat128Sign( a );
4901     if ( aExp == 0x7FFF ) {
4902         if ( aSig0 | aSig1 ) {
4903             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4904         }
4905         return packFloat32( aSign, 0xFF, 0 );
4906     }
4907     aSig0 |= ( aSig1 != 0 );
4908     shift64RightJamming( aSig0, 18, &aSig0 );
4909     zSig = aSig0;
4910     if ( aExp || zSig ) {
4911         zSig |= 0x40000000;
4912         aExp -= 0x3F81;
4913     }
4914     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4915
4916 }
4917
4918 /*----------------------------------------------------------------------------
4919 | Returns the result of converting the quadruple-precision floating-point
4920 | value `a' to the double-precision floating-point format.  The conversion
4921 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4922 | Arithmetic.
4923 *----------------------------------------------------------------------------*/
4924
4925 float64 float128_to_float64( float128 a STATUS_PARAM )
4926 {
4927     flag aSign;
4928     int32 aExp;
4929     uint64_t aSig0, aSig1;
4930
4931     aSig1 = extractFloat128Frac1( a );
4932     aSig0 = extractFloat128Frac0( a );
4933     aExp = extractFloat128Exp( a );
4934     aSign = extractFloat128Sign( a );
4935     if ( aExp == 0x7FFF ) {
4936         if ( aSig0 | aSig1 ) {
4937             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4938         }
4939         return packFloat64( aSign, 0x7FF, 0 );
4940     }
4941     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4942     aSig0 |= ( aSig1 != 0 );
4943     if ( aExp || aSig0 ) {
4944         aSig0 |= LIT64( 0x4000000000000000 );
4945         aExp -= 0x3C01;
4946     }
4947     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4948
4949 }
4950
4951 #ifdef FLOATX80
4952
4953 /*----------------------------------------------------------------------------
4954 | Returns the result of converting the quadruple-precision floating-point
4955 | value `a' to the extended double-precision floating-point format.  The
4956 | conversion is performed according to the IEC/IEEE Standard for Binary
4957 | Floating-Point Arithmetic.
4958 *----------------------------------------------------------------------------*/
4959
4960 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4961 {
4962     flag aSign;
4963     int32 aExp;
4964     uint64_t aSig0, aSig1;
4965
4966     aSig1 = extractFloat128Frac1( a );
4967     aSig0 = extractFloat128Frac0( a );
4968     aExp = extractFloat128Exp( a );
4969     aSign = extractFloat128Sign( a );
4970     if ( aExp == 0x7FFF ) {
4971         if ( aSig0 | aSig1 ) {
4972             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4973         }
4974         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4975     }
4976     if ( aExp == 0 ) {
4977         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4978         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4979     }
4980     else {
4981         aSig0 |= LIT64( 0x0001000000000000 );
4982     }
4983     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4984     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4985
4986 }
4987
4988 #endif
4989
4990 /*----------------------------------------------------------------------------
4991 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4992 | returns the result as a quadruple-precision floating-point value.  The
4993 | operation is performed according to the IEC/IEEE Standard for Binary
4994 | Floating-Point Arithmetic.
4995 *----------------------------------------------------------------------------*/
4996
4997 float128 float128_round_to_int( float128 a STATUS_PARAM )
4998 {
4999     flag aSign;
5000     int32 aExp;
5001     uint64_t lastBitMask, roundBitsMask;
5002     int8 roundingMode;
5003     float128 z;
5004
5005     aExp = extractFloat128Exp( a );
5006     if ( 0x402F <= aExp ) {
5007         if ( 0x406F <= aExp ) {
5008             if (    ( aExp == 0x7FFF )
5009                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5010                ) {
5011                 return propagateFloat128NaN( a, a STATUS_VAR );
5012             }
5013             return a;
5014         }
5015         lastBitMask = 1;
5016         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5017         roundBitsMask = lastBitMask - 1;
5018         z = a;
5019         roundingMode = STATUS(float_rounding_mode);
5020         if ( roundingMode == float_round_nearest_even ) {
5021             if ( lastBitMask ) {
5022                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5023                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5024             }
5025             else {
5026                 if ( (int64_t) z.low < 0 ) {
5027                     ++z.high;
5028                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5029                 }
5030             }
5031         }
5032         else if ( roundingMode != float_round_to_zero ) {
5033             if (   extractFloat128Sign( z )
5034                  ^ ( roundingMode == float_round_up ) ) {
5035                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5036             }
5037         }
5038         z.low &= ~ roundBitsMask;
5039     }
5040     else {
5041         if ( aExp < 0x3FFF ) {
5042             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5043             STATUS(float_exception_flags) |= float_flag_inexact;
5044             aSign = extractFloat128Sign( a );
5045             switch ( STATUS(float_rounding_mode) ) {
5046              case float_round_nearest_even:
5047                 if (    ( aExp == 0x3FFE )
5048                      && (   extractFloat128Frac0( a )
5049                           | extractFloat128Frac1( a ) )
5050                    ) {
5051                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5052                 }
5053                 break;
5054              case float_round_down:
5055                 return
5056                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5057                     : packFloat128( 0, 0, 0, 0 );
5058              case float_round_up:
5059                 return
5060                       aSign ? packFloat128( 1, 0, 0, 0 )
5061                     : packFloat128( 0, 0x3FFF, 0, 0 );
5062             }
5063             return packFloat128( aSign, 0, 0, 0 );
5064         }
5065         lastBitMask = 1;
5066         lastBitMask <<= 0x402F - aExp;
5067         roundBitsMask = lastBitMask - 1;
5068         z.low = 0;
5069         z.high = a.high;
5070         roundingMode = STATUS(float_rounding_mode);
5071         if ( roundingMode == float_round_nearest_even ) {
5072             z.high += lastBitMask>>1;
5073             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5074                 z.high &= ~ lastBitMask;
5075             }
5076         }
5077         else if ( roundingMode != float_round_to_zero ) {
5078             if (   extractFloat128Sign( z )
5079                  ^ ( roundingMode == float_round_up ) ) {
5080                 z.high |= ( a.low != 0 );
5081                 z.high += roundBitsMask;
5082             }
5083         }
5084         z.high &= ~ roundBitsMask;
5085     }
5086     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5087         STATUS(float_exception_flags) |= float_flag_inexact;
5088     }
5089     return z;
5090
5091 }
5092
5093 /*----------------------------------------------------------------------------
5094 | Returns the result of adding the absolute values of the quadruple-precision
5095 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5096 | before being returned.  `zSign' is ignored if the result is a NaN.
5097 | The addition is performed according to the IEC/IEEE Standard for Binary
5098 | Floating-Point Arithmetic.
5099 *----------------------------------------------------------------------------*/
5100
5101 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5102 {
5103     int32 aExp, bExp, zExp;
5104     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5105     int32 expDiff;
5106
5107     aSig1 = extractFloat128Frac1( a );
5108     aSig0 = extractFloat128Frac0( a );
5109     aExp = extractFloat128Exp( a );
5110     bSig1 = extractFloat128Frac1( b );
5111     bSig0 = extractFloat128Frac0( b );
5112     bExp = extractFloat128Exp( b );
5113     expDiff = aExp - bExp;
5114     if ( 0 < expDiff ) {
5115         if ( aExp == 0x7FFF ) {
5116             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5117             return a;
5118         }
5119         if ( bExp == 0 ) {
5120             --expDiff;
5121         }
5122         else {
5123             bSig0 |= LIT64( 0x0001000000000000 );
5124         }
5125         shift128ExtraRightJamming(
5126             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5127         zExp = aExp;
5128     }
5129     else if ( expDiff < 0 ) {
5130         if ( bExp == 0x7FFF ) {
5131             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5132             return packFloat128( zSign, 0x7FFF, 0, 0 );
5133         }
5134         if ( aExp == 0 ) {
5135             ++expDiff;
5136         }
5137         else {
5138             aSig0 |= LIT64( 0x0001000000000000 );
5139         }
5140         shift128ExtraRightJamming(
5141             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5142         zExp = bExp;
5143     }
5144     else {
5145         if ( aExp == 0x7FFF ) {
5146             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5147                 return propagateFloat128NaN( a, b STATUS_VAR );
5148             }
5149             return a;
5150         }
5151         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5152         if ( aExp == 0 ) {
5153             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5154             return packFloat128( zSign, 0, zSig0, zSig1 );
5155         }
5156         zSig2 = 0;
5157         zSig0 |= LIT64( 0x0002000000000000 );
5158         zExp = aExp;
5159         goto shiftRight1;
5160     }
5161     aSig0 |= LIT64( 0x0001000000000000 );
5162     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5163     --zExp;
5164     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5165     ++zExp;
5166  shiftRight1:
5167     shift128ExtraRightJamming(
5168         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5169  roundAndPack:
5170     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5171
5172 }
5173
5174 /*----------------------------------------------------------------------------
5175 | Returns the result of subtracting the absolute values of the quadruple-
5176 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5177 | difference is negated before being returned.  `zSign' is ignored if the
5178 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5179 | Standard for Binary Floating-Point Arithmetic.
5180 *----------------------------------------------------------------------------*/
5181
5182 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5183 {
5184     int32 aExp, bExp, zExp;
5185     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5186     int32 expDiff;
5187     float128 z;
5188
5189     aSig1 = extractFloat128Frac1( a );
5190     aSig0 = extractFloat128Frac0( a );
5191     aExp = extractFloat128Exp( a );
5192     bSig1 = extractFloat128Frac1( b );
5193     bSig0 = extractFloat128Frac0( b );
5194     bExp = extractFloat128Exp( b );
5195     expDiff = aExp - bExp;
5196     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5197     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5198     if ( 0 < expDiff ) goto aExpBigger;
5199     if ( expDiff < 0 ) goto bExpBigger;
5200     if ( aExp == 0x7FFF ) {
5201         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5202             return propagateFloat128NaN( a, b STATUS_VAR );
5203         }
5204         float_raise( float_flag_invalid STATUS_VAR);
5205         z.low = float128_default_nan_low;
5206         z.high = float128_default_nan_high;
5207         return z;
5208     }
5209     if ( aExp == 0 ) {
5210         aExp = 1;
5211         bExp = 1;
5212     }
5213     if ( bSig0 < aSig0 ) goto aBigger;
5214     if ( aSig0 < bSig0 ) goto bBigger;
5215     if ( bSig1 < aSig1 ) goto aBigger;
5216     if ( aSig1 < bSig1 ) goto bBigger;
5217     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5218  bExpBigger:
5219     if ( bExp == 0x7FFF ) {
5220         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5221         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5222     }
5223     if ( aExp == 0 ) {
5224         ++expDiff;
5225     }
5226     else {
5227         aSig0 |= LIT64( 0x4000000000000000 );
5228     }
5229     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5230     bSig0 |= LIT64( 0x4000000000000000 );
5231  bBigger:
5232     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5233     zExp = bExp;
5234     zSign ^= 1;
5235     goto normalizeRoundAndPack;
5236  aExpBigger:
5237     if ( aExp == 0x7FFF ) {
5238         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5239         return a;
5240     }
5241     if ( bExp == 0 ) {
5242         --expDiff;
5243     }
5244     else {
5245         bSig0 |= LIT64( 0x4000000000000000 );
5246     }
5247     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5248     aSig0 |= LIT64( 0x4000000000000000 );
5249  aBigger:
5250     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5251     zExp = aExp;
5252  normalizeRoundAndPack:
5253     --zExp;
5254     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5255
5256 }
5257
5258 /*----------------------------------------------------------------------------
5259 | Returns the result of adding the quadruple-precision floating-point values
5260 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5261 | for Binary Floating-Point Arithmetic.
5262 *----------------------------------------------------------------------------*/
5263
5264 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5265 {
5266     flag aSign, bSign;
5267
5268     aSign = extractFloat128Sign( a );
5269     bSign = extractFloat128Sign( b );
5270     if ( aSign == bSign ) {
5271         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5272     }
5273     else {
5274         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5275     }
5276
5277 }
5278
5279 /*----------------------------------------------------------------------------
5280 | Returns the result of subtracting the quadruple-precision floating-point
5281 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5282 | Standard for Binary Floating-Point Arithmetic.
5283 *----------------------------------------------------------------------------*/
5284
5285 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5286 {
5287     flag aSign, bSign;
5288
5289     aSign = extractFloat128Sign( a );
5290     bSign = extractFloat128Sign( b );
5291     if ( aSign == bSign ) {
5292         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5293     }
5294     else {
5295         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5296     }
5297
5298 }
5299
5300 /*----------------------------------------------------------------------------
5301 | Returns the result of multiplying the quadruple-precision floating-point
5302 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5303 | Standard for Binary Floating-Point Arithmetic.
5304 *----------------------------------------------------------------------------*/
5305
5306 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5307 {
5308     flag aSign, bSign, zSign;
5309     int32 aExp, bExp, zExp;
5310     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5311     float128 z;
5312
5313     aSig1 = extractFloat128Frac1( a );
5314     aSig0 = extractFloat128Frac0( a );
5315     aExp = extractFloat128Exp( a );
5316     aSign = extractFloat128Sign( a );
5317     bSig1 = extractFloat128Frac1( b );
5318     bSig0 = extractFloat128Frac0( b );
5319     bExp = extractFloat128Exp( b );
5320     bSign = extractFloat128Sign( b );
5321     zSign = aSign ^ bSign;
5322     if ( aExp == 0x7FFF ) {
5323         if (    ( aSig0 | aSig1 )
5324              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5325             return propagateFloat128NaN( a, b STATUS_VAR );
5326         }
5327         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5328         return packFloat128( zSign, 0x7FFF, 0, 0 );
5329     }
5330     if ( bExp == 0x7FFF ) {
5331         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5332         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5333  invalid:
5334             float_raise( float_flag_invalid STATUS_VAR);
5335             z.low = float128_default_nan_low;
5336             z.high = float128_default_nan_high;
5337             return z;
5338         }
5339         return packFloat128( zSign, 0x7FFF, 0, 0 );
5340     }
5341     if ( aExp == 0 ) {
5342         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5343         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5344     }
5345     if ( bExp == 0 ) {
5346         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5347         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5348     }
5349     zExp = aExp + bExp - 0x4000;
5350     aSig0 |= LIT64( 0x0001000000000000 );
5351     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5352     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5353     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5354     zSig2 |= ( zSig3 != 0 );
5355     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5356         shift128ExtraRightJamming(
5357             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5358         ++zExp;
5359     }
5360     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5361
5362 }
5363
5364 /*----------------------------------------------------------------------------
5365 | Returns the result of dividing the quadruple-precision floating-point value
5366 | `a' by the corresponding value `b'.  The operation is performed according to
5367 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5368 *----------------------------------------------------------------------------*/
5369
5370 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5371 {
5372     flag aSign, bSign, zSign;
5373     int32 aExp, bExp, zExp;
5374     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5375     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5376     float128 z;
5377
5378     aSig1 = extractFloat128Frac1( a );
5379     aSig0 = extractFloat128Frac0( a );
5380     aExp = extractFloat128Exp( a );
5381     aSign = extractFloat128Sign( a );
5382     bSig1 = extractFloat128Frac1( b );
5383     bSig0 = extractFloat128Frac0( b );
5384     bExp = extractFloat128Exp( b );
5385     bSign = extractFloat128Sign( b );
5386     zSign = aSign ^ bSign;
5387     if ( aExp == 0x7FFF ) {
5388         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5389         if ( bExp == 0x7FFF ) {
5390             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5391             goto invalid;
5392         }
5393         return packFloat128( zSign, 0x7FFF, 0, 0 );
5394     }
5395     if ( bExp == 0x7FFF ) {
5396         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5397         return packFloat128( zSign, 0, 0, 0 );
5398     }
5399     if ( bExp == 0 ) {
5400         if ( ( bSig0 | bSig1 ) == 0 ) {
5401             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5402  invalid:
5403                 float_raise( float_flag_invalid STATUS_VAR);
5404                 z.low = float128_default_nan_low;
5405                 z.high = float128_default_nan_high;
5406                 return z;
5407             }
5408             float_raise( float_flag_divbyzero STATUS_VAR);
5409             return packFloat128( zSign, 0x7FFF, 0, 0 );
5410         }
5411         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5412     }
5413     if ( aExp == 0 ) {
5414         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5415         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5416     }
5417     zExp = aExp - bExp + 0x3FFD;
5418     shortShift128Left(
5419         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5420     shortShift128Left(
5421         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5422     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5423         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5424         ++zExp;
5425     }
5426     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5427     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5428     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5429     while ( (int64_t) rem0 < 0 ) {
5430         --zSig0;
5431         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5432     }
5433     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5434     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5435         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5436         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5437         while ( (int64_t) rem1 < 0 ) {
5438             --zSig1;
5439             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5440         }
5441         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5442     }
5443     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5444     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5445
5446 }
5447
5448 /*----------------------------------------------------------------------------
5449 | Returns the remainder of the quadruple-precision floating-point value `a'
5450 | with respect to the corresponding value `b'.  The operation is performed
5451 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5452 *----------------------------------------------------------------------------*/
5453
5454 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5455 {
5456     flag aSign, zSign;
5457     int32 aExp, bExp, expDiff;
5458     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5459     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5460     int64_t sigMean0;
5461     float128 z;
5462
5463     aSig1 = extractFloat128Frac1( a );
5464     aSig0 = extractFloat128Frac0( a );
5465     aExp = extractFloat128Exp( a );
5466     aSign = extractFloat128Sign( a );
5467     bSig1 = extractFloat128Frac1( b );
5468     bSig0 = extractFloat128Frac0( b );
5469     bExp = extractFloat128Exp( b );
5470     if ( aExp == 0x7FFF ) {
5471         if (    ( aSig0 | aSig1 )
5472              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5473             return propagateFloat128NaN( a, b STATUS_VAR );
5474         }
5475         goto invalid;
5476     }
5477     if ( bExp == 0x7FFF ) {
5478         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5479         return a;
5480     }
5481     if ( bExp == 0 ) {
5482         if ( ( bSig0 | bSig1 ) == 0 ) {
5483  invalid:
5484             float_raise( float_flag_invalid STATUS_VAR);
5485             z.low = float128_default_nan_low;
5486             z.high = float128_default_nan_high;
5487             return z;
5488         }
5489         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5490     }
5491     if ( aExp == 0 ) {
5492         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5493         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5494     }
5495     expDiff = aExp - bExp;
5496     if ( expDiff < -1 ) return a;
5497     shortShift128Left(
5498         aSig0 | LIT64( 0x0001000000000000 ),
5499         aSig1,
5500         15 - ( expDiff < 0 ),
5501         &aSig0,
5502         &aSig1
5503     );
5504     shortShift128Left(
5505         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5506     q = le128( bSig0, bSig1, aSig0, aSig1 );
5507     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5508     expDiff -= 64;
5509     while ( 0 < expDiff ) {
5510         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5511         q = ( 4 < q ) ? q - 4 : 0;
5512         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5513         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5514         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5515         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5516         expDiff -= 61;
5517     }
5518     if ( -64 < expDiff ) {
5519         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5520         q = ( 4 < q ) ? q - 4 : 0;
5521         q >>= - expDiff;
5522         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5523         expDiff += 52;
5524         if ( expDiff < 0 ) {
5525             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5526         }
5527         else {
5528             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5529         }
5530         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5531         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5532     }
5533     else {
5534         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5535         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5536     }
5537     do {
5538         alternateASig0 = aSig0;
5539         alternateASig1 = aSig1;
5540         ++q;
5541         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5542     } while ( 0 <= (int64_t) aSig0 );
5543     add128(
5544         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5545     if (    ( sigMean0 < 0 )
5546          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5547         aSig0 = alternateASig0;
5548         aSig1 = alternateASig1;
5549     }
5550     zSign = ( (int64_t) aSig0 < 0 );
5551     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5552     return
5553         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5554
5555 }
5556
5557 /*----------------------------------------------------------------------------
5558 | Returns the square root of the quadruple-precision floating-point value `a'.
5559 | The operation is performed according to the IEC/IEEE Standard for Binary
5560 | Floating-Point Arithmetic.
5561 *----------------------------------------------------------------------------*/
5562
5563 float128 float128_sqrt( float128 a STATUS_PARAM )
5564 {
5565     flag aSign;
5566     int32 aExp, zExp;
5567     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5568     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5569     float128 z;
5570
5571     aSig1 = extractFloat128Frac1( a );
5572     aSig0 = extractFloat128Frac0( a );
5573     aExp = extractFloat128Exp( a );
5574     aSign = extractFloat128Sign( a );
5575     if ( aExp == 0x7FFF ) {
5576         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5577         if ( ! aSign ) return a;
5578         goto invalid;
5579     }
5580     if ( aSign ) {
5581         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5582  invalid:
5583         float_raise( float_flag_invalid STATUS_VAR);
5584         z.low = float128_default_nan_low;
5585         z.high = float128_default_nan_high;
5586         return z;
5587     }
5588     if ( aExp == 0 ) {
5589         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5590         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5591     }
5592     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5593     aSig0 |= LIT64( 0x0001000000000000 );
5594     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5595     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5596     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5597     doubleZSig0 = zSig0<<1;
5598     mul64To128( zSig0, zSig0, &term0, &term1 );
5599     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5600     while ( (int64_t) rem0 < 0 ) {
5601         --zSig0;
5602         doubleZSig0 -= 2;
5603         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5604     }
5605     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5606     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5607         if ( zSig1 == 0 ) zSig1 = 1;
5608         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5609         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5610         mul64To128( zSig1, zSig1, &term2, &term3 );
5611         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5612         while ( (int64_t) rem1 < 0 ) {
5613             --zSig1;
5614             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5615             term3 |= 1;
5616             term2 |= doubleZSig0;
5617             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5618         }
5619         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5620     }
5621     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5622     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5623
5624 }
5625
5626 /*----------------------------------------------------------------------------
5627 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5628 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5629 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5630 *----------------------------------------------------------------------------*/
5631
5632 int float128_eq( float128 a, float128 b STATUS_PARAM )
5633 {
5634
5635     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5636               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5637          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5638               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5639        ) {
5640         if (    float128_is_signaling_nan( a )
5641              || float128_is_signaling_nan( b ) ) {
5642             float_raise( float_flag_invalid STATUS_VAR);
5643         }
5644         return 0;
5645     }
5646     return
5647            ( a.low == b.low )
5648         && (    ( a.high == b.high )
5649              || (    ( a.low == 0 )
5650                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5651            );
5652
5653 }
5654
5655 /*----------------------------------------------------------------------------
5656 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5657 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5658 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5659 | Arithmetic.
5660 *----------------------------------------------------------------------------*/
5661
5662 int float128_le( float128 a, float128 b STATUS_PARAM )
5663 {
5664     flag aSign, bSign;
5665
5666     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5667               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5668          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5669               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5670        ) {
5671         float_raise( float_flag_invalid STATUS_VAR);
5672         return 0;
5673     }
5674     aSign = extractFloat128Sign( a );
5675     bSign = extractFloat128Sign( b );
5676     if ( aSign != bSign ) {
5677         return
5678                aSign
5679             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5680                  == 0 );
5681     }
5682     return
5683           aSign ? le128( b.high, b.low, a.high, a.low )
5684         : le128( a.high, a.low, b.high, b.low );
5685
5686 }
5687
5688 /*----------------------------------------------------------------------------
5689 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5690 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5691 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5692 *----------------------------------------------------------------------------*/
5693
5694 int float128_lt( float128 a, float128 b STATUS_PARAM )
5695 {
5696     flag aSign, bSign;
5697
5698     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5699               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5700          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5701               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5702        ) {
5703         float_raise( float_flag_invalid STATUS_VAR);
5704         return 0;
5705     }
5706     aSign = extractFloat128Sign( a );
5707     bSign = extractFloat128Sign( b );
5708     if ( aSign != bSign ) {
5709         return
5710                aSign
5711             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5712                  != 0 );
5713     }
5714     return
5715           aSign ? lt128( b.high, b.low, a.high, a.low )
5716         : lt128( a.high, a.low, b.high, b.low );
5717
5718 }
5719
5720 /*----------------------------------------------------------------------------
5721 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5722 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5723 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5724 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5725 *----------------------------------------------------------------------------*/
5726
5727 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5728 {
5729
5730     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5731               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5732          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5733               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5734        ) {
5735         float_raise( float_flag_invalid STATUS_VAR);
5736         return 0;
5737     }
5738     return
5739            ( a.low == b.low )
5740         && (    ( a.high == b.high )
5741              || (    ( a.low == 0 )
5742                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5743            );
5744
5745 }
5746
5747 /*----------------------------------------------------------------------------
5748 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5749 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5750 | cause an exception.  Otherwise, the comparison is performed according to the
5751 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5752 *----------------------------------------------------------------------------*/
5753
5754 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5755 {
5756     flag aSign, bSign;
5757
5758     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5759               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5760          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5761               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5762        ) {
5763         if (    float128_is_signaling_nan( a )
5764              || float128_is_signaling_nan( b ) ) {
5765             float_raise( float_flag_invalid STATUS_VAR);
5766         }
5767         return 0;
5768     }
5769     aSign = extractFloat128Sign( a );
5770     bSign = extractFloat128Sign( b );
5771     if ( aSign != bSign ) {
5772         return
5773                aSign
5774             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5775                  == 0 );
5776     }
5777     return
5778           aSign ? le128( b.high, b.low, a.high, a.low )
5779         : le128( a.high, a.low, b.high, b.low );
5780
5781 }
5782
5783 /*----------------------------------------------------------------------------
5784 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5785 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5786 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5787 | Standard for Binary Floating-Point Arithmetic.
5788 *----------------------------------------------------------------------------*/
5789
5790 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5791 {
5792     flag aSign, bSign;
5793
5794     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5795               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5796          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5797               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5798        ) {
5799         if (    float128_is_signaling_nan( a )
5800              || float128_is_signaling_nan( b ) ) {
5801             float_raise( float_flag_invalid STATUS_VAR);
5802         }
5803         return 0;
5804     }
5805     aSign = extractFloat128Sign( a );
5806     bSign = extractFloat128Sign( b );
5807     if ( aSign != bSign ) {
5808         return
5809                aSign
5810             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5811                  != 0 );
5812     }
5813     return
5814           aSign ? lt128( b.high, b.low, a.high, a.low )
5815         : lt128( a.high, a.low, b.high, b.low );
5816
5817 }
5818
5819 #endif
5820
5821 /* misc functions */
5822 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5823 {
5824     return int64_to_float32(a STATUS_VAR);
5825 }
5826
5827 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5828 {
5829     return int64_to_float64(a STATUS_VAR);
5830 }
5831
5832 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5833 {
5834     int64_t v;
5835     unsigned int res;
5836
5837     v = float32_to_int64(a STATUS_VAR);
5838     if (v < 0) {
5839         res = 0;
5840         float_raise( float_flag_invalid STATUS_VAR);
5841     } else if (v > 0xffffffff) {
5842         res = 0xffffffff;
5843         float_raise( float_flag_invalid STATUS_VAR);
5844     } else {
5845         res = v;
5846     }
5847     return res;
5848 }
5849
5850 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5851 {
5852     int64_t v;
5853     unsigned int res;
5854
5855     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5856     if (v < 0) {
5857         res = 0;
5858         float_raise( float_flag_invalid STATUS_VAR);
5859     } else if (v > 0xffffffff) {
5860         res = 0xffffffff;
5861         float_raise( float_flag_invalid STATUS_VAR);
5862     } else {
5863         res = v;
5864     }
5865     return res;
5866 }
5867
5868 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5869 {
5870     int64_t v;
5871     unsigned int res;
5872
5873     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5874     if (v < 0) {
5875         res = 0;
5876         float_raise( float_flag_invalid STATUS_VAR);
5877     } else if (v > 0xffff) {
5878         res = 0xffff;
5879         float_raise( float_flag_invalid STATUS_VAR);
5880     } else {
5881         res = v;
5882     }
5883     return res;
5884 }
5885
5886 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5887 {
5888     int64_t v;
5889     unsigned int res;
5890
5891     v = float64_to_int64(a STATUS_VAR);
5892     if (v < 0) {
5893         res = 0;
5894         float_raise( float_flag_invalid STATUS_VAR);
5895     } else if (v > 0xffffffff) {
5896         res = 0xffffffff;
5897         float_raise( float_flag_invalid STATUS_VAR);
5898     } else {
5899         res = v;
5900     }
5901     return res;
5902 }
5903
5904 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5905 {
5906     int64_t v;
5907     unsigned int res;
5908
5909     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5910     if (v < 0) {
5911         res = 0;
5912         float_raise( float_flag_invalid STATUS_VAR);
5913     } else if (v > 0xffffffff) {
5914         res = 0xffffffff;
5915         float_raise( float_flag_invalid STATUS_VAR);
5916     } else {
5917         res = v;
5918     }
5919     return res;
5920 }
5921
5922 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5923 {
5924     int64_t v;
5925     unsigned int res;
5926
5927     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5928     if (v < 0) {
5929         res = 0;
5930         float_raise( float_flag_invalid STATUS_VAR);
5931     } else if (v > 0xffff) {
5932         res = 0xffff;
5933         float_raise( float_flag_invalid STATUS_VAR);
5934     } else {
5935         res = v;
5936     }
5937     return res;
5938 }
5939
5940 /* FIXME: This looks broken.  */
5941 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5942 {
5943     int64_t v;
5944
5945     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5946     v += float64_val(a);
5947     v = float64_to_int64(make_float64(v) STATUS_VAR);
5948
5949     return v - INT64_MIN;
5950 }
5951
5952 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5953 {
5954     int64_t v;
5955
5956     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5957     v += float64_val(a);
5958     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5959
5960     return v - INT64_MIN;
5961 }
5962
5963 #define COMPARE(s, nan_exp)                                                  \
5964 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5965                                       int is_quiet STATUS_PARAM )            \
5966 {                                                                            \
5967     flag aSign, bSign;                                                       \
5968     uint ## s ## _t av, bv;                                                  \
5969     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
5970     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
5971                                                                              \
5972     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5973          extractFloat ## s ## Frac( a ) ) ||                                 \
5974         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5975           extractFloat ## s ## Frac( b ) )) {                                \
5976         if (!is_quiet ||                                                     \
5977             float ## s ## _is_signaling_nan( a ) ||                          \
5978             float ## s ## _is_signaling_nan( b ) ) {                         \
5979             float_raise( float_flag_invalid STATUS_VAR);                     \
5980         }                                                                    \
5981         return float_relation_unordered;                                     \
5982     }                                                                        \
5983     aSign = extractFloat ## s ## Sign( a );                                  \
5984     bSign = extractFloat ## s ## Sign( b );                                  \
5985     av = float ## s ## _val(a);                                              \
5986     bv = float ## s ## _val(b);                                              \
5987     if ( aSign != bSign ) {                                                  \
5988         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
5989             /* zero case */                                                  \
5990             return float_relation_equal;                                     \
5991         } else {                                                             \
5992             return 1 - (2 * aSign);                                          \
5993         }                                                                    \
5994     } else {                                                                 \
5995         if (av == bv) {                                                      \
5996             return float_relation_equal;                                     \
5997         } else {                                                             \
5998             return 1 - 2 * (aSign ^ ( av < bv ));                            \
5999         }                                                                    \
6000     }                                                                        \
6001 }                                                                            \
6002                                                                              \
6003 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6004 {                                                                            \
6005     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6006 }                                                                            \
6007                                                                              \
6008 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6009 {                                                                            \
6010     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6011 }
6012
6013 COMPARE(32, 0xff)
6014 COMPARE(64, 0x7ff)
6015
6016 INLINE int float128_compare_internal( float128 a, float128 b,
6017                                       int is_quiet STATUS_PARAM )
6018 {
6019     flag aSign, bSign;
6020
6021     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6022           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6023         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6024           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6025         if (!is_quiet ||
6026             float128_is_signaling_nan( a ) ||
6027             float128_is_signaling_nan( b ) ) {
6028             float_raise( float_flag_invalid STATUS_VAR);
6029         }
6030         return float_relation_unordered;
6031     }
6032     aSign = extractFloat128Sign( a );
6033     bSign = extractFloat128Sign( b );
6034     if ( aSign != bSign ) {
6035         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6036             /* zero case */
6037             return float_relation_equal;
6038         } else {
6039             return 1 - (2 * aSign);
6040         }
6041     } else {
6042         if (a.low == b.low && a.high == b.high) {
6043             return float_relation_equal;
6044         } else {
6045             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6046         }
6047     }
6048 }
6049
6050 int float128_compare( float128 a, float128 b STATUS_PARAM )
6051 {
6052     return float128_compare_internal(a, b, 0 STATUS_VAR);
6053 }
6054
6055 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6056 {
6057     return float128_compare_internal(a, b, 1 STATUS_VAR);
6058 }
6059
6060 /* Multiply A by 2 raised to the power N.  */
6061 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6062 {
6063     flag aSign;
6064     int16 aExp;
6065     uint32_t aSig;
6066
6067     a = float32_squash_input_denormal(a STATUS_VAR);
6068     aSig = extractFloat32Frac( a );
6069     aExp = extractFloat32Exp( a );
6070     aSign = extractFloat32Sign( a );
6071
6072     if ( aExp == 0xFF ) {
6073         return a;
6074     }
6075     if ( aExp != 0 )
6076         aSig |= 0x00800000;
6077     else if ( aSig == 0 )
6078         return a;
6079
6080     aExp += n - 1;
6081     aSig <<= 7;
6082     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6083 }
6084
6085 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6086 {
6087     flag aSign;
6088     int16 aExp;
6089     uint64_t aSig;
6090
6091     a = float64_squash_input_denormal(a STATUS_VAR);
6092     aSig = extractFloat64Frac( a );
6093     aExp = extractFloat64Exp( a );
6094     aSign = extractFloat64Sign( a );
6095
6096     if ( aExp == 0x7FF ) {
6097         return a;
6098     }
6099     if ( aExp != 0 )
6100         aSig |= LIT64( 0x0010000000000000 );
6101     else if ( aSig == 0 )
6102         return a;
6103
6104     aExp += n - 1;
6105     aSig <<= 10;
6106     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6107 }
6108
6109 #ifdef FLOATX80
6110 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6111 {
6112     flag aSign;
6113     int16 aExp;
6114     uint64_t aSig;
6115
6116     aSig = extractFloatx80Frac( a );
6117     aExp = extractFloatx80Exp( a );
6118     aSign = extractFloatx80Sign( a );
6119
6120     if ( aExp == 0x7FF ) {
6121         return a;
6122     }
6123     if (aExp == 0 && aSig == 0)
6124         return a;
6125
6126     aExp += n;
6127     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6128                                           aSign, aExp, aSig, 0 STATUS_VAR );
6129 }
6130 #endif
6131
6132 #ifdef FLOAT128
6133 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6134 {
6135     flag aSign;
6136     int32 aExp;
6137     uint64_t aSig0, aSig1;
6138
6139     aSig1 = extractFloat128Frac1( a );
6140     aSig0 = extractFloat128Frac0( a );
6141     aExp = extractFloat128Exp( a );
6142     aSign = extractFloat128Sign( a );
6143     if ( aExp == 0x7FFF ) {
6144         return a;
6145     }
6146     if ( aExp != 0 )
6147         aSig0 |= LIT64( 0x0001000000000000 );
6148     else if ( aSig0 == 0 && aSig1 == 0 )
6149         return a;
6150
6151     aExp += n - 1;
6152     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6153                                           STATUS_VAR );
6154
6155 }
6156 #endif