fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * Derived from SoftFloat.
   5  */
   6
   7 /*============================================================================
   8
   9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  10 Package, Release 2b.
  11
  12 Written by John R. Hauser.  This work was made possible in part by the
  13 International Computer Science Institute, located at Suite 600, 1947 Center
  14 Street, Berkeley, California 94704.  Funding was partially provided by the
  15 National Science Foundation under grant MIP-9311980.  The original version
  16 of this code was written as part of a project to build a fixed-point vector
  17 processor in collaboration with the University of California at Berkeley,
  18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  20 arithmetic/SoftFloat.html'.
  21
  22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  30
  31 Derivative works are acceptable, even for commercial purposes, so long as
  32 (1) the source code for the derivative work includes prominent notice that
  33 the work is derivative, and (2) the source code includes prominent notice with
  34 these four paragraphs for those parts of this code that are retained.
  35
  36 =============================================================================*/
  37
  38 #include "softfloat.h"
  39
  40 /*----------------------------------------------------------------------------
  41 | Primitive arithmetic functions, including multi-word arithmetic, and
  42 | division and square root approximations.  (Can be specialized to target if
  43 | desired.)
  44 *----------------------------------------------------------------------------*/
  45 #include "softfloat-macros.h"
  46
  47 /*----------------------------------------------------------------------------
  48 | Functions and definitions to determine:  (1) whether tininess for underflow
  49 | is detected before or after rounding by default, (2) what (if anything)
  50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  52 | are propagated from function inputs to output.  These details are target-
  53 | specific.
  54 *----------------------------------------------------------------------------*/
  55 #include "softfloat-specialize.h"
  56
  57 void set_float_rounding_mode(int val STATUS_PARAM)
  58 {
  59     STATUS(float_rounding_mode) = val;
  60 }
  61
  62 void set_float_exception_flags(int val STATUS_PARAM)
  63 {
  64     STATUS(float_exception_flags) = val;
  65 }
  66
  67 #ifdef FLOATX80
  68 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  69 {
  70     STATUS(floatx80_rounding_precision) = val;
  71 }
  72 #endif
  73
  74 /*----------------------------------------------------------------------------
  75 | Returns the fraction bits of the half-precision floating-point value `a'.
  76 *----------------------------------------------------------------------------*/
  77
  78 INLINE uint32_t extractFloat16Frac(float16 a)
  79 {
  80     return float16_val(a) & 0x3ff;
  81 }
  82
  83 /*----------------------------------------------------------------------------
  84 | Returns the exponent bits of the half-precision floating-point value `a'.
  85 *----------------------------------------------------------------------------*/
  86
  87 INLINE int16 extractFloat16Exp(float16 a)
  88 {
  89     return (float16_val(a) >> 10) & 0x1f;
  90 }
  91
  92 /*----------------------------------------------------------------------------
  93 | Returns the sign bit of the single-precision floating-point value `a'.
  94 *----------------------------------------------------------------------------*/
  95
  96 INLINE flag extractFloat16Sign(float16 a)
  97 {
  98     return float16_val(a)>>15;
  99 }
 100
 101 /*----------------------------------------------------------------------------
 102 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 103 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 104 | input.  If `zSign' is 1, the input is negated before being converted to an
 105 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 106 | is simply rounded to an integer, with the inexact exception raised if the
 107 | input cannot be represented exactly as an integer.  However, if the fixed-
 108 | point input is too large, the invalid exception is raised and the largest
 109 | positive or negative integer is returned.
 110 *----------------------------------------------------------------------------*/
 111
 112 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
 113 {
 114     int8 roundingMode;
 115     flag roundNearestEven;
 116     int8 roundIncrement, roundBits;
 117     int32 z;
 118
 119     roundingMode = STATUS(float_rounding_mode);
 120     roundNearestEven = ( roundingMode == float_round_nearest_even );
 121     roundIncrement = 0x40;
 122     if ( ! roundNearestEven ) {
 123         if ( roundingMode == float_round_to_zero ) {
 124             roundIncrement = 0;
 125         }
 126         else {
 127             roundIncrement = 0x7F;
 128             if ( zSign ) {
 129                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 130             }
 131             else {
 132                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 133             }
 134         }
 135     }
 136     roundBits = absZ & 0x7F;
 137     absZ = ( absZ + roundIncrement )>>7;
 138     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 139     z = absZ;
 140     if ( zSign ) z = - z;
 141     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 142         float_raise( float_flag_invalid STATUS_VAR);
 143         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 144     }
 145     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 146     return z;
 147
 148 }
 149
 150 /*----------------------------------------------------------------------------
 151 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 152 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 153 | and returns the properly rounded 64-bit integer corresponding to the input.
 154 | If `zSign' is 1, the input is negated before being converted to an integer.
 155 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 156 | the inexact exception raised if the input cannot be represented exactly as
 157 | an integer.  However, if the fixed-point input is too large, the invalid
 158 | exception is raised and the largest positive or negative integer is
 159 | returned.
 160 *----------------------------------------------------------------------------*/
 161
 162 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
 163 {
 164     int8 roundingMode;
 165     flag roundNearestEven, increment;
 166     int64 z;
 167
 168     roundingMode = STATUS(float_rounding_mode);
 169     roundNearestEven = ( roundingMode == float_round_nearest_even );
 170     increment = ( (int64_t) absZ1 < 0 );
 171     if ( ! roundNearestEven ) {
 172         if ( roundingMode == float_round_to_zero ) {
 173             increment = 0;
 174         }
 175         else {
 176             if ( zSign ) {
 177                 increment = ( roundingMode == float_round_down ) && absZ1;
 178             }
 179             else {
 180                 increment = ( roundingMode == float_round_up ) && absZ1;
 181             }
 182         }
 183     }
 184     if ( increment ) {
 185         ++absZ0;
 186         if ( absZ0 == 0 ) goto overflow;
 187         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 188     }
 189     z = absZ0;
 190     if ( zSign ) z = - z;
 191     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 192  overflow:
 193         float_raise( float_flag_invalid STATUS_VAR);
 194         return
 195               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 196             : LIT64( 0x7FFFFFFFFFFFFFFF );
 197     }
 198     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 199     return z;
 200
 201 }
 202
 203 /*----------------------------------------------------------------------------
 204 | Returns the fraction bits of the single-precision floating-point value `a'.
 205 *----------------------------------------------------------------------------*/
 206
 207 INLINE uint32_t extractFloat32Frac( float32 a )
 208 {
 209
 210     return float32_val(a) & 0x007FFFFF;
 211
 212 }
 213
 214 /*----------------------------------------------------------------------------
 215 | Returns the exponent bits of the single-precision floating-point value `a'.
 216 *----------------------------------------------------------------------------*/
 217
 218 INLINE int16 extractFloat32Exp( float32 a )
 219 {
 220
 221     return ( float32_val(a)>>23 ) & 0xFF;
 222
 223 }
 224
 225 /*----------------------------------------------------------------------------
 226 | Returns the sign bit of the single-precision floating-point value `a'.
 227 *----------------------------------------------------------------------------*/
 228
 229 INLINE flag extractFloat32Sign( float32 a )
 230 {
 231
 232     return float32_val(a)>>31;
 233
 234 }
 235
 236 /*----------------------------------------------------------------------------
 237 | If `a' is denormal and we are in flush-to-zero mode then set the
 238 | input-denormal exception and return zero. Otherwise just return the value.
 239 *----------------------------------------------------------------------------*/
 240 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 241 {
 242     if (STATUS(flush_inputs_to_zero)) {
 243         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 244             float_raise(float_flag_input_denormal STATUS_VAR);
 245             return make_float32(float32_val(a) & 0x80000000);
 246         }
 247     }
 248     return a;
 249 }
 250
 251 /*----------------------------------------------------------------------------
 252 | Normalizes the subnormal single-precision floating-point value represented
 253 | by the denormalized significand `aSig'.  The normalized exponent and
 254 | significand are stored at the locations pointed to by `zExpPtr' and
 255 | `zSigPtr', respectively.
 256 *----------------------------------------------------------------------------*/
 257
 258 static void
 259  normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
 260 {
 261     int8 shiftCount;
 262
 263     shiftCount = countLeadingZeros32( aSig ) - 8;
 264     *zSigPtr = aSig<<shiftCount;
 265     *zExpPtr = 1 - shiftCount;
 266
 267 }
 268
 269 /*----------------------------------------------------------------------------
 270 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 271 | single-precision floating-point value, returning the result.  After being
 272 | shifted into the proper positions, the three fields are simply added
 273 | together to form the result.  This means that any integer portion of `zSig'
 274 | will be added into the exponent.  Since a properly normalized significand
 275 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 276 | than the desired result exponent whenever `zSig' is a complete, normalized
 277 | significand.
 278 *----------------------------------------------------------------------------*/
 279
 280 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
 281 {
 282
 283     return make_float32(
 284           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 285
 286 }
 287
 288 /*----------------------------------------------------------------------------
 289 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 290 | and significand `zSig', and returns the proper single-precision floating-
 291 | point value corresponding to the abstract input.  Ordinarily, the abstract
 292 | value is simply rounded and packed into the single-precision format, with
 293 | the inexact exception raised if the abstract input cannot be represented
 294 | exactly.  However, if the abstract value is too large, the overflow and
 295 | inexact exceptions are raised and an infinity or maximal finite value is
 296 | returned.  If the abstract value is too small, the input value is rounded to
 297 | a subnormal number, and the underflow and inexact exceptions are raised if
 298 | the abstract input cannot be represented exactly as a subnormal single-
 299 | precision floating-point number.
 300 |     The input significand `zSig' has its binary point between bits 30
 301 | and 29, which is 7 bits to the left of the usual location.  This shifted
 302 | significand must be normalized or smaller.  If `zSig' is not normalized,
 303 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 304 | and it must not require rounding.  In the usual case that `zSig' is
 305 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 306 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 307 | Binary Floating-Point Arithmetic.
 308 *----------------------------------------------------------------------------*/
 309
 310 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
 311 {
 312     int8 roundingMode;
 313     flag roundNearestEven;
 314     int8 roundIncrement, roundBits;
 315     flag isTiny;
 316
 317     roundingMode = STATUS(float_rounding_mode);
 318     roundNearestEven = ( roundingMode == float_round_nearest_even );
 319     roundIncrement = 0x40;
 320     if ( ! roundNearestEven ) {
 321         if ( roundingMode == float_round_to_zero ) {
 322             roundIncrement = 0;
 323         }
 324         else {
 325             roundIncrement = 0x7F;
 326             if ( zSign ) {
 327                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 328             }
 329             else {
 330                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 331             }
 332         }
 333     }
 334     roundBits = zSig & 0x7F;
 335     if ( 0xFD <= (uint16_t) zExp ) {
 336         if (    ( 0xFD < zExp )
 337              || (    ( zExp == 0xFD )
 338                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 339            ) {
 340             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 341             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 342         }
 343         if ( zExp < 0 ) {
 344             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 345             isTiny =
 346                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 347                 || ( zExp < -1 )
 348                 || ( zSig + roundIncrement < 0x80000000 );
 349             shift32RightJamming( zSig, - zExp, &zSig );
 350             zExp = 0;
 351             roundBits = zSig & 0x7F;
 352             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 353         }
 354     }
 355     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 356     zSig = ( zSig + roundIncrement )>>7;
 357     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 358     if ( zSig == 0 ) zExp = 0;
 359     return packFloat32( zSign, zExp, zSig );
 360
 361 }
 362
 363 /*----------------------------------------------------------------------------
 364 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 365 | and significand `zSig', and returns the proper single-precision floating-
 366 | point value corresponding to the abstract input.  This routine is just like
 367 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 368 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 369 | floating-point exponent.
 370 *----------------------------------------------------------------------------*/
 371
 372 static float32
 373  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
 374 {
 375     int8 shiftCount;
 376
 377     shiftCount = countLeadingZeros32( zSig ) - 1;
 378     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 379
 380 }
 381
 382 /*----------------------------------------------------------------------------
 383 | Returns the fraction bits of the double-precision floating-point value `a'.
 384 *----------------------------------------------------------------------------*/
 385
 386 INLINE uint64_t extractFloat64Frac( float64 a )
 387 {
 388
 389     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 390
 391 }
 392
 393 /*----------------------------------------------------------------------------
 394 | Returns the exponent bits of the double-precision floating-point value `a'.
 395 *----------------------------------------------------------------------------*/
 396
 397 INLINE int16 extractFloat64Exp( float64 a )
 398 {
 399
 400     return ( float64_val(a)>>52 ) & 0x7FF;
 401
 402 }
 403
 404 /*----------------------------------------------------------------------------
 405 | Returns the sign bit of the double-precision floating-point value `a'.
 406 *----------------------------------------------------------------------------*/
 407
 408 INLINE flag extractFloat64Sign( float64 a )
 409 {
 410
 411     return float64_val(a)>>63;
 412
 413 }
 414
 415 /*----------------------------------------------------------------------------
 416 | If `a' is denormal and we are in flush-to-zero mode then set the
 417 | input-denormal exception and return zero. Otherwise just return the value.
 418 *----------------------------------------------------------------------------*/
 419 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 420 {
 421     if (STATUS(flush_inputs_to_zero)) {
 422         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 423             float_raise(float_flag_input_denormal STATUS_VAR);
 424             return make_float64(float64_val(a) & (1ULL << 63));
 425         }
 426     }
 427     return a;
 428 }
 429
 430 /*----------------------------------------------------------------------------
 431 | Normalizes the subnormal double-precision floating-point value represented
 432 | by the denormalized significand `aSig'.  The normalized exponent and
 433 | significand are stored at the locations pointed to by `zExpPtr' and
 434 | `zSigPtr', respectively.
 435 *----------------------------------------------------------------------------*/
 436
 437 static void
 438  normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
 439 {
 440     int8 shiftCount;
 441
 442     shiftCount = countLeadingZeros64( aSig ) - 11;
 443     *zSigPtr = aSig<<shiftCount;
 444     *zExpPtr = 1 - shiftCount;
 445
 446 }
 447
 448 /*----------------------------------------------------------------------------
 449 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 450 | double-precision floating-point value, returning the result.  After being
 451 | shifted into the proper positions, the three fields are simply added
 452 | together to form the result.  This means that any integer portion of `zSig'
 453 | will be added into the exponent.  Since a properly normalized significand
 454 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 455 | than the desired result exponent whenever `zSig' is a complete, normalized
 456 | significand.
 457 *----------------------------------------------------------------------------*/
 458
 459 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
 460 {
 461
 462     return make_float64(
 463         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 464
 465 }
 466
 467 /*----------------------------------------------------------------------------
 468 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 469 | and significand `zSig', and returns the proper double-precision floating-
 470 | point value corresponding to the abstract input.  Ordinarily, the abstract
 471 | value is simply rounded and packed into the double-precision format, with
 472 | the inexact exception raised if the abstract input cannot be represented
 473 | exactly.  However, if the abstract value is too large, the overflow and
 474 | inexact exceptions are raised and an infinity or maximal finite value is
 475 | returned.  If the abstract value is too small, the input value is rounded
 476 | to a subnormal number, and the underflow and inexact exceptions are raised
 477 | if the abstract input cannot be represented exactly as a subnormal double-
 478 | precision floating-point number.
 479 |     The input significand `zSig' has its binary point between bits 62
 480 | and 61, which is 10 bits to the left of the usual location.  This shifted
 481 | significand must be normalized or smaller.  If `zSig' is not normalized,
 482 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 483 | and it must not require rounding.  In the usual case that `zSig' is
 484 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 485 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 486 | Binary Floating-Point Arithmetic.
 487 *----------------------------------------------------------------------------*/
 488
 489 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
 490 {
 491     int8 roundingMode;
 492     flag roundNearestEven;
 493     int16 roundIncrement, roundBits;
 494     flag isTiny;
 495
 496     roundingMode = STATUS(float_rounding_mode);
 497     roundNearestEven = ( roundingMode == float_round_nearest_even );
 498     roundIncrement = 0x200;
 499     if ( ! roundNearestEven ) {
 500         if ( roundingMode == float_round_to_zero ) {
 501             roundIncrement = 0;
 502         }
 503         else {
 504             roundIncrement = 0x3FF;
 505             if ( zSign ) {
 506                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 507             }
 508             else {
 509                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 510             }
 511         }
 512     }
 513     roundBits = zSig & 0x3FF;
 514     if ( 0x7FD <= (uint16_t) zExp ) {
 515         if (    ( 0x7FD < zExp )
 516              || (    ( zExp == 0x7FD )
 517                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 518            ) {
 519             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 520             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 521         }
 522         if ( zExp < 0 ) {
 523             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 524             isTiny =
 525                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 526                 || ( zExp < -1 )
 527                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 528             shift64RightJamming( zSig, - zExp, &zSig );
 529             zExp = 0;
 530             roundBits = zSig & 0x3FF;
 531             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 532         }
 533     }
 534     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 535     zSig = ( zSig + roundIncrement )>>10;
 536     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 537     if ( zSig == 0 ) zExp = 0;
 538     return packFloat64( zSign, zExp, zSig );
 539
 540 }
 541
 542 /*----------------------------------------------------------------------------
 543 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 544 | and significand `zSig', and returns the proper double-precision floating-
 545 | point value corresponding to the abstract input.  This routine is just like
 546 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 547 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 548 | floating-point exponent.
 549 *----------------------------------------------------------------------------*/
 550
 551 static float64
 552  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
 553 {
 554     int8 shiftCount;
 555
 556     shiftCount = countLeadingZeros64( zSig ) - 1;
 557     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 558
 559 }
 560
 561 #ifdef FLOATX80
 562
 563 /*----------------------------------------------------------------------------
 564 | Returns the fraction bits of the extended double-precision floating-point
 565 | value `a'.
 566 *----------------------------------------------------------------------------*/
 567
 568 INLINE uint64_t extractFloatx80Frac( floatx80 a )
 569 {
 570
 571     return a.low;
 572
 573 }
 574
 575 /*----------------------------------------------------------------------------
 576 | Returns the exponent bits of the extended double-precision floating-point
 577 | value `a'.
 578 *----------------------------------------------------------------------------*/
 579
 580 INLINE int32 extractFloatx80Exp( floatx80 a )
 581 {
 582
 583     return a.high & 0x7FFF;
 584
 585 }
 586
 587 /*----------------------------------------------------------------------------
 588 | Returns the sign bit of the extended double-precision floating-point value
 589 | `a'.
 590 *----------------------------------------------------------------------------*/
 591
 592 INLINE flag extractFloatx80Sign( floatx80 a )
 593 {
 594
 595     return a.high>>15;
 596
 597 }
 598
 599 /*----------------------------------------------------------------------------
 600 | Normalizes the subnormal extended double-precision floating-point value
 601 | represented by the denormalized significand `aSig'.  The normalized exponent
 602 | and significand are stored at the locations pointed to by `zExpPtr' and
 603 | `zSigPtr', respectively.
 604 *----------------------------------------------------------------------------*/
 605
 606 static void
 607  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 608 {
 609     int8 shiftCount;
 610
 611     shiftCount = countLeadingZeros64( aSig );
 612     *zSigPtr = aSig<<shiftCount;
 613     *zExpPtr = 1 - shiftCount;
 614
 615 }
 616
 617 /*----------------------------------------------------------------------------
 618 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 619 | extended double-precision floating-point value, returning the result.
 620 *----------------------------------------------------------------------------*/
 621
 622 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 623 {
 624     floatx80 z;
 625
 626     z.low = zSig;
 627     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 628     return z;
 629
 630 }
 631
 632 /*----------------------------------------------------------------------------
 633 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 634 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 635 | and returns the proper extended double-precision floating-point value
 636 | corresponding to the abstract input.  Ordinarily, the abstract value is
 637 | rounded and packed into the extended double-precision format, with the
 638 | inexact exception raised if the abstract input cannot be represented
 639 | exactly.  However, if the abstract value is too large, the overflow and
 640 | inexact exceptions are raised and an infinity or maximal finite value is
 641 | returned.  If the abstract value is too small, the input value is rounded to
 642 | a subnormal number, and the underflow and inexact exceptions are raised if
 643 | the abstract input cannot be represented exactly as a subnormal extended
 644 | double-precision floating-point number.
 645 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 646 | number of bits as single or double precision, respectively.  Otherwise, the
 647 | result is rounded to the full precision of the extended double-precision
 648 | format.
 649 |     The input significand must be normalized or smaller.  If the input
 650 | significand is not normalized, `zExp' must be 0; in that case, the result
 651 | returned is a subnormal number, and it must not require rounding.  The
 652 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 653 | Floating-Point Arithmetic.
 654 *----------------------------------------------------------------------------*/
 655
 656 static floatx80
 657  roundAndPackFloatx80(
 658      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 659  STATUS_PARAM)
 660 {
 661     int8 roundingMode;
 662     flag roundNearestEven, increment, isTiny;
 663     int64 roundIncrement, roundMask, roundBits;
 664
 665     roundingMode = STATUS(float_rounding_mode);
 666     roundNearestEven = ( roundingMode == float_round_nearest_even );
 667     if ( roundingPrecision == 80 ) goto precision80;
 668     if ( roundingPrecision == 64 ) {
 669         roundIncrement = LIT64( 0x0000000000000400 );
 670         roundMask = LIT64( 0x00000000000007FF );
 671     }
 672     else if ( roundingPrecision == 32 ) {
 673         roundIncrement = LIT64( 0x0000008000000000 );
 674         roundMask = LIT64( 0x000000FFFFFFFFFF );
 675     }
 676     else {
 677         goto precision80;
 678     }
 679     zSig0 |= ( zSig1 != 0 );
 680     if ( ! roundNearestEven ) {
 681         if ( roundingMode == float_round_to_zero ) {
 682             roundIncrement = 0;
 683         }
 684         else {
 685             roundIncrement = roundMask;
 686             if ( zSign ) {
 687                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 688             }
 689             else {
 690                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 691             }
 692         }
 693     }
 694     roundBits = zSig0 & roundMask;
 695     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 696         if (    ( 0x7FFE < zExp )
 697              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 698            ) {
 699             goto overflow;
 700         }
 701         if ( zExp <= 0 ) {
 702             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
 703             isTiny =
 704                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 705                 || ( zExp < 0 )
 706                 || ( zSig0 <= zSig0 + roundIncrement );
 707             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 708             zExp = 0;
 709             roundBits = zSig0 & roundMask;
 710             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 711             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 712             zSig0 += roundIncrement;
 713             if ( (int64_t) zSig0 < 0 ) zExp = 1;
 714             roundIncrement = roundMask + 1;
 715             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 716                 roundMask |= roundIncrement;
 717             }
 718             zSig0 &= ~ roundMask;
 719             return packFloatx80( zSign, zExp, zSig0 );
 720         }
 721     }
 722     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 723     zSig0 += roundIncrement;
 724     if ( zSig0 < roundIncrement ) {
 725         ++zExp;
 726         zSig0 = LIT64( 0x8000000000000000 );
 727     }
 728     roundIncrement = roundMask + 1;
 729     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 730         roundMask |= roundIncrement;
 731     }
 732     zSig0 &= ~ roundMask;
 733     if ( zSig0 == 0 ) zExp = 0;
 734     return packFloatx80( zSign, zExp, zSig0 );
 735  precision80:
 736     increment = ( (int64_t) zSig1 < 0 );
 737     if ( ! roundNearestEven ) {
 738         if ( roundingMode == float_round_to_zero ) {
 739             increment = 0;
 740         }
 741         else {
 742             if ( zSign ) {
 743                 increment = ( roundingMode == float_round_down ) && zSig1;
 744             }
 745             else {
 746                 increment = ( roundingMode == float_round_up ) && zSig1;
 747             }
 748         }
 749     }
 750     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 751         if (    ( 0x7FFE < zExp )
 752              || (    ( zExp == 0x7FFE )
 753                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 754                   && increment
 755                 )
 756            ) {
 757             roundMask = 0;
 758  overflow:
 759             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 760             if (    ( roundingMode == float_round_to_zero )
 761                  || ( zSign && ( roundingMode == float_round_up ) )
 762                  || ( ! zSign && ( roundingMode == float_round_down ) )
 763                ) {
 764                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 765             }
 766             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 767         }
 768         if ( zExp <= 0 ) {
 769             isTiny =
 770                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 771                 || ( zExp < 0 )
 772                 || ! increment
 773                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 774             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 775             zExp = 0;
 776             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 777             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 778             if ( roundNearestEven ) {
 779                 increment = ( (int64_t) zSig1 < 0 );
 780             }
 781             else {
 782                 if ( zSign ) {
 783                     increment = ( roundingMode == float_round_down ) && zSig1;
 784                 }
 785                 else {
 786                     increment = ( roundingMode == float_round_up ) && zSig1;
 787                 }
 788             }
 789             if ( increment ) {
 790                 ++zSig0;
 791                 zSig0 &=
 792                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 793                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
 794             }
 795             return packFloatx80( zSign, zExp, zSig0 );
 796         }
 797     }
 798     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 799     if ( increment ) {
 800         ++zSig0;
 801         if ( zSig0 == 0 ) {
 802             ++zExp;
 803             zSig0 = LIT64( 0x8000000000000000 );
 804         }
 805         else {
 806             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 807         }
 808     }
 809     else {
 810         if ( zSig0 == 0 ) zExp = 0;
 811     }
 812     return packFloatx80( zSign, zExp, zSig0 );
 813
 814 }
 815
 816 /*----------------------------------------------------------------------------
 817 | Takes an abstract floating-point value having sign `zSign', exponent
 818 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 819 | and returns the proper extended double-precision floating-point value
 820 | corresponding to the abstract input.  This routine is just like
 821 | `roundAndPackFloatx80' except that the input significand does not have to be
 822 | normalized.
 823 *----------------------------------------------------------------------------*/
 824
 825 static floatx80
 826  normalizeRoundAndPackFloatx80(
 827      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 828  STATUS_PARAM)
 829 {
 830     int8 shiftCount;
 831
 832     if ( zSig0 == 0 ) {
 833         zSig0 = zSig1;
 834         zSig1 = 0;
 835         zExp -= 64;
 836     }
 837     shiftCount = countLeadingZeros64( zSig0 );
 838     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 839     zExp -= shiftCount;
 840     return
 841         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 842
 843 }
 844
 845 #endif
 846
 847 #ifdef FLOAT128
 848
 849 /*----------------------------------------------------------------------------
 850 | Returns the least-significant 64 fraction bits of the quadruple-precision
 851 | floating-point value `a'.
 852 *----------------------------------------------------------------------------*/
 853
 854 INLINE uint64_t extractFloat128Frac1( float128 a )
 855 {
 856
 857     return a.low;
 858
 859 }
 860
 861 /*----------------------------------------------------------------------------
 862 | Returns the most-significant 48 fraction bits of the quadruple-precision
 863 | floating-point value `a'.
 864 *----------------------------------------------------------------------------*/
 865
 866 INLINE uint64_t extractFloat128Frac0( float128 a )
 867 {
 868
 869     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 870
 871 }
 872
 873 /*----------------------------------------------------------------------------
 874 | Returns the exponent bits of the quadruple-precision floating-point value
 875 | `a'.
 876 *----------------------------------------------------------------------------*/
 877
 878 INLINE int32 extractFloat128Exp( float128 a )
 879 {
 880
 881     return ( a.high>>48 ) & 0x7FFF;
 882
 883 }
 884
 885 /*----------------------------------------------------------------------------
 886 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 887 *----------------------------------------------------------------------------*/
 888
 889 INLINE flag extractFloat128Sign( float128 a )
 890 {
 891
 892     return a.high>>63;
 893
 894 }
 895
 896 /*----------------------------------------------------------------------------
 897 | Normalizes the subnormal quadruple-precision floating-point value
 898 | represented by the denormalized significand formed by the concatenation of
 899 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 900 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 901 | significand are stored at the location pointed to by `zSig0Ptr', and the
 902 | least significant 64 bits of the normalized significand are stored at the
 903 | location pointed to by `zSig1Ptr'.
 904 *----------------------------------------------------------------------------*/
 905
 906 static void
 907  normalizeFloat128Subnormal(
 908      uint64_t aSig0,
 909      uint64_t aSig1,
 910      int32 *zExpPtr,
 911      uint64_t *zSig0Ptr,
 912      uint64_t *zSig1Ptr
 913  )
 914 {
 915     int8 shiftCount;
 916
 917     if ( aSig0 == 0 ) {
 918         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 919         if ( shiftCount < 0 ) {
 920             *zSig0Ptr = aSig1>>( - shiftCount );
 921             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 922         }
 923         else {
 924             *zSig0Ptr = aSig1<<shiftCount;
 925             *zSig1Ptr = 0;
 926         }
 927         *zExpPtr = - shiftCount - 63;
 928     }
 929     else {
 930         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 931         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 932         *zExpPtr = 1 - shiftCount;
 933     }
 934
 935 }
 936
 937 /*----------------------------------------------------------------------------
 938 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 939 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 940 | floating-point value, returning the result.  After being shifted into the
 941 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 942 | added together to form the most significant 32 bits of the result.  This
 943 | means that any integer portion of `zSig0' will be added into the exponent.
 944 | Since a properly normalized significand will have an integer portion equal
 945 | to 1, the `zExp' input should be 1 less than the desired result exponent
 946 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 947 | significand.
 948 *----------------------------------------------------------------------------*/
 949
 950 INLINE float128
 951  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
 952 {
 953     float128 z;
 954
 955     z.low = zSig1;
 956     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
 957     return z;
 958
 959 }
 960
 961 /*----------------------------------------------------------------------------
 962 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 963 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 964 | and `zSig2', and returns the proper quadruple-precision floating-point value
 965 | corresponding to the abstract input.  Ordinarily, the abstract value is
 966 | simply rounded and packed into the quadruple-precision format, with the
 967 | inexact exception raised if the abstract input cannot be represented
 968 | exactly.  However, if the abstract value is too large, the overflow and
 969 | inexact exceptions are raised and an infinity or maximal finite value is
 970 | returned.  If the abstract value is too small, the input value is rounded to
 971 | a subnormal number, and the underflow and inexact exceptions are raised if
 972 | the abstract input cannot be represented exactly as a subnormal quadruple-
 973 | precision floating-point number.
 974 |     The input significand must be normalized or smaller.  If the input
 975 | significand is not normalized, `zExp' must be 0; in that case, the result
 976 | returned is a subnormal number, and it must not require rounding.  In the
 977 | usual case that the input significand is normalized, `zExp' must be 1 less
 978 | than the ``true'' floating-point exponent.  The handling of underflow and
 979 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 980 *----------------------------------------------------------------------------*/
 981
 982 static float128
 983  roundAndPackFloat128(
 984      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
 985 {
 986     int8 roundingMode;
 987     flag roundNearestEven, increment, isTiny;
 988
 989     roundingMode = STATUS(float_rounding_mode);
 990     roundNearestEven = ( roundingMode == float_round_nearest_even );
 991     increment = ( (int64_t) zSig2 < 0 );
 992     if ( ! roundNearestEven ) {
 993         if ( roundingMode == float_round_to_zero ) {
 994             increment = 0;
 995         }
 996         else {
 997             if ( zSign ) {
 998                 increment = ( roundingMode == float_round_down ) && zSig2;
 999             }
1000             else {
1001                 increment = ( roundingMode == float_round_up ) && zSig2;
1002             }
1003         }
1004     }
1005     if ( 0x7FFD <= (uint32_t) zExp ) {
1006         if (    ( 0x7FFD < zExp )
1007              || (    ( zExp == 0x7FFD )
1008                   && eq128(
1009                          LIT64( 0x0001FFFFFFFFFFFF ),
1010                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1011                          zSig0,
1012                          zSig1
1013                      )
1014                   && increment
1015                 )
1016            ) {
1017             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1018             if (    ( roundingMode == float_round_to_zero )
1019                  || ( zSign && ( roundingMode == float_round_up ) )
1020                  || ( ! zSign && ( roundingMode == float_round_down ) )
1021                ) {
1022                 return
1023                     packFloat128(
1024                         zSign,
1025                         0x7FFE,
1026                         LIT64( 0x0000FFFFFFFFFFFF ),
1027                         LIT64( 0xFFFFFFFFFFFFFFFF )
1028                     );
1029             }
1030             return packFloat128( zSign, 0x7FFF, 0, 0 );
1031         }
1032         if ( zExp < 0 ) {
1033             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1034             isTiny =
1035                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1036                 || ( zExp < -1 )
1037                 || ! increment
1038                 || lt128(
1039                        zSig0,
1040                        zSig1,
1041                        LIT64( 0x0001FFFFFFFFFFFF ),
1042                        LIT64( 0xFFFFFFFFFFFFFFFF )
1043                    );
1044             shift128ExtraRightJamming(
1045                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1046             zExp = 0;
1047             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1048             if ( roundNearestEven ) {
1049                 increment = ( (int64_t) zSig2 < 0 );
1050             }
1051             else {
1052                 if ( zSign ) {
1053                     increment = ( roundingMode == float_round_down ) && zSig2;
1054                 }
1055                 else {
1056                     increment = ( roundingMode == float_round_up ) && zSig2;
1057                 }
1058             }
1059         }
1060     }
1061     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1062     if ( increment ) {
1063         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1064         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1065     }
1066     else {
1067         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1068     }
1069     return packFloat128( zSign, zExp, zSig0, zSig1 );
1070
1071 }
1072
1073 /*----------------------------------------------------------------------------
1074 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1076 | returns the proper quadruple-precision floating-point value corresponding
1077 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1078 | except that the input significand has fewer bits and does not have to be
1079 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1080 | point exponent.
1081 *----------------------------------------------------------------------------*/
1082
1083 static float128
1084  normalizeRoundAndPackFloat128(
1085      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1086 {
1087     int8 shiftCount;
1088     uint64_t zSig2;
1089
1090     if ( zSig0 == 0 ) {
1091         zSig0 = zSig1;
1092         zSig1 = 0;
1093         zExp -= 64;
1094     }
1095     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096     if ( 0 <= shiftCount ) {
1097         zSig2 = 0;
1098         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1099     }
1100     else {
1101         shift128ExtraRightJamming(
1102             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1103     }
1104     zExp -= shiftCount;
1105     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1106
1107 }
1108
1109 #endif
1110
1111 /*----------------------------------------------------------------------------
1112 | Returns the result of converting the 32-bit two's complement integer `a'
1113 | to the single-precision floating-point format.  The conversion is performed
1114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 *----------------------------------------------------------------------------*/
1116
1117 float32 int32_to_float32( int32 a STATUS_PARAM )
1118 {
1119     flag zSign;
1120
1121     if ( a == 0 ) return float32_zero;
1122     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1123     zSign = ( a < 0 );
1124     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1125
1126 }
1127
1128 /*----------------------------------------------------------------------------
1129 | Returns the result of converting the 32-bit two's complement integer `a'
1130 | to the double-precision floating-point format.  The conversion is performed
1131 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1132 *----------------------------------------------------------------------------*/
1133
1134 float64 int32_to_float64( int32 a STATUS_PARAM )
1135 {
1136     flag zSign;
1137     uint32 absA;
1138     int8 shiftCount;
1139     uint64_t zSig;
1140
1141     if ( a == 0 ) return float64_zero;
1142     zSign = ( a < 0 );
1143     absA = zSign ? - a : a;
1144     shiftCount = countLeadingZeros32( absA ) + 21;
1145     zSig = absA;
1146     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1147
1148 }
1149
1150 #ifdef FLOATX80
1151
1152 /*----------------------------------------------------------------------------
1153 | Returns the result of converting the 32-bit two's complement integer `a'
1154 | to the extended double-precision floating-point format.  The conversion
1155 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156 | Arithmetic.
1157 *----------------------------------------------------------------------------*/
1158
1159 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1160 {
1161     flag zSign;
1162     uint32 absA;
1163     int8 shiftCount;
1164     uint64_t zSig;
1165
1166     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167     zSign = ( a < 0 );
1168     absA = zSign ? - a : a;
1169     shiftCount = countLeadingZeros32( absA ) + 32;
1170     zSig = absA;
1171     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1172
1173 }
1174
1175 #endif
1176
1177 #ifdef FLOAT128
1178
1179 /*----------------------------------------------------------------------------
1180 | Returns the result of converting the 32-bit two's complement integer `a' to
1181 | the quadruple-precision floating-point format.  The conversion is performed
1182 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1183 *----------------------------------------------------------------------------*/
1184
1185 float128 int32_to_float128( int32 a STATUS_PARAM )
1186 {
1187     flag zSign;
1188     uint32 absA;
1189     int8 shiftCount;
1190     uint64_t zSig0;
1191
1192     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1193     zSign = ( a < 0 );
1194     absA = zSign ? - a : a;
1195     shiftCount = countLeadingZeros32( absA ) + 17;
1196     zSig0 = absA;
1197     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1198
1199 }
1200
1201 #endif
1202
1203 /*----------------------------------------------------------------------------
1204 | Returns the result of converting the 64-bit two's complement integer `a'
1205 | to the single-precision floating-point format.  The conversion is performed
1206 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1207 *----------------------------------------------------------------------------*/
1208
1209 float32 int64_to_float32( int64 a STATUS_PARAM )
1210 {
1211     flag zSign;
1212     uint64 absA;
1213     int8 shiftCount;
1214
1215     if ( a == 0 ) return float32_zero;
1216     zSign = ( a < 0 );
1217     absA = zSign ? - a : a;
1218     shiftCount = countLeadingZeros64( absA ) - 40;
1219     if ( 0 <= shiftCount ) {
1220         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1221     }
1222     else {
1223         shiftCount += 7;
1224         if ( shiftCount < 0 ) {
1225             shift64RightJamming( absA, - shiftCount, &absA );
1226         }
1227         else {
1228             absA <<= shiftCount;
1229         }
1230         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1231     }
1232
1233 }
1234
1235 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1236 {
1237     int8 shiftCount;
1238
1239     if ( a == 0 ) return float32_zero;
1240     shiftCount = countLeadingZeros64( a ) - 40;
1241     if ( 0 <= shiftCount ) {
1242         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1243     }
1244     else {
1245         shiftCount += 7;
1246         if ( shiftCount < 0 ) {
1247             shift64RightJamming( a, - shiftCount, &a );
1248         }
1249         else {
1250             a <<= shiftCount;
1251         }
1252         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1253     }
1254 }
1255
1256 /*----------------------------------------------------------------------------
1257 | Returns the result of converting the 64-bit two's complement integer `a'
1258 | to the double-precision floating-point format.  The conversion is performed
1259 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1260 *----------------------------------------------------------------------------*/
1261
1262 float64 int64_to_float64( int64 a STATUS_PARAM )
1263 {
1264     flag zSign;
1265
1266     if ( a == 0 ) return float64_zero;
1267     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1268         return packFloat64( 1, 0x43E, 0 );
1269     }
1270     zSign = ( a < 0 );
1271     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1272
1273 }
1274
1275 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1276 {
1277     if ( a == 0 ) return float64_zero;
1278     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1279
1280 }
1281
1282 #ifdef FLOATX80
1283
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the 64-bit two's complement integer `a'
1286 | to the extended double-precision floating-point format.  The conversion
1287 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic.
1289 *----------------------------------------------------------------------------*/
1290
1291 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1292 {
1293     flag zSign;
1294     uint64 absA;
1295     int8 shiftCount;
1296
1297     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1298     zSign = ( a < 0 );
1299     absA = zSign ? - a : a;
1300     shiftCount = countLeadingZeros64( absA );
1301     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1302
1303 }
1304
1305 #endif
1306
1307 #ifdef FLOAT128
1308
1309 /*----------------------------------------------------------------------------
1310 | Returns the result of converting the 64-bit two's complement integer `a' to
1311 | the quadruple-precision floating-point format.  The conversion is performed
1312 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313 *----------------------------------------------------------------------------*/
1314
1315 float128 int64_to_float128( int64 a STATUS_PARAM )
1316 {
1317     flag zSign;
1318     uint64 absA;
1319     int8 shiftCount;
1320     int32 zExp;
1321     uint64_t zSig0, zSig1;
1322
1323     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324     zSign = ( a < 0 );
1325     absA = zSign ? - a : a;
1326     shiftCount = countLeadingZeros64( absA ) + 49;
1327     zExp = 0x406E - shiftCount;
1328     if ( 64 <= shiftCount ) {
1329         zSig1 = 0;
1330         zSig0 = absA;
1331         shiftCount -= 64;
1332     }
1333     else {
1334         zSig1 = absA;
1335         zSig0 = 0;
1336     }
1337     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338     return packFloat128( zSign, zExp, zSig0, zSig1 );
1339
1340 }
1341
1342 #endif
1343
1344 /*----------------------------------------------------------------------------
1345 | Returns the result of converting the single-precision floating-point value
1346 | `a' to the 32-bit two's complement integer format.  The conversion is
1347 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1348 | Arithmetic---which means in particular that the conversion is rounded
1349 | according to the current rounding mode.  If `a' is a NaN, the largest
1350 | positive integer is returned.  Otherwise, if the conversion overflows, the
1351 | largest integer with the same sign as `a' is returned.
1352 *----------------------------------------------------------------------------*/
1353
1354 int32 float32_to_int32( float32 a STATUS_PARAM )
1355 {
1356     flag aSign;
1357     int16 aExp, shiftCount;
1358     uint32_t aSig;
1359     uint64_t aSig64;
1360
1361     a = float32_squash_input_denormal(a STATUS_VAR);
1362     aSig = extractFloat32Frac( a );
1363     aExp = extractFloat32Exp( a );
1364     aSign = extractFloat32Sign( a );
1365     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1366     if ( aExp ) aSig |= 0x00800000;
1367     shiftCount = 0xAF - aExp;
1368     aSig64 = aSig;
1369     aSig64 <<= 32;
1370     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1371     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1372
1373 }
1374
1375 /*----------------------------------------------------------------------------
1376 | Returns the result of converting the single-precision floating-point value
1377 | `a' to the 32-bit two's complement integer format.  The conversion is
1378 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1379 | Arithmetic, except that the conversion is always rounded toward zero.
1380 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1381 | the conversion overflows, the largest integer with the same sign as `a' is
1382 | returned.
1383 *----------------------------------------------------------------------------*/
1384
1385 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1386 {
1387     flag aSign;
1388     int16 aExp, shiftCount;
1389     uint32_t aSig;
1390     int32 z;
1391     a = float32_squash_input_denormal(a STATUS_VAR);
1392
1393     aSig = extractFloat32Frac( a );
1394     aExp = extractFloat32Exp( a );
1395     aSign = extractFloat32Sign( a );
1396     shiftCount = aExp - 0x9E;
1397     if ( 0 <= shiftCount ) {
1398         if ( float32_val(a) != 0xCF000000 ) {
1399             float_raise( float_flag_invalid STATUS_VAR);
1400             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1401         }
1402         return (int32_t) 0x80000000;
1403     }
1404     else if ( aExp <= 0x7E ) {
1405         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1406         return 0;
1407     }
1408     aSig = ( aSig | 0x00800000 )<<8;
1409     z = aSig>>( - shiftCount );
1410     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1411         STATUS(float_exception_flags) |= float_flag_inexact;
1412     }
1413     if ( aSign ) z = - z;
1414     return z;
1415
1416 }
1417
1418 /*----------------------------------------------------------------------------
1419 | Returns the result of converting the single-precision floating-point value
1420 | `a' to the 16-bit two's complement integer format.  The conversion is
1421 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1422 | Arithmetic, except that the conversion is always rounded toward zero.
1423 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1424 | the conversion overflows, the largest integer with the same sign as `a' is
1425 | returned.
1426 *----------------------------------------------------------------------------*/
1427
1428 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1429 {
1430     flag aSign;
1431     int16 aExp, shiftCount;
1432     uint32_t aSig;
1433     int32 z;
1434
1435     aSig = extractFloat32Frac( a );
1436     aExp = extractFloat32Exp( a );
1437     aSign = extractFloat32Sign( a );
1438     shiftCount = aExp - 0x8E;
1439     if ( 0 <= shiftCount ) {
1440         if ( float32_val(a) != 0xC7000000 ) {
1441             float_raise( float_flag_invalid STATUS_VAR);
1442             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1443                 return 0x7FFF;
1444             }
1445         }
1446         return (int32_t) 0xffff8000;
1447     }
1448     else if ( aExp <= 0x7E ) {
1449         if ( aExp | aSig ) {
1450             STATUS(float_exception_flags) |= float_flag_inexact;
1451         }
1452         return 0;
1453     }
1454     shiftCount -= 0x10;
1455     aSig = ( aSig | 0x00800000 )<<8;
1456     z = aSig>>( - shiftCount );
1457     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1458         STATUS(float_exception_flags) |= float_flag_inexact;
1459     }
1460     if ( aSign ) {
1461         z = - z;
1462     }
1463     return z;
1464
1465 }
1466
1467 /*----------------------------------------------------------------------------
1468 | Returns the result of converting the single-precision floating-point value
1469 | `a' to the 64-bit two's complement integer format.  The conversion is
1470 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1471 | Arithmetic---which means in particular that the conversion is rounded
1472 | according to the current rounding mode.  If `a' is a NaN, the largest
1473 | positive integer is returned.  Otherwise, if the conversion overflows, the
1474 | largest integer with the same sign as `a' is returned.
1475 *----------------------------------------------------------------------------*/
1476
1477 int64 float32_to_int64( float32 a STATUS_PARAM )
1478 {
1479     flag aSign;
1480     int16 aExp, shiftCount;
1481     uint32_t aSig;
1482     uint64_t aSig64, aSigExtra;
1483     a = float32_squash_input_denormal(a STATUS_VAR);
1484
1485     aSig = extractFloat32Frac( a );
1486     aExp = extractFloat32Exp( a );
1487     aSign = extractFloat32Sign( a );
1488     shiftCount = 0xBE - aExp;
1489     if ( shiftCount < 0 ) {
1490         float_raise( float_flag_invalid STATUS_VAR);
1491         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1492             return LIT64( 0x7FFFFFFFFFFFFFFF );
1493         }
1494         return (int64_t) LIT64( 0x8000000000000000 );
1495     }
1496     if ( aExp ) aSig |= 0x00800000;
1497     aSig64 = aSig;
1498     aSig64 <<= 40;
1499     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1500     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1501
1502 }
1503
1504 /*----------------------------------------------------------------------------
1505 | Returns the result of converting the single-precision floating-point value
1506 | `a' to the 64-bit two's complement integer format.  The conversion is
1507 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1508 | Arithmetic, except that the conversion is always rounded toward zero.  If
1509 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1510 | conversion overflows, the largest integer with the same sign as `a' is
1511 | returned.
1512 *----------------------------------------------------------------------------*/
1513
1514 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1515 {
1516     flag aSign;
1517     int16 aExp, shiftCount;
1518     uint32_t aSig;
1519     uint64_t aSig64;
1520     int64 z;
1521     a = float32_squash_input_denormal(a STATUS_VAR);
1522
1523     aSig = extractFloat32Frac( a );
1524     aExp = extractFloat32Exp( a );
1525     aSign = extractFloat32Sign( a );
1526     shiftCount = aExp - 0xBE;
1527     if ( 0 <= shiftCount ) {
1528         if ( float32_val(a) != 0xDF000000 ) {
1529             float_raise( float_flag_invalid STATUS_VAR);
1530             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1531                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1532             }
1533         }
1534         return (int64_t) LIT64( 0x8000000000000000 );
1535     }
1536     else if ( aExp <= 0x7E ) {
1537         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1538         return 0;
1539     }
1540     aSig64 = aSig | 0x00800000;
1541     aSig64 <<= 40;
1542     z = aSig64>>( - shiftCount );
1543     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1544         STATUS(float_exception_flags) |= float_flag_inexact;
1545     }
1546     if ( aSign ) z = - z;
1547     return z;
1548
1549 }
1550
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of converting the single-precision floating-point value
1553 | `a' to the double-precision floating-point format.  The conversion is
1554 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1555 | Arithmetic.
1556 *----------------------------------------------------------------------------*/
1557
1558 float64 float32_to_float64( float32 a STATUS_PARAM )
1559 {
1560     flag aSign;
1561     int16 aExp;
1562     uint32_t aSig;
1563     a = float32_squash_input_denormal(a STATUS_VAR);
1564
1565     aSig = extractFloat32Frac( a );
1566     aExp = extractFloat32Exp( a );
1567     aSign = extractFloat32Sign( a );
1568     if ( aExp == 0xFF ) {
1569         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1570         return packFloat64( aSign, 0x7FF, 0 );
1571     }
1572     if ( aExp == 0 ) {
1573         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1574         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1575         --aExp;
1576     }
1577     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1578
1579 }
1580
1581 #ifdef FLOATX80
1582
1583 /*----------------------------------------------------------------------------
1584 | Returns the result of converting the single-precision floating-point value
1585 | `a' to the extended double-precision floating-point format.  The conversion
1586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1587 | Arithmetic.
1588 *----------------------------------------------------------------------------*/
1589
1590 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1591 {
1592     flag aSign;
1593     int16 aExp;
1594     uint32_t aSig;
1595
1596     a = float32_squash_input_denormal(a STATUS_VAR);
1597     aSig = extractFloat32Frac( a );
1598     aExp = extractFloat32Exp( a );
1599     aSign = extractFloat32Sign( a );
1600     if ( aExp == 0xFF ) {
1601         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1602         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1603     }
1604     if ( aExp == 0 ) {
1605         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1606         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1607     }
1608     aSig |= 0x00800000;
1609     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1610
1611 }
1612
1613 #endif
1614
1615 #ifdef FLOAT128
1616
1617 /*----------------------------------------------------------------------------
1618 | Returns the result of converting the single-precision floating-point value
1619 | `a' to the double-precision floating-point format.  The conversion is
1620 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1621 | Arithmetic.
1622 *----------------------------------------------------------------------------*/
1623
1624 float128 float32_to_float128( float32 a STATUS_PARAM )
1625 {
1626     flag aSign;
1627     int16 aExp;
1628     uint32_t aSig;
1629
1630     a = float32_squash_input_denormal(a STATUS_VAR);
1631     aSig = extractFloat32Frac( a );
1632     aExp = extractFloat32Exp( a );
1633     aSign = extractFloat32Sign( a );
1634     if ( aExp == 0xFF ) {
1635         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1636         return packFloat128( aSign, 0x7FFF, 0, 0 );
1637     }
1638     if ( aExp == 0 ) {
1639         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641         --aExp;
1642     }
1643     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1644
1645 }
1646
1647 #endif
1648
1649 /*----------------------------------------------------------------------------
1650 | Rounds the single-precision floating-point value `a' to an integer, and
1651 | returns the result as a single-precision floating-point value.  The
1652 | operation is performed according to the IEC/IEEE Standard for Binary
1653 | Floating-Point Arithmetic.
1654 *----------------------------------------------------------------------------*/
1655
1656 float32 float32_round_to_int( float32 a STATUS_PARAM)
1657 {
1658     flag aSign;
1659     int16 aExp;
1660     uint32_t lastBitMask, roundBitsMask;
1661     int8 roundingMode;
1662     uint32_t z;
1663     a = float32_squash_input_denormal(a STATUS_VAR);
1664
1665     aExp = extractFloat32Exp( a );
1666     if ( 0x96 <= aExp ) {
1667         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1668             return propagateFloat32NaN( a, a STATUS_VAR );
1669         }
1670         return a;
1671     }
1672     if ( aExp <= 0x7E ) {
1673         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1674         STATUS(float_exception_flags) |= float_flag_inexact;
1675         aSign = extractFloat32Sign( a );
1676         switch ( STATUS(float_rounding_mode) ) {
1677          case float_round_nearest_even:
1678             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1679                 return packFloat32( aSign, 0x7F, 0 );
1680             }
1681             break;
1682          case float_round_down:
1683             return make_float32(aSign ? 0xBF800000 : 0);
1684          case float_round_up:
1685             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1686         }
1687         return packFloat32( aSign, 0, 0 );
1688     }
1689     lastBitMask = 1;
1690     lastBitMask <<= 0x96 - aExp;
1691     roundBitsMask = lastBitMask - 1;
1692     z = float32_val(a);
1693     roundingMode = STATUS(float_rounding_mode);
1694     if ( roundingMode == float_round_nearest_even ) {
1695         z += lastBitMask>>1;
1696         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1697     }
1698     else if ( roundingMode != float_round_to_zero ) {
1699         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1700             z += roundBitsMask;
1701         }
1702     }
1703     z &= ~ roundBitsMask;
1704     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1705     return make_float32(z);
1706
1707 }
1708
1709 /*----------------------------------------------------------------------------
1710 | Returns the result of adding the absolute values of the single-precision
1711 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1712 | before being returned.  `zSign' is ignored if the result is a NaN.
1713 | The addition is performed according to the IEC/IEEE Standard for Binary
1714 | Floating-Point Arithmetic.
1715 *----------------------------------------------------------------------------*/
1716
1717 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1718 {
1719     int16 aExp, bExp, zExp;
1720     uint32_t aSig, bSig, zSig;
1721     int16 expDiff;
1722
1723     aSig = extractFloat32Frac( a );
1724     aExp = extractFloat32Exp( a );
1725     bSig = extractFloat32Frac( b );
1726     bExp = extractFloat32Exp( b );
1727     expDiff = aExp - bExp;
1728     aSig <<= 6;
1729     bSig <<= 6;
1730     if ( 0 < expDiff ) {
1731         if ( aExp == 0xFF ) {
1732             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1733             return a;
1734         }
1735         if ( bExp == 0 ) {
1736             --expDiff;
1737         }
1738         else {
1739             bSig |= 0x20000000;
1740         }
1741         shift32RightJamming( bSig, expDiff, &bSig );
1742         zExp = aExp;
1743     }
1744     else if ( expDiff < 0 ) {
1745         if ( bExp == 0xFF ) {
1746             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1747             return packFloat32( zSign, 0xFF, 0 );
1748         }
1749         if ( aExp == 0 ) {
1750             ++expDiff;
1751         }
1752         else {
1753             aSig |= 0x20000000;
1754         }
1755         shift32RightJamming( aSig, - expDiff, &aSig );
1756         zExp = bExp;
1757     }
1758     else {
1759         if ( aExp == 0xFF ) {
1760             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1761             return a;
1762         }
1763         if ( aExp == 0 ) {
1764             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1765             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1766         }
1767         zSig = 0x40000000 + aSig + bSig;
1768         zExp = aExp;
1769         goto roundAndPack;
1770     }
1771     aSig |= 0x20000000;
1772     zSig = ( aSig + bSig )<<1;
1773     --zExp;
1774     if ( (int32_t) zSig < 0 ) {
1775         zSig = aSig + bSig;
1776         ++zExp;
1777     }
1778  roundAndPack:
1779     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1780
1781 }
1782
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of subtracting the absolute values of the single-
1785 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1786 | difference is negated before being returned.  `zSign' is ignored if the
1787 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1788 | Standard for Binary Floating-Point Arithmetic.
1789 *----------------------------------------------------------------------------*/
1790
1791 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1792 {
1793     int16 aExp, bExp, zExp;
1794     uint32_t aSig, bSig, zSig;
1795     int16 expDiff;
1796
1797     aSig = extractFloat32Frac( a );
1798     aExp = extractFloat32Exp( a );
1799     bSig = extractFloat32Frac( b );
1800     bExp = extractFloat32Exp( b );
1801     expDiff = aExp - bExp;
1802     aSig <<= 7;
1803     bSig <<= 7;
1804     if ( 0 < expDiff ) goto aExpBigger;
1805     if ( expDiff < 0 ) goto bExpBigger;
1806     if ( aExp == 0xFF ) {
1807         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1808         float_raise( float_flag_invalid STATUS_VAR);
1809         return float32_default_nan;
1810     }
1811     if ( aExp == 0 ) {
1812         aExp = 1;
1813         bExp = 1;
1814     }
1815     if ( bSig < aSig ) goto aBigger;
1816     if ( aSig < bSig ) goto bBigger;
1817     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1818  bExpBigger:
1819     if ( bExp == 0xFF ) {
1820         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1821         return packFloat32( zSign ^ 1, 0xFF, 0 );
1822     }
1823     if ( aExp == 0 ) {
1824         ++expDiff;
1825     }
1826     else {
1827         aSig |= 0x40000000;
1828     }
1829     shift32RightJamming( aSig, - expDiff, &aSig );
1830     bSig |= 0x40000000;
1831  bBigger:
1832     zSig = bSig - aSig;
1833     zExp = bExp;
1834     zSign ^= 1;
1835     goto normalizeRoundAndPack;
1836  aExpBigger:
1837     if ( aExp == 0xFF ) {
1838         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1839         return a;
1840     }
1841     if ( bExp == 0 ) {
1842         --expDiff;
1843     }
1844     else {
1845         bSig |= 0x40000000;
1846     }
1847     shift32RightJamming( bSig, expDiff, &bSig );
1848     aSig |= 0x40000000;
1849  aBigger:
1850     zSig = aSig - bSig;
1851     zExp = aExp;
1852  normalizeRoundAndPack:
1853     --zExp;
1854     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1855
1856 }
1857
1858 /*----------------------------------------------------------------------------
1859 | Returns the result of adding the single-precision floating-point values `a'
1860 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1861 | Binary Floating-Point Arithmetic.
1862 *----------------------------------------------------------------------------*/
1863
1864 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1865 {
1866     flag aSign, bSign;
1867     a = float32_squash_input_denormal(a STATUS_VAR);
1868     b = float32_squash_input_denormal(b STATUS_VAR);
1869
1870     aSign = extractFloat32Sign( a );
1871     bSign = extractFloat32Sign( b );
1872     if ( aSign == bSign ) {
1873         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1874     }
1875     else {
1876         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1877     }
1878
1879 }
1880
1881 /*----------------------------------------------------------------------------
1882 | Returns the result of subtracting the single-precision floating-point values
1883 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1884 | for Binary Floating-Point Arithmetic.
1885 *----------------------------------------------------------------------------*/
1886
1887 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1888 {
1889     flag aSign, bSign;
1890     a = float32_squash_input_denormal(a STATUS_VAR);
1891     b = float32_squash_input_denormal(b STATUS_VAR);
1892
1893     aSign = extractFloat32Sign( a );
1894     bSign = extractFloat32Sign( b );
1895     if ( aSign == bSign ) {
1896         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1897     }
1898     else {
1899         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1900     }
1901
1902 }
1903
1904 /*----------------------------------------------------------------------------
1905 | Returns the result of multiplying the single-precision floating-point values
1906 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1907 | for Binary Floating-Point Arithmetic.
1908 *----------------------------------------------------------------------------*/
1909
1910 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1911 {
1912     flag aSign, bSign, zSign;
1913     int16 aExp, bExp, zExp;
1914     uint32_t aSig, bSig;
1915     uint64_t zSig64;
1916     uint32_t zSig;
1917
1918     a = float32_squash_input_denormal(a STATUS_VAR);
1919     b = float32_squash_input_denormal(b STATUS_VAR);
1920
1921     aSig = extractFloat32Frac( a );
1922     aExp = extractFloat32Exp( a );
1923     aSign = extractFloat32Sign( a );
1924     bSig = extractFloat32Frac( b );
1925     bExp = extractFloat32Exp( b );
1926     bSign = extractFloat32Sign( b );
1927     zSign = aSign ^ bSign;
1928     if ( aExp == 0xFF ) {
1929         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1930             return propagateFloat32NaN( a, b STATUS_VAR );
1931         }
1932         if ( ( bExp | bSig ) == 0 ) {
1933             float_raise( float_flag_invalid STATUS_VAR);
1934             return float32_default_nan;
1935         }
1936         return packFloat32( zSign, 0xFF, 0 );
1937     }
1938     if ( bExp == 0xFF ) {
1939         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1940         if ( ( aExp | aSig ) == 0 ) {
1941             float_raise( float_flag_invalid STATUS_VAR);
1942             return float32_default_nan;
1943         }
1944         return packFloat32( zSign, 0xFF, 0 );
1945     }
1946     if ( aExp == 0 ) {
1947         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1948         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1949     }
1950     if ( bExp == 0 ) {
1951         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1952         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1953     }
1954     zExp = aExp + bExp - 0x7F;
1955     aSig = ( aSig | 0x00800000 )<<7;
1956     bSig = ( bSig | 0x00800000 )<<8;
1957     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1958     zSig = zSig64;
1959     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1960         zSig <<= 1;
1961         --zExp;
1962     }
1963     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1964
1965 }
1966
1967 /*----------------------------------------------------------------------------
1968 | Returns the result of dividing the single-precision floating-point value `a'
1969 | by the corresponding value `b'.  The operation is performed according to the
1970 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1971 *----------------------------------------------------------------------------*/
1972
1973 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1974 {
1975     flag aSign, bSign, zSign;
1976     int16 aExp, bExp, zExp;
1977     uint32_t aSig, bSig, zSig;
1978     a = float32_squash_input_denormal(a STATUS_VAR);
1979     b = float32_squash_input_denormal(b STATUS_VAR);
1980
1981     aSig = extractFloat32Frac( a );
1982     aExp = extractFloat32Exp( a );
1983     aSign = extractFloat32Sign( a );
1984     bSig = extractFloat32Frac( b );
1985     bExp = extractFloat32Exp( b );
1986     bSign = extractFloat32Sign( b );
1987     zSign = aSign ^ bSign;
1988     if ( aExp == 0xFF ) {
1989         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1990         if ( bExp == 0xFF ) {
1991             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1992             float_raise( float_flag_invalid STATUS_VAR);
1993             return float32_default_nan;
1994         }
1995         return packFloat32( zSign, 0xFF, 0 );
1996     }
1997     if ( bExp == 0xFF ) {
1998         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1999         return packFloat32( zSign, 0, 0 );
2000     }
2001     if ( bExp == 0 ) {
2002         if ( bSig == 0 ) {
2003             if ( ( aExp | aSig ) == 0 ) {
2004                 float_raise( float_flag_invalid STATUS_VAR);
2005                 return float32_default_nan;
2006             }
2007             float_raise( float_flag_divbyzero STATUS_VAR);
2008             return packFloat32( zSign, 0xFF, 0 );
2009         }
2010         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2011     }
2012     if ( aExp == 0 ) {
2013         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2014         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2015     }
2016     zExp = aExp - bExp + 0x7D;
2017     aSig = ( aSig | 0x00800000 )<<7;
2018     bSig = ( bSig | 0x00800000 )<<8;
2019     if ( bSig <= ( aSig + aSig ) ) {
2020         aSig >>= 1;
2021         ++zExp;
2022     }
2023     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2024     if ( ( zSig & 0x3F ) == 0 ) {
2025         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2026     }
2027     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2028
2029 }
2030
2031 /*----------------------------------------------------------------------------
2032 | Returns the remainder of the single-precision floating-point value `a'
2033 | with respect to the corresponding value `b'.  The operation is performed
2034 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2035 *----------------------------------------------------------------------------*/
2036
2037 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2038 {
2039     flag aSign, zSign;
2040     int16 aExp, bExp, expDiff;
2041     uint32_t aSig, bSig;
2042     uint32_t q;
2043     uint64_t aSig64, bSig64, q64;
2044     uint32_t alternateASig;
2045     int32_t sigMean;
2046     a = float32_squash_input_denormal(a STATUS_VAR);
2047     b = float32_squash_input_denormal(b STATUS_VAR);
2048
2049     aSig = extractFloat32Frac( a );
2050     aExp = extractFloat32Exp( a );
2051     aSign = extractFloat32Sign( a );
2052     bSig = extractFloat32Frac( b );
2053     bExp = extractFloat32Exp( b );
2054     if ( aExp == 0xFF ) {
2055         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2056             return propagateFloat32NaN( a, b STATUS_VAR );
2057         }
2058         float_raise( float_flag_invalid STATUS_VAR);
2059         return float32_default_nan;
2060     }
2061     if ( bExp == 0xFF ) {
2062         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2063         return a;
2064     }
2065     if ( bExp == 0 ) {
2066         if ( bSig == 0 ) {
2067             float_raise( float_flag_invalid STATUS_VAR);
2068             return float32_default_nan;
2069         }
2070         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2071     }
2072     if ( aExp == 0 ) {
2073         if ( aSig == 0 ) return a;
2074         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2075     }
2076     expDiff = aExp - bExp;
2077     aSig |= 0x00800000;
2078     bSig |= 0x00800000;
2079     if ( expDiff < 32 ) {
2080         aSig <<= 8;
2081         bSig <<= 8;
2082         if ( expDiff < 0 ) {
2083             if ( expDiff < -1 ) return a;
2084             aSig >>= 1;
2085         }
2086         q = ( bSig <= aSig );
2087         if ( q ) aSig -= bSig;
2088         if ( 0 < expDiff ) {
2089             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2090             q >>= 32 - expDiff;
2091             bSig >>= 2;
2092             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2093         }
2094         else {
2095             aSig >>= 2;
2096             bSig >>= 2;
2097         }
2098     }
2099     else {
2100         if ( bSig <= aSig ) aSig -= bSig;
2101         aSig64 = ( (uint64_t) aSig )<<40;
2102         bSig64 = ( (uint64_t) bSig )<<40;
2103         expDiff -= 64;
2104         while ( 0 < expDiff ) {
2105             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2106             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2107             aSig64 = - ( ( bSig * q64 )<<38 );
2108             expDiff -= 62;
2109         }
2110         expDiff += 64;
2111         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2112         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2113         q = q64>>( 64 - expDiff );
2114         bSig <<= 6;
2115         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2116     }
2117     do {
2118         alternateASig = aSig;
2119         ++q;
2120         aSig -= bSig;
2121     } while ( 0 <= (int32_t) aSig );
2122     sigMean = aSig + alternateASig;
2123     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2124         aSig = alternateASig;
2125     }
2126     zSign = ( (int32_t) aSig < 0 );
2127     if ( zSign ) aSig = - aSig;
2128     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2129
2130 }
2131
2132 /*----------------------------------------------------------------------------
2133 | Returns the square root of the single-precision floating-point value `a'.
2134 | The operation is performed according to the IEC/IEEE Standard for Binary
2135 | Floating-Point Arithmetic.
2136 *----------------------------------------------------------------------------*/
2137
2138 float32 float32_sqrt( float32 a STATUS_PARAM )
2139 {
2140     flag aSign;
2141     int16 aExp, zExp;
2142     uint32_t aSig, zSig;
2143     uint64_t rem, term;
2144     a = float32_squash_input_denormal(a STATUS_VAR);
2145
2146     aSig = extractFloat32Frac( a );
2147     aExp = extractFloat32Exp( a );
2148     aSign = extractFloat32Sign( a );
2149     if ( aExp == 0xFF ) {
2150         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2151         if ( ! aSign ) return a;
2152         float_raise( float_flag_invalid STATUS_VAR);
2153         return float32_default_nan;
2154     }
2155     if ( aSign ) {
2156         if ( ( aExp | aSig ) == 0 ) return a;
2157         float_raise( float_flag_invalid STATUS_VAR);
2158         return float32_default_nan;
2159     }
2160     if ( aExp == 0 ) {
2161         if ( aSig == 0 ) return float32_zero;
2162         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2163     }
2164     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2165     aSig = ( aSig | 0x00800000 )<<8;
2166     zSig = estimateSqrt32( aExp, aSig ) + 2;
2167     if ( ( zSig & 0x7F ) <= 5 ) {
2168         if ( zSig < 2 ) {
2169             zSig = 0x7FFFFFFF;
2170             goto roundAndPack;
2171         }
2172         aSig >>= aExp & 1;
2173         term = ( (uint64_t) zSig ) * zSig;
2174         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2175         while ( (int64_t) rem < 0 ) {
2176             --zSig;
2177             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2178         }
2179         zSig |= ( rem != 0 );
2180     }
2181     shift32RightJamming( zSig, 1, &zSig );
2182  roundAndPack:
2183     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2184
2185 }
2186
2187 /*----------------------------------------------------------------------------
2188 | Returns the binary exponential of the single-precision floating-point value
2189 | `a'. The operation is performed according to the IEC/IEEE Standard for
2190 | Binary Floating-Point Arithmetic.
2191 |
2192 | Uses the following identities:
2193 |
2194 | 1. -------------------------------------------------------------------------
2195 |      x    x*ln(2)
2196 |     2  = e
2197 |
2198 | 2. -------------------------------------------------------------------------
2199 |                      2     3     4     5           n
2200 |      x        x     x     x     x     x           x
2201 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2202 |               1!    2!    3!    4!    5!          n!
2203 *----------------------------------------------------------------------------*/
2204
2205 static const float64 float32_exp2_coefficients[15] =
2206 {
2207     const_float64( 0x3ff0000000000000ll ), /*  1 */
2208     const_float64( 0x3fe0000000000000ll ), /*  2 */
2209     const_float64( 0x3fc5555555555555ll ), /*  3 */
2210     const_float64( 0x3fa5555555555555ll ), /*  4 */
2211     const_float64( 0x3f81111111111111ll ), /*  5 */
2212     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2213     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2214     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2215     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2216     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2217     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2218     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2219     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2220     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2221     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2222 };
2223
2224 float32 float32_exp2( float32 a STATUS_PARAM )
2225 {
2226     flag aSign;
2227     int16 aExp;
2228     uint32_t aSig;
2229     float64 r, x, xn;
2230     int i;
2231     a = float32_squash_input_denormal(a STATUS_VAR);
2232
2233     aSig = extractFloat32Frac( a );
2234     aExp = extractFloat32Exp( a );
2235     aSign = extractFloat32Sign( a );
2236
2237     if ( aExp == 0xFF) {
2238         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2239         return (aSign) ? float32_zero : a;
2240     }
2241     if (aExp == 0) {
2242         if (aSig == 0) return float32_one;
2243     }
2244
2245     float_raise( float_flag_inexact STATUS_VAR);
2246
2247     /* ******************************* */
2248     /* using float64 for approximation */
2249     /* ******************************* */
2250     x = float32_to_float64(a STATUS_VAR);
2251     x = float64_mul(x, float64_ln2 STATUS_VAR);
2252
2253     xn = x;
2254     r = float64_one;
2255     for (i = 0 ; i < 15 ; i++) {
2256         float64 f;
2257
2258         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2259         r = float64_add(r, f STATUS_VAR);
2260
2261         xn = float64_mul(xn, x STATUS_VAR);
2262     }
2263
2264     return float64_to_float32(r, status);
2265 }
2266
2267 /*----------------------------------------------------------------------------
2268 | Returns the binary log of the single-precision floating-point value `a'.
2269 | The operation is performed according to the IEC/IEEE Standard for Binary
2270 | Floating-Point Arithmetic.
2271 *----------------------------------------------------------------------------*/
2272 float32 float32_log2( float32 a STATUS_PARAM )
2273 {
2274     flag aSign, zSign;
2275     int16 aExp;
2276     uint32_t aSig, zSig, i;
2277
2278     a = float32_squash_input_denormal(a STATUS_VAR);
2279     aSig = extractFloat32Frac( a );
2280     aExp = extractFloat32Exp( a );
2281     aSign = extractFloat32Sign( a );
2282
2283     if ( aExp == 0 ) {
2284         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2285         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2286     }
2287     if ( aSign ) {
2288         float_raise( float_flag_invalid STATUS_VAR);
2289         return float32_default_nan;
2290     }
2291     if ( aExp == 0xFF ) {
2292         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2293         return a;
2294     }
2295
2296     aExp -= 0x7F;
2297     aSig |= 0x00800000;
2298     zSign = aExp < 0;
2299     zSig = aExp << 23;
2300
2301     for (i = 1 << 22; i > 0; i >>= 1) {
2302         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2303         if ( aSig & 0x01000000 ) {
2304             aSig >>= 1;
2305             zSig |= i;
2306         }
2307     }
2308
2309     if ( zSign )
2310         zSig = -zSig;
2311
2312     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2313 }
2314
2315 /*----------------------------------------------------------------------------
2316 | Returns 1 if the single-precision floating-point value `a' is equal to
2317 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2318 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2319 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2320 *----------------------------------------------------------------------------*/
2321
2322 int float32_eq( float32 a, float32 b STATUS_PARAM )
2323 {
2324     uint32_t av, bv;
2325     a = float32_squash_input_denormal(a STATUS_VAR);
2326     b = float32_squash_input_denormal(b STATUS_VAR);
2327
2328     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2329          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2330        ) {
2331         float_raise( float_flag_invalid STATUS_VAR);
2332         return 0;
2333     }
2334     av = float32_val(a);
2335     bv = float32_val(b);
2336     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2337 }
2338
2339 /*----------------------------------------------------------------------------
2340 | Returns 1 if the single-precision floating-point value `a' is less than
2341 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2342 | exception is raised if either operand is a NaN.  The comparison is performed
2343 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2344 *----------------------------------------------------------------------------*/
2345
2346 int float32_le( float32 a, float32 b STATUS_PARAM )
2347 {
2348     flag aSign, bSign;
2349     uint32_t av, bv;
2350     a = float32_squash_input_denormal(a STATUS_VAR);
2351     b = float32_squash_input_denormal(b STATUS_VAR);
2352
2353     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2354          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2355        ) {
2356         float_raise( float_flag_invalid STATUS_VAR);
2357         return 0;
2358     }
2359     aSign = extractFloat32Sign( a );
2360     bSign = extractFloat32Sign( b );
2361     av = float32_val(a);
2362     bv = float32_val(b);
2363     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2364     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2365
2366 }
2367
2368 /*----------------------------------------------------------------------------
2369 | Returns 1 if the single-precision floating-point value `a' is less than
2370 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2371 | raised if either operand is a NaN.  The comparison is performed according
2372 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2373 *----------------------------------------------------------------------------*/
2374
2375 int float32_lt( float32 a, float32 b STATUS_PARAM )
2376 {
2377     flag aSign, bSign;
2378     uint32_t av, bv;
2379     a = float32_squash_input_denormal(a STATUS_VAR);
2380     b = float32_squash_input_denormal(b STATUS_VAR);
2381
2382     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2383          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2384        ) {
2385         float_raise( float_flag_invalid STATUS_VAR);
2386         return 0;
2387     }
2388     aSign = extractFloat32Sign( a );
2389     bSign = extractFloat32Sign( b );
2390     av = float32_val(a);
2391     bv = float32_val(b);
2392     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2393     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2394
2395 }
2396
2397 /*----------------------------------------------------------------------------
2398 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2399 | be compared, and 0 otherwise.  The invalid exception is raised if either
2400 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2401 | Standard for Binary Floating-Point Arithmetic.
2402 *----------------------------------------------------------------------------*/
2403
2404 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2405 {
2406     a = float32_squash_input_denormal(a STATUS_VAR);
2407     b = float32_squash_input_denormal(b STATUS_VAR);
2408
2409     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2410          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2411        ) {
2412         float_raise( float_flag_invalid STATUS_VAR);
2413         return 1;
2414     }
2415     return 0;
2416 }
2417
2418 /*----------------------------------------------------------------------------
2419 | Returns 1 if the single-precision floating-point value `a' is equal to
2420 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2421 | exception.  The comparison is performed according to the IEC/IEEE Standard
2422 | for Binary Floating-Point Arithmetic.
2423 *----------------------------------------------------------------------------*/
2424
2425 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2426 {
2427     a = float32_squash_input_denormal(a STATUS_VAR);
2428     b = float32_squash_input_denormal(b STATUS_VAR);
2429
2430     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2431          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2432        ) {
2433         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2434             float_raise( float_flag_invalid STATUS_VAR);
2435         }
2436         return 0;
2437     }
2438     return ( float32_val(a) == float32_val(b) ) ||
2439             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2440 }
2441
2442 /*----------------------------------------------------------------------------
2443 | Returns 1 if the single-precision floating-point value `a' is less than or
2444 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2445 | cause an exception.  Otherwise, the comparison is performed according to the
2446 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2447 *----------------------------------------------------------------------------*/
2448
2449 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2450 {
2451     flag aSign, bSign;
2452     uint32_t av, bv;
2453     a = float32_squash_input_denormal(a STATUS_VAR);
2454     b = float32_squash_input_denormal(b STATUS_VAR);
2455
2456     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2457          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2458        ) {
2459         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2460             float_raise( float_flag_invalid STATUS_VAR);
2461         }
2462         return 0;
2463     }
2464     aSign = extractFloat32Sign( a );
2465     bSign = extractFloat32Sign( b );
2466     av = float32_val(a);
2467     bv = float32_val(b);
2468     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2469     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2470
2471 }
2472
2473 /*----------------------------------------------------------------------------
2474 | Returns 1 if the single-precision floating-point value `a' is less than
2475 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2476 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2477 | Standard for Binary Floating-Point Arithmetic.
2478 *----------------------------------------------------------------------------*/
2479
2480 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2481 {
2482     flag aSign, bSign;
2483     uint32_t av, bv;
2484     a = float32_squash_input_denormal(a STATUS_VAR);
2485     b = float32_squash_input_denormal(b STATUS_VAR);
2486
2487     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2488          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2489        ) {
2490         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2491             float_raise( float_flag_invalid STATUS_VAR);
2492         }
2493         return 0;
2494     }
2495     aSign = extractFloat32Sign( a );
2496     bSign = extractFloat32Sign( b );
2497     av = float32_val(a);
2498     bv = float32_val(b);
2499     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2500     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2501
2502 }
2503
2504 /*----------------------------------------------------------------------------
2505 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2506 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2507 | comparison is performed according to the IEC/IEEE Standard for Binary
2508 | Floating-Point Arithmetic.
2509 *----------------------------------------------------------------------------*/
2510
2511 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2512 {
2513     a = float32_squash_input_denormal(a STATUS_VAR);
2514     b = float32_squash_input_denormal(b STATUS_VAR);
2515
2516     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2517          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2518        ) {
2519         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2520             float_raise( float_flag_invalid STATUS_VAR);
2521         }
2522         return 1;
2523     }
2524     return 0;
2525 }
2526
2527 /*----------------------------------------------------------------------------
2528 | Returns the result of converting the double-precision floating-point value
2529 | `a' to the 32-bit two's complement integer format.  The conversion is
2530 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2531 | Arithmetic---which means in particular that the conversion is rounded
2532 | according to the current rounding mode.  If `a' is a NaN, the largest
2533 | positive integer is returned.  Otherwise, if the conversion overflows, the
2534 | largest integer with the same sign as `a' is returned.
2535 *----------------------------------------------------------------------------*/
2536
2537 int32 float64_to_int32( float64 a STATUS_PARAM )
2538 {
2539     flag aSign;
2540     int16 aExp, shiftCount;
2541     uint64_t aSig;
2542     a = float64_squash_input_denormal(a STATUS_VAR);
2543
2544     aSig = extractFloat64Frac( a );
2545     aExp = extractFloat64Exp( a );
2546     aSign = extractFloat64Sign( a );
2547     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2548     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2549     shiftCount = 0x42C - aExp;
2550     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2551     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2552
2553 }
2554
2555 /*----------------------------------------------------------------------------
2556 | Returns the result of converting the double-precision floating-point value
2557 | `a' to the 32-bit two's complement integer format.  The conversion is
2558 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2559 | Arithmetic, except that the conversion is always rounded toward zero.
2560 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2561 | the conversion overflows, the largest integer with the same sign as `a' is
2562 | returned.
2563 *----------------------------------------------------------------------------*/
2564
2565 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2566 {
2567     flag aSign;
2568     int16 aExp, shiftCount;
2569     uint64_t aSig, savedASig;
2570     int32 z;
2571     a = float64_squash_input_denormal(a STATUS_VAR);
2572
2573     aSig = extractFloat64Frac( a );
2574     aExp = extractFloat64Exp( a );
2575     aSign = extractFloat64Sign( a );
2576     if ( 0x41E < aExp ) {
2577         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2578         goto invalid;
2579     }
2580     else if ( aExp < 0x3FF ) {
2581         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2582         return 0;
2583     }
2584     aSig |= LIT64( 0x0010000000000000 );
2585     shiftCount = 0x433 - aExp;
2586     savedASig = aSig;
2587     aSig >>= shiftCount;
2588     z = aSig;
2589     if ( aSign ) z = - z;
2590     if ( ( z < 0 ) ^ aSign ) {
2591  invalid:
2592         float_raise( float_flag_invalid STATUS_VAR);
2593         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2594     }
2595     if ( ( aSig<<shiftCount ) != savedASig ) {
2596         STATUS(float_exception_flags) |= float_flag_inexact;
2597     }
2598     return z;
2599
2600 }
2601
2602 /*----------------------------------------------------------------------------
2603 | Returns the result of converting the double-precision floating-point value
2604 | `a' to the 16-bit two's complement integer format.  The conversion is
2605 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2606 | Arithmetic, except that the conversion is always rounded toward zero.
2607 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2608 | the conversion overflows, the largest integer with the same sign as `a' is
2609 | returned.
2610 *----------------------------------------------------------------------------*/
2611
2612 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2613 {
2614     flag aSign;
2615     int16 aExp, shiftCount;
2616     uint64_t aSig, savedASig;
2617     int32 z;
2618
2619     aSig = extractFloat64Frac( a );
2620     aExp = extractFloat64Exp( a );
2621     aSign = extractFloat64Sign( a );
2622     if ( 0x40E < aExp ) {
2623         if ( ( aExp == 0x7FF ) && aSig ) {
2624             aSign = 0;
2625         }
2626         goto invalid;
2627     }
2628     else if ( aExp < 0x3FF ) {
2629         if ( aExp || aSig ) {
2630             STATUS(float_exception_flags) |= float_flag_inexact;
2631         }
2632         return 0;
2633     }
2634     aSig |= LIT64( 0x0010000000000000 );
2635     shiftCount = 0x433 - aExp;
2636     savedASig = aSig;
2637     aSig >>= shiftCount;
2638     z = aSig;
2639     if ( aSign ) {
2640         z = - z;
2641     }
2642     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2643  invalid:
2644         float_raise( float_flag_invalid STATUS_VAR);
2645         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2646     }
2647     if ( ( aSig<<shiftCount ) != savedASig ) {
2648         STATUS(float_exception_flags) |= float_flag_inexact;
2649     }
2650     return z;
2651 }
2652
2653 /*----------------------------------------------------------------------------
2654 | Returns the result of converting the double-precision floating-point value
2655 | `a' to the 64-bit two's complement integer format.  The conversion is
2656 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2657 | Arithmetic---which means in particular that the conversion is rounded
2658 | according to the current rounding mode.  If `a' is a NaN, the largest
2659 | positive integer is returned.  Otherwise, if the conversion overflows, the
2660 | largest integer with the same sign as `a' is returned.
2661 *----------------------------------------------------------------------------*/
2662
2663 int64 float64_to_int64( float64 a STATUS_PARAM )
2664 {
2665     flag aSign;
2666     int16 aExp, shiftCount;
2667     uint64_t aSig, aSigExtra;
2668     a = float64_squash_input_denormal(a STATUS_VAR);
2669
2670     aSig = extractFloat64Frac( a );
2671     aExp = extractFloat64Exp( a );
2672     aSign = extractFloat64Sign( a );
2673     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2674     shiftCount = 0x433 - aExp;
2675     if ( shiftCount <= 0 ) {
2676         if ( 0x43E < aExp ) {
2677             float_raise( float_flag_invalid STATUS_VAR);
2678             if (    ! aSign
2679                  || (    ( aExp == 0x7FF )
2680                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2681                ) {
2682                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2683             }
2684             return (int64_t) LIT64( 0x8000000000000000 );
2685         }
2686         aSigExtra = 0;
2687         aSig <<= - shiftCount;
2688     }
2689     else {
2690         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2691     }
2692     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2693
2694 }
2695
2696 /*----------------------------------------------------------------------------
2697 | Returns the result of converting the double-precision floating-point value
2698 | `a' to the 64-bit two's complement integer format.  The conversion is
2699 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2700 | Arithmetic, except that the conversion is always rounded toward zero.
2701 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2702 | the conversion overflows, the largest integer with the same sign as `a' is
2703 | returned.
2704 *----------------------------------------------------------------------------*/
2705
2706 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2707 {
2708     flag aSign;
2709     int16 aExp, shiftCount;
2710     uint64_t aSig;
2711     int64 z;
2712     a = float64_squash_input_denormal(a STATUS_VAR);
2713
2714     aSig = extractFloat64Frac( a );
2715     aExp = extractFloat64Exp( a );
2716     aSign = extractFloat64Sign( a );
2717     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2718     shiftCount = aExp - 0x433;
2719     if ( 0 <= shiftCount ) {
2720         if ( 0x43E <= aExp ) {
2721             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2722                 float_raise( float_flag_invalid STATUS_VAR);
2723                 if (    ! aSign
2724                      || (    ( aExp == 0x7FF )
2725                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2726                    ) {
2727                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2728                 }
2729             }
2730             return (int64_t) LIT64( 0x8000000000000000 );
2731         }
2732         z = aSig<<shiftCount;
2733     }
2734     else {
2735         if ( aExp < 0x3FE ) {
2736             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2737             return 0;
2738         }
2739         z = aSig>>( - shiftCount );
2740         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2741             STATUS(float_exception_flags) |= float_flag_inexact;
2742         }
2743     }
2744     if ( aSign ) z = - z;
2745     return z;
2746
2747 }
2748
2749 /*----------------------------------------------------------------------------
2750 | Returns the result of converting the double-precision floating-point value
2751 | `a' to the single-precision floating-point format.  The conversion is
2752 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2753 | Arithmetic.
2754 *----------------------------------------------------------------------------*/
2755
2756 float32 float64_to_float32( float64 a STATUS_PARAM )
2757 {
2758     flag aSign;
2759     int16 aExp;
2760     uint64_t aSig;
2761     uint32_t zSig;
2762     a = float64_squash_input_denormal(a STATUS_VAR);
2763
2764     aSig = extractFloat64Frac( a );
2765     aExp = extractFloat64Exp( a );
2766     aSign = extractFloat64Sign( a );
2767     if ( aExp == 0x7FF ) {
2768         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2769         return packFloat32( aSign, 0xFF, 0 );
2770     }
2771     shift64RightJamming( aSig, 22, &aSig );
2772     zSig = aSig;
2773     if ( aExp || zSig ) {
2774         zSig |= 0x40000000;
2775         aExp -= 0x381;
2776     }
2777     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2778
2779 }
2780
2781
2782 /*----------------------------------------------------------------------------
2783 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2784 | half-precision floating-point value, returning the result.  After being
2785 | shifted into the proper positions, the three fields are simply added
2786 | together to form the result.  This means that any integer portion of `zSig'
2787 | will be added into the exponent.  Since a properly normalized significand
2788 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2789 | than the desired result exponent whenever `zSig' is a complete, normalized
2790 | significand.
2791 *----------------------------------------------------------------------------*/
2792 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2793 {
2794     return make_float16(
2795         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2796 }
2797
2798 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2799    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2800
2801 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2802 {
2803     flag aSign;
2804     int16 aExp;
2805     uint32_t aSig;
2806
2807     aSign = extractFloat16Sign(a);
2808     aExp = extractFloat16Exp(a);
2809     aSig = extractFloat16Frac(a);
2810
2811     if (aExp == 0x1f && ieee) {
2812         if (aSig) {
2813             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2814         }
2815         return packFloat32(aSign, 0xff, aSig << 13);
2816     }
2817     if (aExp == 0) {
2818         int8 shiftCount;
2819
2820         if (aSig == 0) {
2821             return packFloat32(aSign, 0, 0);
2822         }
2823
2824         shiftCount = countLeadingZeros32( aSig ) - 21;
2825         aSig = aSig << shiftCount;
2826         aExp = -shiftCount;
2827     }
2828     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2829 }
2830
2831 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2832 {
2833     flag aSign;
2834     int16 aExp;
2835     uint32_t aSig;
2836     uint32_t mask;
2837     uint32_t increment;
2838     int8 roundingMode;
2839     a = float32_squash_input_denormal(a STATUS_VAR);
2840
2841     aSig = extractFloat32Frac( a );
2842     aExp = extractFloat32Exp( a );
2843     aSign = extractFloat32Sign( a );
2844     if ( aExp == 0xFF ) {
2845         if (aSig) {
2846             /* Input is a NaN */
2847             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2848             if (!ieee) {
2849                 return packFloat16(aSign, 0, 0);
2850             }
2851             return r;
2852         }
2853         /* Infinity */
2854         if (!ieee) {
2855             float_raise(float_flag_invalid STATUS_VAR);
2856             return packFloat16(aSign, 0x1f, 0x3ff);
2857         }
2858         return packFloat16(aSign, 0x1f, 0);
2859     }
2860     if (aExp == 0 && aSig == 0) {
2861         return packFloat16(aSign, 0, 0);
2862     }
2863     /* Decimal point between bits 22 and 23.  */
2864     aSig |= 0x00800000;
2865     aExp -= 0x7f;
2866     if (aExp < -14) {
2867         mask = 0x00ffffff;
2868         if (aExp >= -24) {
2869             mask >>= 25 + aExp;
2870         }
2871     } else {
2872         mask = 0x00001fff;
2873     }
2874     if (aSig & mask) {
2875         float_raise( float_flag_underflow STATUS_VAR );
2876         roundingMode = STATUS(float_rounding_mode);
2877         switch (roundingMode) {
2878         case float_round_nearest_even:
2879             increment = (mask + 1) >> 1;
2880             if ((aSig & mask) == increment) {
2881                 increment = aSig & (increment << 1);
2882             }
2883             break;
2884         case float_round_up:
2885             increment = aSign ? 0 : mask;
2886             break;
2887         case float_round_down:
2888             increment = aSign ? mask : 0;
2889             break;
2890         default: /* round_to_zero */
2891             increment = 0;
2892             break;
2893         }
2894         aSig += increment;
2895         if (aSig >= 0x01000000) {
2896             aSig >>= 1;
2897             aExp++;
2898         }
2899     } else if (aExp < -14
2900           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2901         float_raise( float_flag_underflow STATUS_VAR);
2902     }
2903
2904     if (ieee) {
2905         if (aExp > 15) {
2906             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2907             return packFloat16(aSign, 0x1f, 0);
2908         }
2909     } else {
2910         if (aExp > 16) {
2911             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2912             return packFloat16(aSign, 0x1f, 0x3ff);
2913         }
2914     }
2915     if (aExp < -24) {
2916         return packFloat16(aSign, 0, 0);
2917     }
2918     if (aExp < -14) {
2919         aSig >>= -14 - aExp;
2920         aExp = -14;
2921     }
2922     return packFloat16(aSign, aExp + 14, aSig >> 13);
2923 }
2924
2925 #ifdef FLOATX80
2926
2927 /*----------------------------------------------------------------------------
2928 | Returns the result of converting the double-precision floating-point value
2929 | `a' to the extended double-precision floating-point format.  The conversion
2930 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2931 | Arithmetic.
2932 *----------------------------------------------------------------------------*/
2933
2934 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2935 {
2936     flag aSign;
2937     int16 aExp;
2938     uint64_t aSig;
2939
2940     a = float64_squash_input_denormal(a STATUS_VAR);
2941     aSig = extractFloat64Frac( a );
2942     aExp = extractFloat64Exp( a );
2943     aSign = extractFloat64Sign( a );
2944     if ( aExp == 0x7FF ) {
2945         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2946         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2947     }
2948     if ( aExp == 0 ) {
2949         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2950         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2951     }
2952     return
2953         packFloatx80(
2954             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2955
2956 }
2957
2958 #endif
2959
2960 #ifdef FLOAT128
2961
2962 /*----------------------------------------------------------------------------
2963 | Returns the result of converting the double-precision floating-point value
2964 | `a' to the quadruple-precision floating-point format.  The conversion is
2965 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2966 | Arithmetic.
2967 *----------------------------------------------------------------------------*/
2968
2969 float128 float64_to_float128( float64 a STATUS_PARAM )
2970 {
2971     flag aSign;
2972     int16 aExp;
2973     uint64_t aSig, zSig0, zSig1;
2974
2975     a = float64_squash_input_denormal(a STATUS_VAR);
2976     aSig = extractFloat64Frac( a );
2977     aExp = extractFloat64Exp( a );
2978     aSign = extractFloat64Sign( a );
2979     if ( aExp == 0x7FF ) {
2980         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2981         return packFloat128( aSign, 0x7FFF, 0, 0 );
2982     }
2983     if ( aExp == 0 ) {
2984         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2985         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2986         --aExp;
2987     }
2988     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2989     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2990
2991 }
2992
2993 #endif
2994
2995 /*----------------------------------------------------------------------------
2996 | Rounds the double-precision floating-point value `a' to an integer, and
2997 | returns the result as a double-precision floating-point value.  The
2998 | operation is performed according to the IEC/IEEE Standard for Binary
2999 | Floating-Point Arithmetic.
3000 *----------------------------------------------------------------------------*/
3001
3002 float64 float64_round_to_int( float64 a STATUS_PARAM )
3003 {
3004     flag aSign;
3005     int16 aExp;
3006     uint64_t lastBitMask, roundBitsMask;
3007     int8 roundingMode;
3008     uint64_t z;
3009     a = float64_squash_input_denormal(a STATUS_VAR);
3010
3011     aExp = extractFloat64Exp( a );
3012     if ( 0x433 <= aExp ) {
3013         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3014             return propagateFloat64NaN( a, a STATUS_VAR );
3015         }
3016         return a;
3017     }
3018     if ( aExp < 0x3FF ) {
3019         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3020         STATUS(float_exception_flags) |= float_flag_inexact;
3021         aSign = extractFloat64Sign( a );
3022         switch ( STATUS(float_rounding_mode) ) {
3023          case float_round_nearest_even:
3024             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3025                 return packFloat64( aSign, 0x3FF, 0 );
3026             }
3027             break;
3028          case float_round_down:
3029             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3030          case float_round_up:
3031             return make_float64(
3032             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3033         }
3034         return packFloat64( aSign, 0, 0 );
3035     }
3036     lastBitMask = 1;
3037     lastBitMask <<= 0x433 - aExp;
3038     roundBitsMask = lastBitMask - 1;
3039     z = float64_val(a);
3040     roundingMode = STATUS(float_rounding_mode);
3041     if ( roundingMode == float_round_nearest_even ) {
3042         z += lastBitMask>>1;
3043         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3044     }
3045     else if ( roundingMode != float_round_to_zero ) {
3046         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3047             z += roundBitsMask;
3048         }
3049     }
3050     z &= ~ roundBitsMask;
3051     if ( z != float64_val(a) )
3052         STATUS(float_exception_flags) |= float_flag_inexact;
3053     return make_float64(z);
3054
3055 }
3056
3057 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3058 {
3059     int oldmode;
3060     float64 res;
3061     oldmode = STATUS(float_rounding_mode);
3062     STATUS(float_rounding_mode) = float_round_to_zero;
3063     res = float64_round_to_int(a STATUS_VAR);
3064     STATUS(float_rounding_mode) = oldmode;
3065     return res;
3066 }
3067
3068 /*----------------------------------------------------------------------------
3069 | Returns the result of adding the absolute values of the double-precision
3070 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3071 | before being returned.  `zSign' is ignored if the result is a NaN.
3072 | The addition is performed according to the IEC/IEEE Standard for Binary
3073 | Floating-Point Arithmetic.
3074 *----------------------------------------------------------------------------*/
3075
3076 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3077 {
3078     int16 aExp, bExp, zExp;
3079     uint64_t aSig, bSig, zSig;
3080     int16 expDiff;
3081
3082     aSig = extractFloat64Frac( a );
3083     aExp = extractFloat64Exp( a );
3084     bSig = extractFloat64Frac( b );
3085     bExp = extractFloat64Exp( b );
3086     expDiff = aExp - bExp;
3087     aSig <<= 9;
3088     bSig <<= 9;
3089     if ( 0 < expDiff ) {
3090         if ( aExp == 0x7FF ) {
3091             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3092             return a;
3093         }
3094         if ( bExp == 0 ) {
3095             --expDiff;
3096         }
3097         else {
3098             bSig |= LIT64( 0x2000000000000000 );
3099         }
3100         shift64RightJamming( bSig, expDiff, &bSig );
3101         zExp = aExp;
3102     }
3103     else if ( expDiff < 0 ) {
3104         if ( bExp == 0x7FF ) {
3105             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3106             return packFloat64( zSign, 0x7FF, 0 );
3107         }
3108         if ( aExp == 0 ) {
3109             ++expDiff;
3110         }
3111         else {
3112             aSig |= LIT64( 0x2000000000000000 );
3113         }
3114         shift64RightJamming( aSig, - expDiff, &aSig );
3115         zExp = bExp;
3116     }
3117     else {
3118         if ( aExp == 0x7FF ) {
3119             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3120             return a;
3121         }
3122         if ( aExp == 0 ) {
3123             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3124             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3125         }
3126         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3127         zExp = aExp;
3128         goto roundAndPack;
3129     }
3130     aSig |= LIT64( 0x2000000000000000 );
3131     zSig = ( aSig + bSig )<<1;
3132     --zExp;
3133     if ( (int64_t) zSig < 0 ) {
3134         zSig = aSig + bSig;
3135         ++zExp;
3136     }
3137  roundAndPack:
3138     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3139
3140 }
3141
3142 /*----------------------------------------------------------------------------
3143 | Returns the result of subtracting the absolute values of the double-
3144 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3145 | difference is negated before being returned.  `zSign' is ignored if the
3146 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3147 | Standard for Binary Floating-Point Arithmetic.
3148 *----------------------------------------------------------------------------*/
3149
3150 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3151 {
3152     int16 aExp, bExp, zExp;
3153     uint64_t aSig, bSig, zSig;
3154     int16 expDiff;
3155
3156     aSig = extractFloat64Frac( a );
3157     aExp = extractFloat64Exp( a );
3158     bSig = extractFloat64Frac( b );
3159     bExp = extractFloat64Exp( b );
3160     expDiff = aExp - bExp;
3161     aSig <<= 10;
3162     bSig <<= 10;
3163     if ( 0 < expDiff ) goto aExpBigger;
3164     if ( expDiff < 0 ) goto bExpBigger;
3165     if ( aExp == 0x7FF ) {
3166         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3167         float_raise( float_flag_invalid STATUS_VAR);
3168         return float64_default_nan;
3169     }
3170     if ( aExp == 0 ) {
3171         aExp = 1;
3172         bExp = 1;
3173     }
3174     if ( bSig < aSig ) goto aBigger;
3175     if ( aSig < bSig ) goto bBigger;
3176     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3177  bExpBigger:
3178     if ( bExp == 0x7FF ) {
3179         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3180         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3181     }
3182     if ( aExp == 0 ) {
3183         ++expDiff;
3184     }
3185     else {
3186         aSig |= LIT64( 0x4000000000000000 );
3187     }
3188     shift64RightJamming( aSig, - expDiff, &aSig );
3189     bSig |= LIT64( 0x4000000000000000 );
3190  bBigger:
3191     zSig = bSig - aSig;
3192     zExp = bExp;
3193     zSign ^= 1;
3194     goto normalizeRoundAndPack;
3195  aExpBigger:
3196     if ( aExp == 0x7FF ) {
3197         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3198         return a;
3199     }
3200     if ( bExp == 0 ) {
3201         --expDiff;
3202     }
3203     else {
3204         bSig |= LIT64( 0x4000000000000000 );
3205     }
3206     shift64RightJamming( bSig, expDiff, &bSig );
3207     aSig |= LIT64( 0x4000000000000000 );
3208  aBigger:
3209     zSig = aSig - bSig;
3210     zExp = aExp;
3211  normalizeRoundAndPack:
3212     --zExp;
3213     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3214
3215 }
3216
3217 /*----------------------------------------------------------------------------
3218 | Returns the result of adding the double-precision floating-point values `a'
3219 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3220 | Binary Floating-Point Arithmetic.
3221 *----------------------------------------------------------------------------*/
3222
3223 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3224 {
3225     flag aSign, bSign;
3226     a = float64_squash_input_denormal(a STATUS_VAR);
3227     b = float64_squash_input_denormal(b STATUS_VAR);
3228
3229     aSign = extractFloat64Sign( a );
3230     bSign = extractFloat64Sign( b );
3231     if ( aSign == bSign ) {
3232         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3233     }
3234     else {
3235         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3236     }
3237
3238 }
3239
3240 /*----------------------------------------------------------------------------
3241 | Returns the result of subtracting the double-precision floating-point values
3242 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3243 | for Binary Floating-Point Arithmetic.
3244 *----------------------------------------------------------------------------*/
3245
3246 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3247 {
3248     flag aSign, bSign;
3249     a = float64_squash_input_denormal(a STATUS_VAR);
3250     b = float64_squash_input_denormal(b STATUS_VAR);
3251
3252     aSign = extractFloat64Sign( a );
3253     bSign = extractFloat64Sign( b );
3254     if ( aSign == bSign ) {
3255         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3256     }
3257     else {
3258         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3259     }
3260
3261 }
3262
3263 /*----------------------------------------------------------------------------
3264 | Returns the result of multiplying the double-precision floating-point values
3265 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3266 | for Binary Floating-Point Arithmetic.
3267 *----------------------------------------------------------------------------*/
3268
3269 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3270 {
3271     flag aSign, bSign, zSign;
3272     int16 aExp, bExp, zExp;
3273     uint64_t aSig, bSig, zSig0, zSig1;
3274
3275     a = float64_squash_input_denormal(a STATUS_VAR);
3276     b = float64_squash_input_denormal(b STATUS_VAR);
3277
3278     aSig = extractFloat64Frac( a );
3279     aExp = extractFloat64Exp( a );
3280     aSign = extractFloat64Sign( a );
3281     bSig = extractFloat64Frac( b );
3282     bExp = extractFloat64Exp( b );
3283     bSign = extractFloat64Sign( b );
3284     zSign = aSign ^ bSign;
3285     if ( aExp == 0x7FF ) {
3286         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3287             return propagateFloat64NaN( a, b STATUS_VAR );
3288         }
3289         if ( ( bExp | bSig ) == 0 ) {
3290             float_raise( float_flag_invalid STATUS_VAR);
3291             return float64_default_nan;
3292         }
3293         return packFloat64( zSign, 0x7FF, 0 );
3294     }
3295     if ( bExp == 0x7FF ) {
3296         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3297         if ( ( aExp | aSig ) == 0 ) {
3298             float_raise( float_flag_invalid STATUS_VAR);
3299             return float64_default_nan;
3300         }
3301         return packFloat64( zSign, 0x7FF, 0 );
3302     }
3303     if ( aExp == 0 ) {
3304         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3305         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3306     }
3307     if ( bExp == 0 ) {
3308         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3309         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3310     }
3311     zExp = aExp + bExp - 0x3FF;
3312     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3313     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3314     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3315     zSig0 |= ( zSig1 != 0 );
3316     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3317         zSig0 <<= 1;
3318         --zExp;
3319     }
3320     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3321
3322 }
3323
3324 /*----------------------------------------------------------------------------
3325 | Returns the result of dividing the double-precision floating-point value `a'
3326 | by the corresponding value `b'.  The operation is performed according to
3327 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3328 *----------------------------------------------------------------------------*/
3329
3330 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3331 {
3332     flag aSign, bSign, zSign;
3333     int16 aExp, bExp, zExp;
3334     uint64_t aSig, bSig, zSig;
3335     uint64_t rem0, rem1;
3336     uint64_t term0, term1;
3337     a = float64_squash_input_denormal(a STATUS_VAR);
3338     b = float64_squash_input_denormal(b STATUS_VAR);
3339
3340     aSig = extractFloat64Frac( a );
3341     aExp = extractFloat64Exp( a );
3342     aSign = extractFloat64Sign( a );
3343     bSig = extractFloat64Frac( b );
3344     bExp = extractFloat64Exp( b );
3345     bSign = extractFloat64Sign( b );
3346     zSign = aSign ^ bSign;
3347     if ( aExp == 0x7FF ) {
3348         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3349         if ( bExp == 0x7FF ) {
3350             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3351             float_raise( float_flag_invalid STATUS_VAR);
3352             return float64_default_nan;
3353         }
3354         return packFloat64( zSign, 0x7FF, 0 );
3355     }
3356     if ( bExp == 0x7FF ) {
3357         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3358         return packFloat64( zSign, 0, 0 );
3359     }
3360     if ( bExp == 0 ) {
3361         if ( bSig == 0 ) {
3362             if ( ( aExp | aSig ) == 0 ) {
3363                 float_raise( float_flag_invalid STATUS_VAR);
3364                 return float64_default_nan;
3365             }
3366             float_raise( float_flag_divbyzero STATUS_VAR);
3367             return packFloat64( zSign, 0x7FF, 0 );
3368         }
3369         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3370     }
3371     if ( aExp == 0 ) {
3372         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3373         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3374     }
3375     zExp = aExp - bExp + 0x3FD;
3376     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3377     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3378     if ( bSig <= ( aSig + aSig ) ) {
3379         aSig >>= 1;
3380         ++zExp;
3381     }
3382     zSig = estimateDiv128To64( aSig, 0, bSig );
3383     if ( ( zSig & 0x1FF ) <= 2 ) {
3384         mul64To128( bSig, zSig, &term0, &term1 );
3385         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3386         while ( (int64_t) rem0 < 0 ) {
3387             --zSig;
3388             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3389         }
3390         zSig |= ( rem1 != 0 );
3391     }
3392     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3393
3394 }
3395
3396 /*----------------------------------------------------------------------------
3397 | Returns the remainder of the double-precision floating-point value `a'
3398 | with respect to the corresponding value `b'.  The operation is performed
3399 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3400 *----------------------------------------------------------------------------*/
3401
3402 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3403 {
3404     flag aSign, zSign;
3405     int16 aExp, bExp, expDiff;
3406     uint64_t aSig, bSig;
3407     uint64_t q, alternateASig;
3408     int64_t sigMean;
3409
3410     a = float64_squash_input_denormal(a STATUS_VAR);
3411     b = float64_squash_input_denormal(b STATUS_VAR);
3412     aSig = extractFloat64Frac( a );
3413     aExp = extractFloat64Exp( a );
3414     aSign = extractFloat64Sign( a );
3415     bSig = extractFloat64Frac( b );
3416     bExp = extractFloat64Exp( b );
3417     if ( aExp == 0x7FF ) {
3418         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3419             return propagateFloat64NaN( a, b STATUS_VAR );
3420         }
3421         float_raise( float_flag_invalid STATUS_VAR);
3422         return float64_default_nan;
3423     }
3424     if ( bExp == 0x7FF ) {
3425         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3426         return a;
3427     }
3428     if ( bExp == 0 ) {
3429         if ( bSig == 0 ) {
3430             float_raise( float_flag_invalid STATUS_VAR);
3431             return float64_default_nan;
3432         }
3433         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3434     }
3435     if ( aExp == 0 ) {
3436         if ( aSig == 0 ) return a;
3437         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3438     }
3439     expDiff = aExp - bExp;
3440     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3441     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3442     if ( expDiff < 0 ) {
3443         if ( expDiff < -1 ) return a;
3444         aSig >>= 1;
3445     }
3446     q = ( bSig <= aSig );
3447     if ( q ) aSig -= bSig;
3448     expDiff -= 64;
3449     while ( 0 < expDiff ) {
3450         q = estimateDiv128To64( aSig, 0, bSig );
3451         q = ( 2 < q ) ? q - 2 : 0;
3452         aSig = - ( ( bSig>>2 ) * q );
3453         expDiff -= 62;
3454     }
3455     expDiff += 64;
3456     if ( 0 < expDiff ) {
3457         q = estimateDiv128To64( aSig, 0, bSig );
3458         q = ( 2 < q ) ? q - 2 : 0;
3459         q >>= 64 - expDiff;
3460         bSig >>= 2;
3461         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3462     }
3463     else {
3464         aSig >>= 2;
3465         bSig >>= 2;
3466     }
3467     do {
3468         alternateASig = aSig;
3469         ++q;
3470         aSig -= bSig;
3471     } while ( 0 <= (int64_t) aSig );
3472     sigMean = aSig + alternateASig;
3473     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3474         aSig = alternateASig;
3475     }
3476     zSign = ( (int64_t) aSig < 0 );
3477     if ( zSign ) aSig = - aSig;
3478     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3479
3480 }
3481
3482 /*----------------------------------------------------------------------------
3483 | Returns the square root of the double-precision floating-point value `a'.
3484 | The operation is performed according to the IEC/IEEE Standard for Binary
3485 | Floating-Point Arithmetic.
3486 *----------------------------------------------------------------------------*/
3487
3488 float64 float64_sqrt( float64 a STATUS_PARAM )
3489 {
3490     flag aSign;
3491     int16 aExp, zExp;
3492     uint64_t aSig, zSig, doubleZSig;
3493     uint64_t rem0, rem1, term0, term1;
3494     a = float64_squash_input_denormal(a STATUS_VAR);
3495
3496     aSig = extractFloat64Frac( a );
3497     aExp = extractFloat64Exp( a );
3498     aSign = extractFloat64Sign( a );
3499     if ( aExp == 0x7FF ) {
3500         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3501         if ( ! aSign ) return a;
3502         float_raise( float_flag_invalid STATUS_VAR);
3503         return float64_default_nan;
3504     }
3505     if ( aSign ) {
3506         if ( ( aExp | aSig ) == 0 ) return a;
3507         float_raise( float_flag_invalid STATUS_VAR);
3508         return float64_default_nan;
3509     }
3510     if ( aExp == 0 ) {
3511         if ( aSig == 0 ) return float64_zero;
3512         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3513     }
3514     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3515     aSig |= LIT64( 0x0010000000000000 );
3516     zSig = estimateSqrt32( aExp, aSig>>21 );
3517     aSig <<= 9 - ( aExp & 1 );
3518     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3519     if ( ( zSig & 0x1FF ) <= 5 ) {
3520         doubleZSig = zSig<<1;
3521         mul64To128( zSig, zSig, &term0, &term1 );
3522         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3523         while ( (int64_t) rem0 < 0 ) {
3524             --zSig;
3525             doubleZSig -= 2;
3526             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3527         }
3528         zSig |= ( ( rem0 | rem1 ) != 0 );
3529     }
3530     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3531
3532 }
3533
3534 /*----------------------------------------------------------------------------
3535 | Returns the binary log of the double-precision floating-point value `a'.
3536 | The operation is performed according to the IEC/IEEE Standard for Binary
3537 | Floating-Point Arithmetic.
3538 *----------------------------------------------------------------------------*/
3539 float64 float64_log2( float64 a STATUS_PARAM )
3540 {
3541     flag aSign, zSign;
3542     int16 aExp;
3543     uint64_t aSig, aSig0, aSig1, zSig, i;
3544     a = float64_squash_input_denormal(a STATUS_VAR);
3545
3546     aSig = extractFloat64Frac( a );
3547     aExp = extractFloat64Exp( a );
3548     aSign = extractFloat64Sign( a );
3549
3550     if ( aExp == 0 ) {
3551         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3552         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3553     }
3554     if ( aSign ) {
3555         float_raise( float_flag_invalid STATUS_VAR);
3556         return float64_default_nan;
3557     }
3558     if ( aExp == 0x7FF ) {
3559         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3560         return a;
3561     }
3562
3563     aExp -= 0x3FF;
3564     aSig |= LIT64( 0x0010000000000000 );
3565     zSign = aExp < 0;
3566     zSig = (uint64_t)aExp << 52;
3567     for (i = 1LL << 51; i > 0; i >>= 1) {
3568         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3569         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3570         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3571             aSig >>= 1;
3572             zSig |= i;
3573         }
3574     }
3575
3576     if ( zSign )
3577         zSig = -zSig;
3578     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3579 }
3580
3581 /*----------------------------------------------------------------------------
3582 | Returns 1 if the double-precision floating-point value `a' is equal to the
3583 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3584 | if either operand is a NaN.  Otherwise, the comparison is performed
3585 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3586 *----------------------------------------------------------------------------*/
3587
3588 int float64_eq( float64 a, float64 b STATUS_PARAM )
3589 {
3590     uint64_t av, bv;
3591     a = float64_squash_input_denormal(a STATUS_VAR);
3592     b = float64_squash_input_denormal(b STATUS_VAR);
3593
3594     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3595          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3596        ) {
3597         float_raise( float_flag_invalid STATUS_VAR);
3598         return 0;
3599     }
3600     av = float64_val(a);
3601     bv = float64_val(b);
3602     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3603
3604 }
3605
3606 /*----------------------------------------------------------------------------
3607 | Returns 1 if the double-precision floating-point value `a' is less than or
3608 | equal to the corresponding value `b', and 0 otherwise.  The invalid
3609 | exception is raised if either operand is a NaN.  The comparison is performed
3610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3611 *----------------------------------------------------------------------------*/
3612
3613 int float64_le( float64 a, float64 b STATUS_PARAM )
3614 {
3615     flag aSign, bSign;
3616     uint64_t av, bv;
3617     a = float64_squash_input_denormal(a STATUS_VAR);
3618     b = float64_squash_input_denormal(b STATUS_VAR);
3619
3620     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3621          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3622        ) {
3623         float_raise( float_flag_invalid STATUS_VAR);
3624         return 0;
3625     }
3626     aSign = extractFloat64Sign( a );
3627     bSign = extractFloat64Sign( b );
3628     av = float64_val(a);
3629     bv = float64_val(b);
3630     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3631     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3632
3633 }
3634
3635 /*----------------------------------------------------------------------------
3636 | Returns 1 if the double-precision floating-point value `a' is less than
3637 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3638 | raised if either operand is a NaN.  The comparison is performed according
3639 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3640 *----------------------------------------------------------------------------*/
3641
3642 int float64_lt( float64 a, float64 b STATUS_PARAM )
3643 {
3644     flag aSign, bSign;
3645     uint64_t av, bv;
3646
3647     a = float64_squash_input_denormal(a STATUS_VAR);
3648     b = float64_squash_input_denormal(b STATUS_VAR);
3649     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3650          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3651        ) {
3652         float_raise( float_flag_invalid STATUS_VAR);
3653         return 0;
3654     }
3655     aSign = extractFloat64Sign( a );
3656     bSign = extractFloat64Sign( b );
3657     av = float64_val(a);
3658     bv = float64_val(b);
3659     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3660     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3661
3662 }
3663
3664 /*----------------------------------------------------------------------------
3665 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3666 | be compared, and 0 otherwise.  The invalid exception is raised if either
3667 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3668 | Standard for Binary Floating-Point Arithmetic.
3669 *----------------------------------------------------------------------------*/
3670
3671 int float64_unordered( float64 a, float64 b STATUS_PARAM )
3672 {
3673     a = float64_squash_input_denormal(a STATUS_VAR);
3674     b = float64_squash_input_denormal(b STATUS_VAR);
3675
3676     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3677          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3678        ) {
3679         float_raise( float_flag_invalid STATUS_VAR);
3680         return 1;
3681     }
3682     return 0;
3683 }
3684
3685 /*----------------------------------------------------------------------------
3686 | Returns 1 if the double-precision floating-point value `a' is equal to the
3687 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3688 | exception.The comparison is performed according to the IEC/IEEE Standard
3689 | for Binary Floating-Point Arithmetic.
3690 *----------------------------------------------------------------------------*/
3691
3692 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
3693 {
3694     uint64_t av, bv;
3695     a = float64_squash_input_denormal(a STATUS_VAR);
3696     b = float64_squash_input_denormal(b STATUS_VAR);
3697
3698     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3699          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3700        ) {
3701         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3702             float_raise( float_flag_invalid STATUS_VAR);
3703         }
3704         return 0;
3705     }
3706     av = float64_val(a);
3707     bv = float64_val(b);
3708     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3709
3710 }
3711
3712 /*----------------------------------------------------------------------------
3713 | Returns 1 if the double-precision floating-point value `a' is less than or
3714 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3715 | cause an exception.  Otherwise, the comparison is performed according to the
3716 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3717 *----------------------------------------------------------------------------*/
3718
3719 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3720 {
3721     flag aSign, bSign;
3722     uint64_t av, bv;
3723     a = float64_squash_input_denormal(a STATUS_VAR);
3724     b = float64_squash_input_denormal(b STATUS_VAR);
3725
3726     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3727          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3728        ) {
3729         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3730             float_raise( float_flag_invalid STATUS_VAR);
3731         }
3732         return 0;
3733     }
3734     aSign = extractFloat64Sign( a );
3735     bSign = extractFloat64Sign( b );
3736     av = float64_val(a);
3737     bv = float64_val(b);
3738     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3739     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3740
3741 }
3742
3743 /*----------------------------------------------------------------------------
3744 | Returns 1 if the double-precision floating-point value `a' is less than
3745 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3746 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3747 | Standard for Binary Floating-Point Arithmetic.
3748 *----------------------------------------------------------------------------*/
3749
3750 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3751 {
3752     flag aSign, bSign;
3753     uint64_t av, bv;
3754     a = float64_squash_input_denormal(a STATUS_VAR);
3755     b = float64_squash_input_denormal(b STATUS_VAR);
3756
3757     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3758          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3759        ) {
3760         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3761             float_raise( float_flag_invalid STATUS_VAR);
3762         }
3763         return 0;
3764     }
3765     aSign = extractFloat64Sign( a );
3766     bSign = extractFloat64Sign( b );
3767     av = float64_val(a);
3768     bv = float64_val(b);
3769     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3770     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3771
3772 }
3773
3774 /*----------------------------------------------------------------------------
3775 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3776 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3777 | comparison is performed according to the IEC/IEEE Standard for Binary
3778 | Floating-Point Arithmetic.
3779 *----------------------------------------------------------------------------*/
3780
3781 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3782 {
3783     a = float64_squash_input_denormal(a STATUS_VAR);
3784     b = float64_squash_input_denormal(b STATUS_VAR);
3785
3786     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3787          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3788        ) {
3789         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3790             float_raise( float_flag_invalid STATUS_VAR);
3791         }
3792         return 1;
3793     }
3794     return 0;
3795 }
3796
3797 #ifdef FLOATX80
3798
3799 /*----------------------------------------------------------------------------
3800 | Returns the result of converting the extended double-precision floating-
3801 | point value `a' to the 32-bit two's complement integer format.  The
3802 | conversion is performed according to the IEC/IEEE Standard for Binary
3803 | Floating-Point Arithmetic---which means in particular that the conversion
3804 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3805 | largest positive integer is returned.  Otherwise, if the conversion
3806 | overflows, the largest integer with the same sign as `a' is returned.
3807 *----------------------------------------------------------------------------*/
3808
3809 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3810 {
3811     flag aSign;
3812     int32 aExp, shiftCount;
3813     uint64_t aSig;
3814
3815     aSig = extractFloatx80Frac( a );
3816     aExp = extractFloatx80Exp( a );
3817     aSign = extractFloatx80Sign( a );
3818     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3819     shiftCount = 0x4037 - aExp;
3820     if ( shiftCount <= 0 ) shiftCount = 1;
3821     shift64RightJamming( aSig, shiftCount, &aSig );
3822     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3823
3824 }
3825
3826 /*----------------------------------------------------------------------------
3827 | Returns the result of converting the extended double-precision floating-
3828 | point value `a' to the 32-bit two's complement integer format.  The
3829 | conversion is performed according to the IEC/IEEE Standard for Binary
3830 | Floating-Point Arithmetic, except that the conversion is always rounded
3831 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3832 | Otherwise, if the conversion overflows, the largest integer with the same
3833 | sign as `a' is returned.
3834 *----------------------------------------------------------------------------*/
3835
3836 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3837 {
3838     flag aSign;
3839     int32 aExp, shiftCount;
3840     uint64_t aSig, savedASig;
3841     int32 z;
3842
3843     aSig = extractFloatx80Frac( a );
3844     aExp = extractFloatx80Exp( a );
3845     aSign = extractFloatx80Sign( a );
3846     if ( 0x401E < aExp ) {
3847         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3848         goto invalid;
3849     }
3850     else if ( aExp < 0x3FFF ) {
3851         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3852         return 0;
3853     }
3854     shiftCount = 0x403E - aExp;
3855     savedASig = aSig;
3856     aSig >>= shiftCount;
3857     z = aSig;
3858     if ( aSign ) z = - z;
3859     if ( ( z < 0 ) ^ aSign ) {
3860  invalid:
3861         float_raise( float_flag_invalid STATUS_VAR);
3862         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3863     }
3864     if ( ( aSig<<shiftCount ) != savedASig ) {
3865         STATUS(float_exception_flags) |= float_flag_inexact;
3866     }
3867     return z;
3868
3869 }
3870
3871 /*----------------------------------------------------------------------------
3872 | Returns the result of converting the extended double-precision floating-
3873 | point value `a' to the 64-bit two's complement integer format.  The
3874 | conversion is performed according to the IEC/IEEE Standard for Binary
3875 | Floating-Point Arithmetic---which means in particular that the conversion
3876 | is rounded according to the current rounding mode.  If `a' is a NaN,
3877 | the largest positive integer is returned.  Otherwise, if the conversion
3878 | overflows, the largest integer with the same sign as `a' is returned.
3879 *----------------------------------------------------------------------------*/
3880
3881 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3882 {
3883     flag aSign;
3884     int32 aExp, shiftCount;
3885     uint64_t aSig, aSigExtra;
3886
3887     aSig = extractFloatx80Frac( a );
3888     aExp = extractFloatx80Exp( a );
3889     aSign = extractFloatx80Sign( a );
3890     shiftCount = 0x403E - aExp;
3891     if ( shiftCount <= 0 ) {
3892         if ( shiftCount ) {
3893             float_raise( float_flag_invalid STATUS_VAR);
3894             if (    ! aSign
3895                  || (    ( aExp == 0x7FFF )
3896                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3897                ) {
3898                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3899             }
3900             return (int64_t) LIT64( 0x8000000000000000 );
3901         }
3902         aSigExtra = 0;
3903     }
3904     else {
3905         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3906     }
3907     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3908
3909 }
3910
3911 /*----------------------------------------------------------------------------
3912 | Returns the result of converting the extended double-precision floating-
3913 | point value `a' to the 64-bit two's complement integer format.  The
3914 | conversion is performed according to the IEC/IEEE Standard for Binary
3915 | Floating-Point Arithmetic, except that the conversion is always rounded
3916 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3917 | Otherwise, if the conversion overflows, the largest integer with the same
3918 | sign as `a' is returned.
3919 *----------------------------------------------------------------------------*/
3920
3921 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3922 {
3923     flag aSign;
3924     int32 aExp, shiftCount;
3925     uint64_t aSig;
3926     int64 z;
3927
3928     aSig = extractFloatx80Frac( a );
3929     aExp = extractFloatx80Exp( a );
3930     aSign = extractFloatx80Sign( a );
3931     shiftCount = aExp - 0x403E;
3932     if ( 0 <= shiftCount ) {
3933         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3934         if ( ( a.high != 0xC03E ) || aSig ) {
3935             float_raise( float_flag_invalid STATUS_VAR);
3936             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3937                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3938             }
3939         }
3940         return (int64_t) LIT64( 0x8000000000000000 );
3941     }
3942     else if ( aExp < 0x3FFF ) {
3943         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3944         return 0;
3945     }
3946     z = aSig>>( - shiftCount );
3947     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3948         STATUS(float_exception_flags) |= float_flag_inexact;
3949     }
3950     if ( aSign ) z = - z;
3951     return z;
3952
3953 }
3954
3955 /*----------------------------------------------------------------------------
3956 | Returns the result of converting the extended double-precision floating-
3957 | point value `a' to the single-precision floating-point format.  The
3958 | conversion is performed according to the IEC/IEEE Standard for Binary
3959 | Floating-Point Arithmetic.
3960 *----------------------------------------------------------------------------*/
3961
3962 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3963 {
3964     flag aSign;
3965     int32 aExp;
3966     uint64_t aSig;
3967
3968     aSig = extractFloatx80Frac( a );
3969     aExp = extractFloatx80Exp( a );
3970     aSign = extractFloatx80Sign( a );
3971     if ( aExp == 0x7FFF ) {
3972         if ( (uint64_t) ( aSig<<1 ) ) {
3973             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3974         }
3975         return packFloat32( aSign, 0xFF, 0 );
3976     }
3977     shift64RightJamming( aSig, 33, &aSig );
3978     if ( aExp || aSig ) aExp -= 0x3F81;
3979     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3980
3981 }
3982
3983 /*----------------------------------------------------------------------------
3984 | Returns the result of converting the extended double-precision floating-
3985 | point value `a' to the double-precision floating-point format.  The
3986 | conversion is performed according to the IEC/IEEE Standard for Binary
3987 | Floating-Point Arithmetic.
3988 *----------------------------------------------------------------------------*/
3989
3990 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3991 {
3992     flag aSign;
3993     int32 aExp;
3994     uint64_t aSig, zSig;
3995
3996     aSig = extractFloatx80Frac( a );
3997     aExp = extractFloatx80Exp( a );
3998     aSign = extractFloatx80Sign( a );
3999     if ( aExp == 0x7FFF ) {
4000         if ( (uint64_t) ( aSig<<1 ) ) {
4001             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4002         }
4003         return packFloat64( aSign, 0x7FF, 0 );
4004     }
4005     shift64RightJamming( aSig, 1, &zSig );
4006     if ( aExp || aSig ) aExp -= 0x3C01;
4007     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4008
4009 }
4010
4011 #ifdef FLOAT128
4012
4013 /*----------------------------------------------------------------------------
4014 | Returns the result of converting the extended double-precision floating-
4015 | point value `a' to the quadruple-precision floating-point format.  The
4016 | conversion is performed according to the IEC/IEEE Standard for Binary
4017 | Floating-Point Arithmetic.
4018 *----------------------------------------------------------------------------*/
4019
4020 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4021 {
4022     flag aSign;
4023     int16 aExp;
4024     uint64_t aSig, zSig0, zSig1;
4025
4026     aSig = extractFloatx80Frac( a );
4027     aExp = extractFloatx80Exp( a );
4028     aSign = extractFloatx80Sign( a );
4029     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4030         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4031     }
4032     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4033     return packFloat128( aSign, aExp, zSig0, zSig1 );
4034
4035 }
4036
4037 #endif
4038
4039 /*----------------------------------------------------------------------------
4040 | Rounds the extended double-precision floating-point value `a' to an integer,
4041 | and returns the result as an extended quadruple-precision floating-point
4042 | value.  The operation is performed according to the IEC/IEEE Standard for
4043 | Binary Floating-Point Arithmetic.
4044 *----------------------------------------------------------------------------*/
4045
4046 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4047 {
4048     flag aSign;
4049     int32 aExp;
4050     uint64_t lastBitMask, roundBitsMask;
4051     int8 roundingMode;
4052     floatx80 z;
4053
4054     aExp = extractFloatx80Exp( a );
4055     if ( 0x403E <= aExp ) {
4056         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4057             return propagateFloatx80NaN( a, a STATUS_VAR );
4058         }
4059         return a;
4060     }
4061     if ( aExp < 0x3FFF ) {
4062         if (    ( aExp == 0 )
4063              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4064             return a;
4065         }
4066         STATUS(float_exception_flags) |= float_flag_inexact;
4067         aSign = extractFloatx80Sign( a );
4068         switch ( STATUS(float_rounding_mode) ) {
4069          case float_round_nearest_even:
4070             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4071                ) {
4072                 return
4073                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4074             }
4075             break;
4076          case float_round_down:
4077             return
4078                   aSign ?
4079                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4080                 : packFloatx80( 0, 0, 0 );
4081          case float_round_up:
4082             return
4083                   aSign ? packFloatx80( 1, 0, 0 )
4084                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4085         }
4086         return packFloatx80( aSign, 0, 0 );
4087     }
4088     lastBitMask = 1;
4089     lastBitMask <<= 0x403E - aExp;
4090     roundBitsMask = lastBitMask - 1;
4091     z = a;
4092     roundingMode = STATUS(float_rounding_mode);
4093     if ( roundingMode == float_round_nearest_even ) {
4094         z.low += lastBitMask>>1;
4095         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4096     }
4097     else if ( roundingMode != float_round_to_zero ) {
4098         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4099             z.low += roundBitsMask;
4100         }
4101     }
4102     z.low &= ~ roundBitsMask;
4103     if ( z.low == 0 ) {
4104         ++z.high;
4105         z.low = LIT64( 0x8000000000000000 );
4106     }
4107     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4108     return z;
4109
4110 }
4111
4112 /*----------------------------------------------------------------------------
4113 | Returns the result of adding the absolute values of the extended double-
4114 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4115 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4116 | The addition is performed according to the IEC/IEEE Standard for Binary
4117 | Floating-Point Arithmetic.
4118 *----------------------------------------------------------------------------*/
4119
4120 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4121 {
4122     int32 aExp, bExp, zExp;
4123     uint64_t aSig, bSig, zSig0, zSig1;
4124     int32 expDiff;
4125
4126     aSig = extractFloatx80Frac( a );
4127     aExp = extractFloatx80Exp( a );
4128     bSig = extractFloatx80Frac( b );
4129     bExp = extractFloatx80Exp( b );
4130     expDiff = aExp - bExp;
4131     if ( 0 < expDiff ) {
4132         if ( aExp == 0x7FFF ) {
4133             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4134             return a;
4135         }
4136         if ( bExp == 0 ) --expDiff;
4137         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4138         zExp = aExp;
4139     }
4140     else if ( expDiff < 0 ) {
4141         if ( bExp == 0x7FFF ) {
4142             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4143             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4144         }
4145         if ( aExp == 0 ) ++expDiff;
4146         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4147         zExp = bExp;
4148     }
4149     else {
4150         if ( aExp == 0x7FFF ) {
4151             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4152                 return propagateFloatx80NaN( a, b STATUS_VAR );
4153             }
4154             return a;
4155         }
4156         zSig1 = 0;
4157         zSig0 = aSig + bSig;
4158         if ( aExp == 0 ) {
4159             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4160             goto roundAndPack;
4161         }
4162         zExp = aExp;
4163         goto shiftRight1;
4164     }
4165     zSig0 = aSig + bSig;
4166     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4167  shiftRight1:
4168     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4169     zSig0 |= LIT64( 0x8000000000000000 );
4170     ++zExp;
4171  roundAndPack:
4172     return
4173         roundAndPackFloatx80(
4174             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4175
4176 }
4177
4178 /*----------------------------------------------------------------------------
4179 | Returns the result of subtracting the absolute values of the extended
4180 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4181 | difference is negated before being returned.  `zSign' is ignored if the
4182 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4183 | Standard for Binary Floating-Point Arithmetic.
4184 *----------------------------------------------------------------------------*/
4185
4186 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4187 {
4188     int32 aExp, bExp, zExp;
4189     uint64_t aSig, bSig, zSig0, zSig1;
4190     int32 expDiff;
4191     floatx80 z;
4192
4193     aSig = extractFloatx80Frac( a );
4194     aExp = extractFloatx80Exp( a );
4195     bSig = extractFloatx80Frac( b );
4196     bExp = extractFloatx80Exp( b );
4197     expDiff = aExp - bExp;
4198     if ( 0 < expDiff ) goto aExpBigger;
4199     if ( expDiff < 0 ) goto bExpBigger;
4200     if ( aExp == 0x7FFF ) {
4201         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4202             return propagateFloatx80NaN( a, b STATUS_VAR );
4203         }
4204         float_raise( float_flag_invalid STATUS_VAR);
4205         z.low = floatx80_default_nan_low;
4206         z.high = floatx80_default_nan_high;
4207         return z;
4208     }
4209     if ( aExp == 0 ) {
4210         aExp = 1;
4211         bExp = 1;
4212     }
4213     zSig1 = 0;
4214     if ( bSig < aSig ) goto aBigger;
4215     if ( aSig < bSig ) goto bBigger;
4216     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4217  bExpBigger:
4218     if ( bExp == 0x7FFF ) {
4219         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4220         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4221     }
4222     if ( aExp == 0 ) ++expDiff;
4223     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4224  bBigger:
4225     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4226     zExp = bExp;
4227     zSign ^= 1;
4228     goto normalizeRoundAndPack;
4229  aExpBigger:
4230     if ( aExp == 0x7FFF ) {
4231         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4232         return a;
4233     }
4234     if ( bExp == 0 ) --expDiff;
4235     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4236  aBigger:
4237     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4238     zExp = aExp;
4239  normalizeRoundAndPack:
4240     return
4241         normalizeRoundAndPackFloatx80(
4242             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4243
4244 }
4245
4246 /*----------------------------------------------------------------------------
4247 | Returns the result of adding the extended double-precision floating-point
4248 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4249 | Standard for Binary Floating-Point Arithmetic.
4250 *----------------------------------------------------------------------------*/
4251
4252 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4253 {
4254     flag aSign, bSign;
4255
4256     aSign = extractFloatx80Sign( a );
4257     bSign = extractFloatx80Sign( b );
4258     if ( aSign == bSign ) {
4259         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4260     }
4261     else {
4262         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4263     }
4264
4265 }
4266
4267 /*----------------------------------------------------------------------------
4268 | Returns the result of subtracting the extended double-precision floating-
4269 | point values `a' and `b'.  The operation is performed according to the
4270 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4271 *----------------------------------------------------------------------------*/
4272
4273 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4274 {
4275     flag aSign, bSign;
4276
4277     aSign = extractFloatx80Sign( a );
4278     bSign = extractFloatx80Sign( b );
4279     if ( aSign == bSign ) {
4280         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4281     }
4282     else {
4283         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4284     }
4285
4286 }
4287
4288 /*----------------------------------------------------------------------------
4289 | Returns the result of multiplying the extended double-precision floating-
4290 | point values `a' and `b'.  The operation is performed according to the
4291 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4292 *----------------------------------------------------------------------------*/
4293
4294 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4295 {
4296     flag aSign, bSign, zSign;
4297     int32 aExp, bExp, zExp;
4298     uint64_t aSig, bSig, zSig0, zSig1;
4299     floatx80 z;
4300
4301     aSig = extractFloatx80Frac( a );
4302     aExp = extractFloatx80Exp( a );
4303     aSign = extractFloatx80Sign( a );
4304     bSig = extractFloatx80Frac( b );
4305     bExp = extractFloatx80Exp( b );
4306     bSign = extractFloatx80Sign( b );
4307     zSign = aSign ^ bSign;
4308     if ( aExp == 0x7FFF ) {
4309         if (    (uint64_t) ( aSig<<1 )
4310              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4311             return propagateFloatx80NaN( a, b STATUS_VAR );
4312         }
4313         if ( ( bExp | bSig ) == 0 ) goto invalid;
4314         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4315     }
4316     if ( bExp == 0x7FFF ) {
4317         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4318         if ( ( aExp | aSig ) == 0 ) {
4319  invalid:
4320             float_raise( float_flag_invalid STATUS_VAR);
4321             z.low = floatx80_default_nan_low;
4322             z.high = floatx80_default_nan_high;
4323             return z;
4324         }
4325         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4326     }
4327     if ( aExp == 0 ) {
4328         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4329         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4330     }
4331     if ( bExp == 0 ) {
4332         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4333         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4334     }
4335     zExp = aExp + bExp - 0x3FFE;
4336     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4337     if ( 0 < (int64_t) zSig0 ) {
4338         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4339         --zExp;
4340     }
4341     return
4342         roundAndPackFloatx80(
4343             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4344
4345 }
4346
4347 /*----------------------------------------------------------------------------
4348 | Returns the result of dividing the extended double-precision floating-point
4349 | value `a' by the corresponding value `b'.  The operation is performed
4350 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4351 *----------------------------------------------------------------------------*/
4352
4353 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4354 {
4355     flag aSign, bSign, zSign;
4356     int32 aExp, bExp, zExp;
4357     uint64_t aSig, bSig, zSig0, zSig1;
4358     uint64_t rem0, rem1, rem2, term0, term1, term2;
4359     floatx80 z;
4360
4361     aSig = extractFloatx80Frac( a );
4362     aExp = extractFloatx80Exp( a );
4363     aSign = extractFloatx80Sign( a );
4364     bSig = extractFloatx80Frac( b );
4365     bExp = extractFloatx80Exp( b );
4366     bSign = extractFloatx80Sign( b );
4367     zSign = aSign ^ bSign;
4368     if ( aExp == 0x7FFF ) {
4369         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4370         if ( bExp == 0x7FFF ) {
4371             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4372             goto invalid;
4373         }
4374         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4375     }
4376     if ( bExp == 0x7FFF ) {
4377         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4378         return packFloatx80( zSign, 0, 0 );
4379     }
4380     if ( bExp == 0 ) {
4381         if ( bSig == 0 ) {
4382             if ( ( aExp | aSig ) == 0 ) {
4383  invalid:
4384                 float_raise( float_flag_invalid STATUS_VAR);
4385                 z.low = floatx80_default_nan_low;
4386                 z.high = floatx80_default_nan_high;
4387                 return z;
4388             }
4389             float_raise( float_flag_divbyzero STATUS_VAR);
4390             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4391         }
4392         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4393     }
4394     if ( aExp == 0 ) {
4395         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4396         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4397     }
4398     zExp = aExp - bExp + 0x3FFE;
4399     rem1 = 0;
4400     if ( bSig <= aSig ) {
4401         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4402         ++zExp;
4403     }
4404     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4405     mul64To128( bSig, zSig0, &term0, &term1 );
4406     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4407     while ( (int64_t) rem0 < 0 ) {
4408         --zSig0;
4409         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4410     }
4411     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4412     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4413         mul64To128( bSig, zSig1, &term1, &term2 );
4414         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4415         while ( (int64_t) rem1 < 0 ) {
4416             --zSig1;
4417             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4418         }
4419         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4420     }
4421     return
4422         roundAndPackFloatx80(
4423             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4424
4425 }
4426
4427 /*----------------------------------------------------------------------------
4428 | Returns the remainder of the extended double-precision floating-point value
4429 | `a' with respect to the corresponding value `b'.  The operation is performed
4430 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4431 *----------------------------------------------------------------------------*/
4432
4433 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4434 {
4435     flag aSign, zSign;
4436     int32 aExp, bExp, expDiff;
4437     uint64_t aSig0, aSig1, bSig;
4438     uint64_t q, term0, term1, alternateASig0, alternateASig1;
4439     floatx80 z;
4440
4441     aSig0 = extractFloatx80Frac( a );
4442     aExp = extractFloatx80Exp( a );
4443     aSign = extractFloatx80Sign( a );
4444     bSig = extractFloatx80Frac( b );
4445     bExp = extractFloatx80Exp( b );
4446     if ( aExp == 0x7FFF ) {
4447         if (    (uint64_t) ( aSig0<<1 )
4448              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4449             return propagateFloatx80NaN( a, b STATUS_VAR );
4450         }
4451         goto invalid;
4452     }
4453     if ( bExp == 0x7FFF ) {
4454         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4455         return a;
4456     }
4457     if ( bExp == 0 ) {
4458         if ( bSig == 0 ) {
4459  invalid:
4460             float_raise( float_flag_invalid STATUS_VAR);
4461             z.low = floatx80_default_nan_low;
4462             z.high = floatx80_default_nan_high;
4463             return z;
4464         }
4465         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4466     }
4467     if ( aExp == 0 ) {
4468         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4469         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4470     }
4471     bSig |= LIT64( 0x8000000000000000 );
4472     zSign = aSign;
4473     expDiff = aExp - bExp;
4474     aSig1 = 0;
4475     if ( expDiff < 0 ) {
4476         if ( expDiff < -1 ) return a;
4477         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4478         expDiff = 0;
4479     }
4480     q = ( bSig <= aSig0 );
4481     if ( q ) aSig0 -= bSig;
4482     expDiff -= 64;
4483     while ( 0 < expDiff ) {
4484         q = estimateDiv128To64( aSig0, aSig1, bSig );
4485         q = ( 2 < q ) ? q - 2 : 0;
4486         mul64To128( bSig, q, &term0, &term1 );
4487         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4488         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4489         expDiff -= 62;
4490     }
4491     expDiff += 64;
4492     if ( 0 < expDiff ) {
4493         q = estimateDiv128To64( aSig0, aSig1, bSig );
4494         q = ( 2 < q ) ? q - 2 : 0;
4495         q >>= 64 - expDiff;
4496         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4497         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4498         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4499         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4500             ++q;
4501             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4502         }
4503     }
4504     else {
4505         term1 = 0;
4506         term0 = bSig;
4507     }
4508     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4509     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4510          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4511               && ( q & 1 ) )
4512        ) {
4513         aSig0 = alternateASig0;
4514         aSig1 = alternateASig1;
4515         zSign = ! zSign;
4516     }
4517     return
4518         normalizeRoundAndPackFloatx80(
4519             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4520
4521 }
4522
4523 /*----------------------------------------------------------------------------
4524 | Returns the square root of the extended double-precision floating-point
4525 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4526 | for Binary Floating-Point Arithmetic.
4527 *----------------------------------------------------------------------------*/
4528
4529 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4530 {
4531     flag aSign;
4532     int32 aExp, zExp;
4533     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4534     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4535     floatx80 z;
4536
4537     aSig0 = extractFloatx80Frac( a );
4538     aExp = extractFloatx80Exp( a );
4539     aSign = extractFloatx80Sign( a );
4540     if ( aExp == 0x7FFF ) {
4541         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4542         if ( ! aSign ) return a;
4543         goto invalid;
4544     }
4545     if ( aSign ) {
4546         if ( ( aExp | aSig0 ) == 0 ) return a;
4547  invalid:
4548         float_raise( float_flag_invalid STATUS_VAR);
4549         z.low = floatx80_default_nan_low;
4550         z.high = floatx80_default_nan_high;
4551         return z;
4552     }
4553     if ( aExp == 0 ) {
4554         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4555         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4556     }
4557     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4558     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4559     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4560     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4561     doubleZSig0 = zSig0<<1;
4562     mul64To128( zSig0, zSig0, &term0, &term1 );
4563     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4564     while ( (int64_t) rem0 < 0 ) {
4565         --zSig0;
4566         doubleZSig0 -= 2;
4567         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4568     }
4569     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4570     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4571         if ( zSig1 == 0 ) zSig1 = 1;
4572         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4573         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4574         mul64To128( zSig1, zSig1, &term2, &term3 );
4575         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4576         while ( (int64_t) rem1 < 0 ) {
4577             --zSig1;
4578             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4579             term3 |= 1;
4580             term2 |= doubleZSig0;
4581             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4582         }
4583         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4584     }
4585     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4586     zSig0 |= doubleZSig0;
4587     return
4588         roundAndPackFloatx80(
4589             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4590
4591 }
4592
4593 /*----------------------------------------------------------------------------
4594 | Returns 1 if the extended double-precision floating-point value `a' is equal
4595 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4596 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4597 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4598 *----------------------------------------------------------------------------*/
4599
4600 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4601 {
4602
4603     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4604               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4605          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4606               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4607        ) {
4608         float_raise( float_flag_invalid STATUS_VAR);
4609         return 0;
4610     }
4611     return
4612            ( a.low == b.low )
4613         && (    ( a.high == b.high )
4614              || (    ( a.low == 0 )
4615                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4616            );
4617
4618 }
4619
4620 /*----------------------------------------------------------------------------
4621 | Returns 1 if the extended double-precision floating-point value `a' is
4622 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4623 | invalid exception is raised if either operand is a NaN.  The comparison is
4624 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4625 | Arithmetic.
4626 *----------------------------------------------------------------------------*/
4627
4628 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4629 {
4630     flag aSign, bSign;
4631
4632     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4633               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4634          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4635               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4636        ) {
4637         float_raise( float_flag_invalid STATUS_VAR);
4638         return 0;
4639     }
4640     aSign = extractFloatx80Sign( a );
4641     bSign = extractFloatx80Sign( b );
4642     if ( aSign != bSign ) {
4643         return
4644                aSign
4645             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4646                  == 0 );
4647     }
4648     return
4649           aSign ? le128( b.high, b.low, a.high, a.low )
4650         : le128( a.high, a.low, b.high, b.low );
4651
4652 }
4653
4654 /*----------------------------------------------------------------------------
4655 | Returns 1 if the extended double-precision floating-point value `a' is
4656 | less than the corresponding value `b', and 0 otherwise.  The invalid
4657 | exception is raised if either operand is a NaN.  The comparison is performed
4658 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4659 *----------------------------------------------------------------------------*/
4660
4661 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4662 {
4663     flag aSign, bSign;
4664
4665     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4666               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4667          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4668               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4669        ) {
4670         float_raise( float_flag_invalid STATUS_VAR);
4671         return 0;
4672     }
4673     aSign = extractFloatx80Sign( a );
4674     bSign = extractFloatx80Sign( b );
4675     if ( aSign != bSign ) {
4676         return
4677                aSign
4678             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4679                  != 0 );
4680     }
4681     return
4682           aSign ? lt128( b.high, b.low, a.high, a.low )
4683         : lt128( a.high, a.low, b.high, b.low );
4684
4685 }
4686
4687 /*----------------------------------------------------------------------------
4688 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4689 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
4690 | either operand is a NaN.   The comparison is performed according to the
4691 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4692 *----------------------------------------------------------------------------*/
4693 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4694 {
4695     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4696               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4697          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4698               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4699        ) {
4700         float_raise( float_flag_invalid STATUS_VAR);
4701         return 1;
4702     }
4703     return 0;
4704 }
4705
4706 /*----------------------------------------------------------------------------
4707 | Returns 1 if the extended double-precision floating-point value `a' is
4708 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4709 | cause an exception.  The comparison is performed according to the IEC/IEEE
4710 | Standard for Binary Floating-Point Arithmetic.
4711 *----------------------------------------------------------------------------*/
4712
4713 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4714 {
4715
4716     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4717               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4718          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4719               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4720        ) {
4721         if (    floatx80_is_signaling_nan( a )
4722              || floatx80_is_signaling_nan( b ) ) {
4723             float_raise( float_flag_invalid STATUS_VAR);
4724         }
4725         return 0;
4726     }
4727     return
4728            ( a.low == b.low )
4729         && (    ( a.high == b.high )
4730              || (    ( a.low == 0 )
4731                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4732            );
4733
4734 }
4735
4736 /*----------------------------------------------------------------------------
4737 | Returns 1 if the extended double-precision floating-point value `a' is less
4738 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4739 | do not cause an exception.  Otherwise, the comparison is performed according
4740 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4741 *----------------------------------------------------------------------------*/
4742
4743 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4744 {
4745     flag aSign, bSign;
4746
4747     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4748               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4749          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4750               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4751        ) {
4752         if (    floatx80_is_signaling_nan( a )
4753              || floatx80_is_signaling_nan( b ) ) {
4754             float_raise( float_flag_invalid STATUS_VAR);
4755         }
4756         return 0;
4757     }
4758     aSign = extractFloatx80Sign( a );
4759     bSign = extractFloatx80Sign( b );
4760     if ( aSign != bSign ) {
4761         return
4762                aSign
4763             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4764                  == 0 );
4765     }
4766     return
4767           aSign ? le128( b.high, b.low, a.high, a.low )
4768         : le128( a.high, a.low, b.high, b.low );
4769
4770 }
4771
4772 /*----------------------------------------------------------------------------
4773 | Returns 1 if the extended double-precision floating-point value `a' is less
4774 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4775 | an exception.  Otherwise, the comparison is performed according to the
4776 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4777 *----------------------------------------------------------------------------*/
4778
4779 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4780 {
4781     flag aSign, bSign;
4782
4783     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4784               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4785          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4786               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4787        ) {
4788         if (    floatx80_is_signaling_nan( a )
4789              || floatx80_is_signaling_nan( b ) ) {
4790             float_raise( float_flag_invalid STATUS_VAR);
4791         }
4792         return 0;
4793     }
4794     aSign = extractFloatx80Sign( a );
4795     bSign = extractFloatx80Sign( b );
4796     if ( aSign != bSign ) {
4797         return
4798                aSign
4799             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4800                  != 0 );
4801     }
4802     return
4803           aSign ? lt128( b.high, b.low, a.high, a.low )
4804         : lt128( a.high, a.low, b.high, b.low );
4805
4806 }
4807
4808 /*----------------------------------------------------------------------------
4809 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4810 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
4811 | The comparison is performed according to the IEC/IEEE Standard for Binary
4812 | Floating-Point Arithmetic.
4813 *----------------------------------------------------------------------------*/
4814 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4815 {
4816     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4817               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4818          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4819               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4820        ) {
4821         if (    floatx80_is_signaling_nan( a )
4822              || floatx80_is_signaling_nan( b ) ) {
4823             float_raise( float_flag_invalid STATUS_VAR);
4824         }
4825         return 1;
4826     }
4827     return 0;
4828 }
4829
4830 #endif
4831
4832 #ifdef FLOAT128
4833
4834 /*----------------------------------------------------------------------------
4835 | Returns the result of converting the quadruple-precision floating-point
4836 | value `a' to the 32-bit two's complement integer format.  The conversion
4837 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4838 | Arithmetic---which means in particular that the conversion is rounded
4839 | according to the current rounding mode.  If `a' is a NaN, the largest
4840 | positive integer is returned.  Otherwise, if the conversion overflows, the
4841 | largest integer with the same sign as `a' is returned.
4842 *----------------------------------------------------------------------------*/
4843
4844 int32 float128_to_int32( float128 a STATUS_PARAM )
4845 {
4846     flag aSign;
4847     int32 aExp, shiftCount;
4848     uint64_t aSig0, aSig1;
4849
4850     aSig1 = extractFloat128Frac1( a );
4851     aSig0 = extractFloat128Frac0( a );
4852     aExp = extractFloat128Exp( a );
4853     aSign = extractFloat128Sign( a );
4854     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4855     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4856     aSig0 |= ( aSig1 != 0 );
4857     shiftCount = 0x4028 - aExp;
4858     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4859     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4860
4861 }
4862
4863 /*----------------------------------------------------------------------------
4864 | Returns the result of converting the quadruple-precision floating-point
4865 | value `a' to the 32-bit two's complement integer format.  The conversion
4866 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4867 | Arithmetic, except that the conversion is always rounded toward zero.  If
4868 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4869 | conversion overflows, the largest integer with the same sign as `a' is
4870 | returned.
4871 *----------------------------------------------------------------------------*/
4872
4873 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4874 {
4875     flag aSign;
4876     int32 aExp, shiftCount;
4877     uint64_t aSig0, aSig1, savedASig;
4878     int32 z;
4879
4880     aSig1 = extractFloat128Frac1( a );
4881     aSig0 = extractFloat128Frac0( a );
4882     aExp = extractFloat128Exp( a );
4883     aSign = extractFloat128Sign( a );
4884     aSig0 |= ( aSig1 != 0 );
4885     if ( 0x401E < aExp ) {
4886         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4887         goto invalid;
4888     }
4889     else if ( aExp < 0x3FFF ) {
4890         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4891         return 0;
4892     }
4893     aSig0 |= LIT64( 0x0001000000000000 );
4894     shiftCount = 0x402F - aExp;
4895     savedASig = aSig0;
4896     aSig0 >>= shiftCount;
4897     z = aSig0;
4898     if ( aSign ) z = - z;
4899     if ( ( z < 0 ) ^ aSign ) {
4900  invalid:
4901         float_raise( float_flag_invalid STATUS_VAR);
4902         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4903     }
4904     if ( ( aSig0<<shiftCount ) != savedASig ) {
4905         STATUS(float_exception_flags) |= float_flag_inexact;
4906     }
4907     return z;
4908
4909 }
4910
4911 /*----------------------------------------------------------------------------
4912 | Returns the result of converting the quadruple-precision floating-point
4913 | value `a' to the 64-bit two's complement integer format.  The conversion
4914 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4915 | Arithmetic---which means in particular that the conversion is rounded
4916 | according to the current rounding mode.  If `a' is a NaN, the largest
4917 | positive integer is returned.  Otherwise, if the conversion overflows, the
4918 | largest integer with the same sign as `a' is returned.
4919 *----------------------------------------------------------------------------*/
4920
4921 int64 float128_to_int64( float128 a STATUS_PARAM )
4922 {
4923     flag aSign;
4924     int32 aExp, shiftCount;
4925     uint64_t aSig0, aSig1;
4926
4927     aSig1 = extractFloat128Frac1( a );
4928     aSig0 = extractFloat128Frac0( a );
4929     aExp = extractFloat128Exp( a );
4930     aSign = extractFloat128Sign( a );
4931     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4932     shiftCount = 0x402F - aExp;
4933     if ( shiftCount <= 0 ) {
4934         if ( 0x403E < aExp ) {
4935             float_raise( float_flag_invalid STATUS_VAR);
4936             if (    ! aSign
4937                  || (    ( aExp == 0x7FFF )
4938                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4939                     )
4940                ) {
4941                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4942             }
4943             return (int64_t) LIT64( 0x8000000000000000 );
4944         }
4945         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4946     }
4947     else {
4948         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4949     }
4950     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4951
4952 }
4953
4954 /*----------------------------------------------------------------------------
4955 | Returns the result of converting the quadruple-precision floating-point
4956 | value `a' to the 64-bit two's complement integer format.  The conversion
4957 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4958 | Arithmetic, except that the conversion is always rounded toward zero.
4959 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4960 | the conversion overflows, the largest integer with the same sign as `a' is
4961 | returned.
4962 *----------------------------------------------------------------------------*/
4963
4964 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4965 {
4966     flag aSign;
4967     int32 aExp, shiftCount;
4968     uint64_t aSig0, aSig1;
4969     int64 z;
4970
4971     aSig1 = extractFloat128Frac1( a );
4972     aSig0 = extractFloat128Frac0( a );
4973     aExp = extractFloat128Exp( a );
4974     aSign = extractFloat128Sign( a );
4975     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4976     shiftCount = aExp - 0x402F;
4977     if ( 0 < shiftCount ) {
4978         if ( 0x403E <= aExp ) {
4979             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4980             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4981                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4982                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4983             }
4984             else {
4985                 float_raise( float_flag_invalid STATUS_VAR);
4986                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4987                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4988                 }
4989             }
4990             return (int64_t) LIT64( 0x8000000000000000 );
4991         }
4992         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4993         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4994             STATUS(float_exception_flags) |= float_flag_inexact;
4995         }
4996     }
4997     else {
4998         if ( aExp < 0x3FFF ) {
4999             if ( aExp | aSig0 | aSig1 ) {
5000                 STATUS(float_exception_flags) |= float_flag_inexact;
5001             }
5002             return 0;
5003         }
5004         z = aSig0>>( - shiftCount );
5005         if (    aSig1
5006              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5007             STATUS(float_exception_flags) |= float_flag_inexact;
5008         }
5009     }
5010     if ( aSign ) z = - z;
5011     return z;
5012
5013 }
5014
5015 /*----------------------------------------------------------------------------
5016 | Returns the result of converting the quadruple-precision floating-point
5017 | value `a' to the single-precision floating-point format.  The conversion
5018 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5019 | Arithmetic.
5020 *----------------------------------------------------------------------------*/
5021
5022 float32 float128_to_float32( float128 a STATUS_PARAM )
5023 {
5024     flag aSign;
5025     int32 aExp;
5026     uint64_t aSig0, aSig1;
5027     uint32_t zSig;
5028
5029     aSig1 = extractFloat128Frac1( a );
5030     aSig0 = extractFloat128Frac0( a );
5031     aExp = extractFloat128Exp( a );
5032     aSign = extractFloat128Sign( a );
5033     if ( aExp == 0x7FFF ) {
5034         if ( aSig0 | aSig1 ) {
5035             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5036         }
5037         return packFloat32( aSign, 0xFF, 0 );
5038     }
5039     aSig0 |= ( aSig1 != 0 );
5040     shift64RightJamming( aSig0, 18, &aSig0 );
5041     zSig = aSig0;
5042     if ( aExp || zSig ) {
5043         zSig |= 0x40000000;
5044         aExp -= 0x3F81;
5045     }
5046     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5047
5048 }
5049
5050 /*----------------------------------------------------------------------------
5051 | Returns the result of converting the quadruple-precision floating-point
5052 | value `a' to the double-precision floating-point format.  The conversion
5053 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5054 | Arithmetic.
5055 *----------------------------------------------------------------------------*/
5056
5057 float64 float128_to_float64( float128 a STATUS_PARAM )
5058 {
5059     flag aSign;
5060     int32 aExp;
5061     uint64_t aSig0, aSig1;
5062
5063     aSig1 = extractFloat128Frac1( a );
5064     aSig0 = extractFloat128Frac0( a );
5065     aExp = extractFloat128Exp( a );
5066     aSign = extractFloat128Sign( a );
5067     if ( aExp == 0x7FFF ) {
5068         if ( aSig0 | aSig1 ) {
5069             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5070         }
5071         return packFloat64( aSign, 0x7FF, 0 );
5072     }
5073     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5074     aSig0 |= ( aSig1 != 0 );
5075     if ( aExp || aSig0 ) {
5076         aSig0 |= LIT64( 0x4000000000000000 );
5077         aExp -= 0x3C01;
5078     }
5079     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5080
5081 }
5082
5083 #ifdef FLOATX80
5084
5085 /*----------------------------------------------------------------------------
5086 | Returns the result of converting the quadruple-precision floating-point
5087 | value `a' to the extended double-precision floating-point format.  The
5088 | conversion is performed according to the IEC/IEEE Standard for Binary
5089 | Floating-Point Arithmetic.
5090 *----------------------------------------------------------------------------*/
5091
5092 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5093 {
5094     flag aSign;
5095     int32 aExp;
5096     uint64_t aSig0, aSig1;
5097
5098     aSig1 = extractFloat128Frac1( a );
5099     aSig0 = extractFloat128Frac0( a );
5100     aExp = extractFloat128Exp( a );
5101     aSign = extractFloat128Sign( a );
5102     if ( aExp == 0x7FFF ) {
5103         if ( aSig0 | aSig1 ) {
5104             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5105         }
5106         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5107     }
5108     if ( aExp == 0 ) {
5109         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5110         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5111     }
5112     else {
5113         aSig0 |= LIT64( 0x0001000000000000 );
5114     }
5115     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5116     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5117
5118 }
5119
5120 #endif
5121
5122 /*----------------------------------------------------------------------------
5123 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5124 | returns the result as a quadruple-precision floating-point value.  The
5125 | operation is performed according to the IEC/IEEE Standard for Binary
5126 | Floating-Point Arithmetic.
5127 *----------------------------------------------------------------------------*/
5128
5129 float128 float128_round_to_int( float128 a STATUS_PARAM )
5130 {
5131     flag aSign;
5132     int32 aExp;
5133     uint64_t lastBitMask, roundBitsMask;
5134     int8 roundingMode;
5135     float128 z;
5136
5137     aExp = extractFloat128Exp( a );
5138     if ( 0x402F <= aExp ) {
5139         if ( 0x406F <= aExp ) {
5140             if (    ( aExp == 0x7FFF )
5141                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5142                ) {
5143                 return propagateFloat128NaN( a, a STATUS_VAR );
5144             }
5145             return a;
5146         }
5147         lastBitMask = 1;
5148         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5149         roundBitsMask = lastBitMask - 1;
5150         z = a;
5151         roundingMode = STATUS(float_rounding_mode);
5152         if ( roundingMode == float_round_nearest_even ) {
5153             if ( lastBitMask ) {
5154                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5155                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5156             }
5157             else {
5158                 if ( (int64_t) z.low < 0 ) {
5159                     ++z.high;
5160                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5161                 }
5162             }
5163         }
5164         else if ( roundingMode != float_round_to_zero ) {
5165             if (   extractFloat128Sign( z )
5166                  ^ ( roundingMode == float_round_up ) ) {
5167                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5168             }
5169         }
5170         z.low &= ~ roundBitsMask;
5171     }
5172     else {
5173         if ( aExp < 0x3FFF ) {
5174             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5175             STATUS(float_exception_flags) |= float_flag_inexact;
5176             aSign = extractFloat128Sign( a );
5177             switch ( STATUS(float_rounding_mode) ) {
5178              case float_round_nearest_even:
5179                 if (    ( aExp == 0x3FFE )
5180                      && (   extractFloat128Frac0( a )
5181                           | extractFloat128Frac1( a ) )
5182                    ) {
5183                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5184                 }
5185                 break;
5186              case float_round_down:
5187                 return
5188                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5189                     : packFloat128( 0, 0, 0, 0 );
5190              case float_round_up:
5191                 return
5192                       aSign ? packFloat128( 1, 0, 0, 0 )
5193                     : packFloat128( 0, 0x3FFF, 0, 0 );
5194             }
5195             return packFloat128( aSign, 0, 0, 0 );
5196         }
5197         lastBitMask = 1;
5198         lastBitMask <<= 0x402F - aExp;
5199         roundBitsMask = lastBitMask - 1;
5200         z.low = 0;
5201         z.high = a.high;
5202         roundingMode = STATUS(float_rounding_mode);
5203         if ( roundingMode == float_round_nearest_even ) {
5204             z.high += lastBitMask>>1;
5205             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5206                 z.high &= ~ lastBitMask;
5207             }
5208         }
5209         else if ( roundingMode != float_round_to_zero ) {
5210             if (   extractFloat128Sign( z )
5211                  ^ ( roundingMode == float_round_up ) ) {
5212                 z.high |= ( a.low != 0 );
5213                 z.high += roundBitsMask;
5214             }
5215         }
5216         z.high &= ~ roundBitsMask;
5217     }
5218     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5219         STATUS(float_exception_flags) |= float_flag_inexact;
5220     }
5221     return z;
5222
5223 }
5224
5225 /*----------------------------------------------------------------------------
5226 | Returns the result of adding the absolute values of the quadruple-precision
5227 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5228 | before being returned.  `zSign' is ignored if the result is a NaN.
5229 | The addition is performed according to the IEC/IEEE Standard for Binary
5230 | Floating-Point Arithmetic.
5231 *----------------------------------------------------------------------------*/
5232
5233 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5234 {
5235     int32 aExp, bExp, zExp;
5236     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5237     int32 expDiff;
5238
5239     aSig1 = extractFloat128Frac1( a );
5240     aSig0 = extractFloat128Frac0( a );
5241     aExp = extractFloat128Exp( a );
5242     bSig1 = extractFloat128Frac1( b );
5243     bSig0 = extractFloat128Frac0( b );
5244     bExp = extractFloat128Exp( b );
5245     expDiff = aExp - bExp;
5246     if ( 0 < expDiff ) {
5247         if ( aExp == 0x7FFF ) {
5248             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5249             return a;
5250         }
5251         if ( bExp == 0 ) {
5252             --expDiff;
5253         }
5254         else {
5255             bSig0 |= LIT64( 0x0001000000000000 );
5256         }
5257         shift128ExtraRightJamming(
5258             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5259         zExp = aExp;
5260     }
5261     else if ( expDiff < 0 ) {
5262         if ( bExp == 0x7FFF ) {
5263             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5264             return packFloat128( zSign, 0x7FFF, 0, 0 );
5265         }
5266         if ( aExp == 0 ) {
5267             ++expDiff;
5268         }
5269         else {
5270             aSig0 |= LIT64( 0x0001000000000000 );
5271         }
5272         shift128ExtraRightJamming(
5273             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5274         zExp = bExp;
5275     }
5276     else {
5277         if ( aExp == 0x7FFF ) {
5278             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5279                 return propagateFloat128NaN( a, b STATUS_VAR );
5280             }
5281             return a;
5282         }
5283         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5284         if ( aExp == 0 ) {
5285             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5286             return packFloat128( zSign, 0, zSig0, zSig1 );
5287         }
5288         zSig2 = 0;
5289         zSig0 |= LIT64( 0x0002000000000000 );
5290         zExp = aExp;
5291         goto shiftRight1;
5292     }
5293     aSig0 |= LIT64( 0x0001000000000000 );
5294     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5295     --zExp;
5296     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5297     ++zExp;
5298  shiftRight1:
5299     shift128ExtraRightJamming(
5300         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5301  roundAndPack:
5302     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5303
5304 }
5305
5306 /*----------------------------------------------------------------------------
5307 | Returns the result of subtracting the absolute values of the quadruple-
5308 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5309 | difference is negated before being returned.  `zSign' is ignored if the
5310 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5311 | Standard for Binary Floating-Point Arithmetic.
5312 *----------------------------------------------------------------------------*/
5313
5314 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5315 {
5316     int32 aExp, bExp, zExp;
5317     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5318     int32 expDiff;
5319     float128 z;
5320
5321     aSig1 = extractFloat128Frac1( a );
5322     aSig0 = extractFloat128Frac0( a );
5323     aExp = extractFloat128Exp( a );
5324     bSig1 = extractFloat128Frac1( b );
5325     bSig0 = extractFloat128Frac0( b );
5326     bExp = extractFloat128Exp( b );
5327     expDiff = aExp - bExp;
5328     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5329     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5330     if ( 0 < expDiff ) goto aExpBigger;
5331     if ( expDiff < 0 ) goto bExpBigger;
5332     if ( aExp == 0x7FFF ) {
5333         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5334             return propagateFloat128NaN( a, b STATUS_VAR );
5335         }
5336         float_raise( float_flag_invalid STATUS_VAR);
5337         z.low = float128_default_nan_low;
5338         z.high = float128_default_nan_high;
5339         return z;
5340     }
5341     if ( aExp == 0 ) {
5342         aExp = 1;
5343         bExp = 1;
5344     }
5345     if ( bSig0 < aSig0 ) goto aBigger;
5346     if ( aSig0 < bSig0 ) goto bBigger;
5347     if ( bSig1 < aSig1 ) goto aBigger;
5348     if ( aSig1 < bSig1 ) goto bBigger;
5349     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5350  bExpBigger:
5351     if ( bExp == 0x7FFF ) {
5352         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5353         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5354     }
5355     if ( aExp == 0 ) {
5356         ++expDiff;
5357     }
5358     else {
5359         aSig0 |= LIT64( 0x4000000000000000 );
5360     }
5361     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5362     bSig0 |= LIT64( 0x4000000000000000 );
5363  bBigger:
5364     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5365     zExp = bExp;
5366     zSign ^= 1;
5367     goto normalizeRoundAndPack;
5368  aExpBigger:
5369     if ( aExp == 0x7FFF ) {
5370         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5371         return a;
5372     }
5373     if ( bExp == 0 ) {
5374         --expDiff;
5375     }
5376     else {
5377         bSig0 |= LIT64( 0x4000000000000000 );
5378     }
5379     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5380     aSig0 |= LIT64( 0x4000000000000000 );
5381  aBigger:
5382     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5383     zExp = aExp;
5384  normalizeRoundAndPack:
5385     --zExp;
5386     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5387
5388 }
5389
5390 /*----------------------------------------------------------------------------
5391 | Returns the result of adding the quadruple-precision floating-point values
5392 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5393 | for Binary Floating-Point Arithmetic.
5394 *----------------------------------------------------------------------------*/
5395
5396 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5397 {
5398     flag aSign, bSign;
5399
5400     aSign = extractFloat128Sign( a );
5401     bSign = extractFloat128Sign( b );
5402     if ( aSign == bSign ) {
5403         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5404     }
5405     else {
5406         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5407     }
5408
5409 }
5410
5411 /*----------------------------------------------------------------------------
5412 | Returns the result of subtracting the quadruple-precision floating-point
5413 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5414 | Standard for Binary Floating-Point Arithmetic.
5415 *----------------------------------------------------------------------------*/
5416
5417 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5418 {
5419     flag aSign, bSign;
5420
5421     aSign = extractFloat128Sign( a );
5422     bSign = extractFloat128Sign( b );
5423     if ( aSign == bSign ) {
5424         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5425     }
5426     else {
5427         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5428     }
5429
5430 }
5431
5432 /*----------------------------------------------------------------------------
5433 | Returns the result of multiplying the quadruple-precision floating-point
5434 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5435 | Standard for Binary Floating-Point Arithmetic.
5436 *----------------------------------------------------------------------------*/
5437
5438 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5439 {
5440     flag aSign, bSign, zSign;
5441     int32 aExp, bExp, zExp;
5442     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5443     float128 z;
5444
5445     aSig1 = extractFloat128Frac1( a );
5446     aSig0 = extractFloat128Frac0( a );
5447     aExp = extractFloat128Exp( a );
5448     aSign = extractFloat128Sign( a );
5449     bSig1 = extractFloat128Frac1( b );
5450     bSig0 = extractFloat128Frac0( b );
5451     bExp = extractFloat128Exp( b );
5452     bSign = extractFloat128Sign( b );
5453     zSign = aSign ^ bSign;
5454     if ( aExp == 0x7FFF ) {
5455         if (    ( aSig0 | aSig1 )
5456              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5457             return propagateFloat128NaN( a, b STATUS_VAR );
5458         }
5459         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5460         return packFloat128( zSign, 0x7FFF, 0, 0 );
5461     }
5462     if ( bExp == 0x7FFF ) {
5463         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5464         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5465  invalid:
5466             float_raise( float_flag_invalid STATUS_VAR);
5467             z.low = float128_default_nan_low;
5468             z.high = float128_default_nan_high;
5469             return z;
5470         }
5471         return packFloat128( zSign, 0x7FFF, 0, 0 );
5472     }
5473     if ( aExp == 0 ) {
5474         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5475         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5476     }
5477     if ( bExp == 0 ) {
5478         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5479         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5480     }
5481     zExp = aExp + bExp - 0x4000;
5482     aSig0 |= LIT64( 0x0001000000000000 );
5483     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5484     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5485     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5486     zSig2 |= ( zSig3 != 0 );
5487     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5488         shift128ExtraRightJamming(
5489             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5490         ++zExp;
5491     }
5492     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5493
5494 }
5495
5496 /*----------------------------------------------------------------------------
5497 | Returns the result of dividing the quadruple-precision floating-point value
5498 | `a' by the corresponding value `b'.  The operation is performed according to
5499 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5500 *----------------------------------------------------------------------------*/
5501
5502 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5503 {
5504     flag aSign, bSign, zSign;
5505     int32 aExp, bExp, zExp;
5506     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5507     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5508     float128 z;
5509
5510     aSig1 = extractFloat128Frac1( a );
5511     aSig0 = extractFloat128Frac0( a );
5512     aExp = extractFloat128Exp( a );
5513     aSign = extractFloat128Sign( a );
5514     bSig1 = extractFloat128Frac1( b );
5515     bSig0 = extractFloat128Frac0( b );
5516     bExp = extractFloat128Exp( b );
5517     bSign = extractFloat128Sign( b );
5518     zSign = aSign ^ bSign;
5519     if ( aExp == 0x7FFF ) {
5520         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5521         if ( bExp == 0x7FFF ) {
5522             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5523             goto invalid;
5524         }
5525         return packFloat128( zSign, 0x7FFF, 0, 0 );
5526     }
5527     if ( bExp == 0x7FFF ) {
5528         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5529         return packFloat128( zSign, 0, 0, 0 );
5530     }
5531     if ( bExp == 0 ) {
5532         if ( ( bSig0 | bSig1 ) == 0 ) {
5533             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5534  invalid:
5535                 float_raise( float_flag_invalid STATUS_VAR);
5536                 z.low = float128_default_nan_low;
5537                 z.high = float128_default_nan_high;
5538                 return z;
5539             }
5540             float_raise( float_flag_divbyzero STATUS_VAR);
5541             return packFloat128( zSign, 0x7FFF, 0, 0 );
5542         }
5543         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5544     }
5545     if ( aExp == 0 ) {
5546         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5547         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5548     }
5549     zExp = aExp - bExp + 0x3FFD;
5550     shortShift128Left(
5551         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5552     shortShift128Left(
5553         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5554     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5555         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5556         ++zExp;
5557     }
5558     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5559     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5560     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5561     while ( (int64_t) rem0 < 0 ) {
5562         --zSig0;
5563         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5564     }
5565     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5566     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5567         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5568         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5569         while ( (int64_t) rem1 < 0 ) {
5570             --zSig1;
5571             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5572         }
5573         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5574     }
5575     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5576     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5577
5578 }
5579
5580 /*----------------------------------------------------------------------------
5581 | Returns the remainder of the quadruple-precision floating-point value `a'
5582 | with respect to the corresponding value `b'.  The operation is performed
5583 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5584 *----------------------------------------------------------------------------*/
5585
5586 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5587 {
5588     flag aSign, zSign;
5589     int32 aExp, bExp, expDiff;
5590     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5591     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5592     int64_t sigMean0;
5593     float128 z;
5594
5595     aSig1 = extractFloat128Frac1( a );
5596     aSig0 = extractFloat128Frac0( a );
5597     aExp = extractFloat128Exp( a );
5598     aSign = extractFloat128Sign( a );
5599     bSig1 = extractFloat128Frac1( b );
5600     bSig0 = extractFloat128Frac0( b );
5601     bExp = extractFloat128Exp( b );
5602     if ( aExp == 0x7FFF ) {
5603         if (    ( aSig0 | aSig1 )
5604              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5605             return propagateFloat128NaN( a, b STATUS_VAR );
5606         }
5607         goto invalid;
5608     }
5609     if ( bExp == 0x7FFF ) {
5610         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5611         return a;
5612     }
5613     if ( bExp == 0 ) {
5614         if ( ( bSig0 | bSig1 ) == 0 ) {
5615  invalid:
5616             float_raise( float_flag_invalid STATUS_VAR);
5617             z.low = float128_default_nan_low;
5618             z.high = float128_default_nan_high;
5619             return z;
5620         }
5621         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5622     }
5623     if ( aExp == 0 ) {
5624         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5625         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5626     }
5627     expDiff = aExp - bExp;
5628     if ( expDiff < -1 ) return a;
5629     shortShift128Left(
5630         aSig0 | LIT64( 0x0001000000000000 ),
5631         aSig1,
5632         15 - ( expDiff < 0 ),
5633         &aSig0,
5634         &aSig1
5635     );
5636     shortShift128Left(
5637         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5638     q = le128( bSig0, bSig1, aSig0, aSig1 );
5639     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5640     expDiff -= 64;
5641     while ( 0 < expDiff ) {
5642         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5643         q = ( 4 < q ) ? q - 4 : 0;
5644         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5645         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5646         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5647         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5648         expDiff -= 61;
5649     }
5650     if ( -64 < expDiff ) {
5651         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5652         q = ( 4 < q ) ? q - 4 : 0;
5653         q >>= - expDiff;
5654         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5655         expDiff += 52;
5656         if ( expDiff < 0 ) {
5657             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5658         }
5659         else {
5660             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5661         }
5662         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5663         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5664     }
5665     else {
5666         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5667         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5668     }
5669     do {
5670         alternateASig0 = aSig0;
5671         alternateASig1 = aSig1;
5672         ++q;
5673         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5674     } while ( 0 <= (int64_t) aSig0 );
5675     add128(
5676         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5677     if (    ( sigMean0 < 0 )
5678          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5679         aSig0 = alternateASig0;
5680         aSig1 = alternateASig1;
5681     }
5682     zSign = ( (int64_t) aSig0 < 0 );
5683     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5684     return
5685         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5686
5687 }
5688
5689 /*----------------------------------------------------------------------------
5690 | Returns the square root of the quadruple-precision floating-point value `a'.
5691 | The operation is performed according to the IEC/IEEE Standard for Binary
5692 | Floating-Point Arithmetic.
5693 *----------------------------------------------------------------------------*/
5694
5695 float128 float128_sqrt( float128 a STATUS_PARAM )
5696 {
5697     flag aSign;
5698     int32 aExp, zExp;
5699     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5700     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5701     float128 z;
5702
5703     aSig1 = extractFloat128Frac1( a );
5704     aSig0 = extractFloat128Frac0( a );
5705     aExp = extractFloat128Exp( a );
5706     aSign = extractFloat128Sign( a );
5707     if ( aExp == 0x7FFF ) {
5708         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5709         if ( ! aSign ) return a;
5710         goto invalid;
5711     }
5712     if ( aSign ) {
5713         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5714  invalid:
5715         float_raise( float_flag_invalid STATUS_VAR);
5716         z.low = float128_default_nan_low;
5717         z.high = float128_default_nan_high;
5718         return z;
5719     }
5720     if ( aExp == 0 ) {
5721         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5722         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5723     }
5724     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5725     aSig0 |= LIT64( 0x0001000000000000 );
5726     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5727     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5728     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5729     doubleZSig0 = zSig0<<1;
5730     mul64To128( zSig0, zSig0, &term0, &term1 );
5731     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5732     while ( (int64_t) rem0 < 0 ) {
5733         --zSig0;
5734         doubleZSig0 -= 2;
5735         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5736     }
5737     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5738     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5739         if ( zSig1 == 0 ) zSig1 = 1;
5740         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5741         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5742         mul64To128( zSig1, zSig1, &term2, &term3 );
5743         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5744         while ( (int64_t) rem1 < 0 ) {
5745             --zSig1;
5746             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5747             term3 |= 1;
5748             term2 |= doubleZSig0;
5749             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5750         }
5751         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5752     }
5753     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5754     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5755
5756 }
5757
5758 /*----------------------------------------------------------------------------
5759 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5760 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5761 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5762 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5763 *----------------------------------------------------------------------------*/
5764
5765 int float128_eq( float128 a, float128 b STATUS_PARAM )
5766 {
5767
5768     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5769               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5770          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5771               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5772        ) {
5773         float_raise( float_flag_invalid STATUS_VAR);
5774         return 0;
5775     }
5776     return
5777            ( a.low == b.low )
5778         && (    ( a.high == b.high )
5779              || (    ( a.low == 0 )
5780                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5781            );
5782
5783 }
5784
5785 /*----------------------------------------------------------------------------
5786 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5787 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
5788 | exception is raised if either operand is a NaN.  The comparison is performed
5789 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5790 *----------------------------------------------------------------------------*/
5791
5792 int float128_le( float128 a, float128 b STATUS_PARAM )
5793 {
5794     flag aSign, bSign;
5795
5796     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5797               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5798          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5799               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5800        ) {
5801         float_raise( float_flag_invalid STATUS_VAR);
5802         return 0;
5803     }
5804     aSign = extractFloat128Sign( a );
5805     bSign = extractFloat128Sign( b );
5806     if ( aSign != bSign ) {
5807         return
5808                aSign
5809             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5810                  == 0 );
5811     }
5812     return
5813           aSign ? le128( b.high, b.low, a.high, a.low )
5814         : le128( a.high, a.low, b.high, b.low );
5815
5816 }
5817
5818 /*----------------------------------------------------------------------------
5819 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5820 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5821 | raised if either operand is a NaN.  The comparison is performed according
5822 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5823 *----------------------------------------------------------------------------*/
5824
5825 int float128_lt( float128 a, float128 b STATUS_PARAM )
5826 {
5827     flag aSign, bSign;
5828
5829     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5830               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5831          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5832               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5833        ) {
5834         float_raise( float_flag_invalid STATUS_VAR);
5835         return 0;
5836     }
5837     aSign = extractFloat128Sign( a );
5838     bSign = extractFloat128Sign( b );
5839     if ( aSign != bSign ) {
5840         return
5841                aSign
5842             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5843                  != 0 );
5844     }
5845     return
5846           aSign ? lt128( b.high, b.low, a.high, a.low )
5847         : lt128( a.high, a.low, b.high, b.low );
5848
5849 }
5850
5851 /*----------------------------------------------------------------------------
5852 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5853 | be compared, and 0 otherwise.  The invalid exception is raised if either
5854 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5855 | Standard for Binary Floating-Point Arithmetic.
5856 *----------------------------------------------------------------------------*/
5857
5858 int float128_unordered( float128 a, float128 b STATUS_PARAM )
5859 {
5860     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5861               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5862          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5863               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5864        ) {
5865         float_raise( float_flag_invalid STATUS_VAR);
5866         return 1;
5867     }
5868     return 0;
5869 }
5870
5871 /*----------------------------------------------------------------------------
5872 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5873 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5874 | exception.  The comparison is performed according to the IEC/IEEE Standard
5875 | for Binary Floating-Point Arithmetic.
5876 *----------------------------------------------------------------------------*/
5877
5878 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
5879 {
5880
5881     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5882               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5883          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5884               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5885        ) {
5886         if (    float128_is_signaling_nan( a )
5887              || float128_is_signaling_nan( b ) ) {
5888             float_raise( float_flag_invalid STATUS_VAR);
5889         }
5890         return 0;
5891     }
5892     return
5893            ( a.low == b.low )
5894         && (    ( a.high == b.high )
5895              || (    ( a.low == 0 )
5896                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5897            );
5898
5899 }
5900
5901 /*----------------------------------------------------------------------------
5902 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5903 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5904 | cause an exception.  Otherwise, the comparison is performed according to the
5905 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5906 *----------------------------------------------------------------------------*/
5907
5908 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5909 {
5910     flag aSign, bSign;
5911
5912     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5913               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5914          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5915               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5916        ) {
5917         if (    float128_is_signaling_nan( a )
5918              || float128_is_signaling_nan( b ) ) {
5919             float_raise( float_flag_invalid STATUS_VAR);
5920         }
5921         return 0;
5922     }
5923     aSign = extractFloat128Sign( a );
5924     bSign = extractFloat128Sign( b );
5925     if ( aSign != bSign ) {
5926         return
5927                aSign
5928             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5929                  == 0 );
5930     }
5931     return
5932           aSign ? le128( b.high, b.low, a.high, a.low )
5933         : le128( a.high, a.low, b.high, b.low );
5934
5935 }
5936
5937 /*----------------------------------------------------------------------------
5938 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5939 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5940 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5941 | Standard for Binary Floating-Point Arithmetic.
5942 *----------------------------------------------------------------------------*/
5943
5944 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5945 {
5946     flag aSign, bSign;
5947
5948     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5949               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5950          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5951               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5952        ) {
5953         if (    float128_is_signaling_nan( a )
5954              || float128_is_signaling_nan( b ) ) {
5955             float_raise( float_flag_invalid STATUS_VAR);
5956         }
5957         return 0;
5958     }
5959     aSign = extractFloat128Sign( a );
5960     bSign = extractFloat128Sign( b );
5961     if ( aSign != bSign ) {
5962         return
5963                aSign
5964             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5965                  != 0 );
5966     }
5967     return
5968           aSign ? lt128( b.high, b.low, a.high, a.low )
5969         : lt128( a.high, a.low, b.high, b.low );
5970
5971 }
5972
5973 /*----------------------------------------------------------------------------
5974 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5975 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5976 | comparison is performed according to the IEC/IEEE Standard for Binary
5977 | Floating-Point Arithmetic.
5978 *----------------------------------------------------------------------------*/
5979
5980 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
5981 {
5982     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5983               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5984          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5985               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5986        ) {
5987         if (    float128_is_signaling_nan( a )
5988              || float128_is_signaling_nan( b ) ) {
5989             float_raise( float_flag_invalid STATUS_VAR);
5990         }
5991         return 1;
5992     }
5993     return 0;
5994 }
5995
5996 #endif
5997
5998 /* misc functions */
5999 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
6000 {
6001     return int64_to_float32(a STATUS_VAR);
6002 }
6003
6004 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
6005 {
6006     return int64_to_float64(a STATUS_VAR);
6007 }
6008
6009 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
6010 {
6011     int64_t v;
6012     unsigned int res;
6013
6014     v = float32_to_int64(a STATUS_VAR);
6015     if (v < 0) {
6016         res = 0;
6017         float_raise( float_flag_invalid STATUS_VAR);
6018     } else if (v > 0xffffffff) {
6019         res = 0xffffffff;
6020         float_raise( float_flag_invalid STATUS_VAR);
6021     } else {
6022         res = v;
6023     }
6024     return res;
6025 }
6026
6027 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6028 {
6029     int64_t v;
6030     unsigned int res;
6031
6032     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6033     if (v < 0) {
6034         res = 0;
6035         float_raise( float_flag_invalid STATUS_VAR);
6036     } else if (v > 0xffffffff) {
6037         res = 0xffffffff;
6038         float_raise( float_flag_invalid STATUS_VAR);
6039     } else {
6040         res = v;
6041     }
6042     return res;
6043 }
6044
6045 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6046 {
6047     int64_t v;
6048     unsigned int res;
6049
6050     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6051     if (v < 0) {
6052         res = 0;
6053         float_raise( float_flag_invalid STATUS_VAR);
6054     } else if (v > 0xffff) {
6055         res = 0xffff;
6056         float_raise( float_flag_invalid STATUS_VAR);
6057     } else {
6058         res = v;
6059     }
6060     return res;
6061 }
6062
6063 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
6064 {
6065     int64_t v;
6066     unsigned int res;
6067
6068     v = float64_to_int64(a STATUS_VAR);
6069     if (v < 0) {
6070         res = 0;
6071         float_raise( float_flag_invalid STATUS_VAR);
6072     } else if (v > 0xffffffff) {
6073         res = 0xffffffff;
6074         float_raise( float_flag_invalid STATUS_VAR);
6075     } else {
6076         res = v;
6077     }
6078     return res;
6079 }
6080
6081 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6082 {
6083     int64_t v;
6084     unsigned int res;
6085
6086     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6087     if (v < 0) {
6088         res = 0;
6089         float_raise( float_flag_invalid STATUS_VAR);
6090     } else if (v > 0xffffffff) {
6091         res = 0xffffffff;
6092         float_raise( float_flag_invalid STATUS_VAR);
6093     } else {
6094         res = v;
6095     }
6096     return res;
6097 }
6098
6099 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6100 {
6101     int64_t v;
6102     unsigned int res;
6103
6104     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6105     if (v < 0) {
6106         res = 0;
6107         float_raise( float_flag_invalid STATUS_VAR);
6108     } else if (v > 0xffff) {
6109         res = 0xffff;
6110         float_raise( float_flag_invalid STATUS_VAR);
6111     } else {
6112         res = v;
6113     }
6114     return res;
6115 }
6116
6117 /* FIXME: This looks broken.  */
6118 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6119 {
6120     int64_t v;
6121
6122     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6123     v += float64_val(a);
6124     v = float64_to_int64(make_float64(v) STATUS_VAR);
6125
6126     return v - INT64_MIN;
6127 }
6128
6129 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6130 {
6131     int64_t v;
6132
6133     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6134     v += float64_val(a);
6135     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6136
6137     return v - INT64_MIN;
6138 }
6139
6140 #define COMPARE(s, nan_exp)                                                  \
6141 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6142                                       int is_quiet STATUS_PARAM )            \
6143 {                                                                            \
6144     flag aSign, bSign;                                                       \
6145     uint ## s ## _t av, bv;                                                  \
6146     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6147     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6148                                                                              \
6149     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6150          extractFloat ## s ## Frac( a ) ) ||                                 \
6151         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6152           extractFloat ## s ## Frac( b ) )) {                                \
6153         if (!is_quiet ||                                                     \
6154             float ## s ## _is_signaling_nan( a ) ||                          \
6155             float ## s ## _is_signaling_nan( b ) ) {                         \
6156             float_raise( float_flag_invalid STATUS_VAR);                     \
6157         }                                                                    \
6158         return float_relation_unordered;                                     \
6159     }                                                                        \
6160     aSign = extractFloat ## s ## Sign( a );                                  \
6161     bSign = extractFloat ## s ## Sign( b );                                  \
6162     av = float ## s ## _val(a);                                              \
6163     bv = float ## s ## _val(b);                                              \
6164     if ( aSign != bSign ) {                                                  \
6165         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6166             /* zero case */                                                  \
6167             return float_relation_equal;                                     \
6168         } else {                                                             \
6169             return 1 - (2 * aSign);                                          \
6170         }                                                                    \
6171     } else {                                                                 \
6172         if (av == bv) {                                                      \
6173             return float_relation_equal;                                     \
6174         } else {                                                             \
6175             return 1 - 2 * (aSign ^ ( av < bv ));                            \
6176         }                                                                    \
6177     }                                                                        \
6178 }                                                                            \
6179                                                                              \
6180 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6181 {                                                                            \
6182     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6183 }                                                                            \
6184                                                                              \
6185 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6186 {                                                                            \
6187     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6188 }
6189
6190 COMPARE(32, 0xff)
6191 COMPARE(64, 0x7ff)
6192
6193 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6194                                       int is_quiet STATUS_PARAM )
6195 {
6196     flag aSign, bSign;
6197
6198     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6199           ( extractFloatx80Frac( a )<<1 ) ) ||
6200         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6201           ( extractFloatx80Frac( b )<<1 ) )) {
6202         if (!is_quiet ||
6203             floatx80_is_signaling_nan( a ) ||
6204             floatx80_is_signaling_nan( b ) ) {
6205             float_raise( float_flag_invalid STATUS_VAR);
6206         }
6207         return float_relation_unordered;
6208     }
6209     aSign = extractFloatx80Sign( a );
6210     bSign = extractFloatx80Sign( b );
6211     if ( aSign != bSign ) {
6212
6213         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6214              ( ( a.low | b.low ) == 0 ) ) {
6215             /* zero case */
6216             return float_relation_equal;
6217         } else {
6218             return 1 - (2 * aSign);
6219         }
6220     } else {
6221         if (a.low == b.low && a.high == b.high) {
6222             return float_relation_equal;
6223         } else {
6224             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6225         }
6226     }
6227 }
6228
6229 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6230 {
6231     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6232 }
6233
6234 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6235 {
6236     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6237 }
6238
6239 INLINE int float128_compare_internal( float128 a, float128 b,
6240                                       int is_quiet STATUS_PARAM )
6241 {
6242     flag aSign, bSign;
6243
6244     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6245           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6246         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6247           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6248         if (!is_quiet ||
6249             float128_is_signaling_nan( a ) ||
6250             float128_is_signaling_nan( b ) ) {
6251             float_raise( float_flag_invalid STATUS_VAR);
6252         }
6253         return float_relation_unordered;
6254     }
6255     aSign = extractFloat128Sign( a );
6256     bSign = extractFloat128Sign( b );
6257     if ( aSign != bSign ) {
6258         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6259             /* zero case */
6260             return float_relation_equal;
6261         } else {
6262             return 1 - (2 * aSign);
6263         }
6264     } else {
6265         if (a.low == b.low && a.high == b.high) {
6266             return float_relation_equal;
6267         } else {
6268             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6269         }
6270     }
6271 }
6272
6273 int float128_compare( float128 a, float128 b STATUS_PARAM )
6274 {
6275     return float128_compare_internal(a, b, 0 STATUS_VAR);
6276 }
6277
6278 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6279 {
6280     return float128_compare_internal(a, b, 1 STATUS_VAR);
6281 }
6282
6283 /* min() and max() functions. These can't be implemented as
6284  * 'compare and pick one input' because that would mishandle
6285  * NaNs and +0 vs -0.
6286  */
6287 #define MINMAX(s, nan_exp)                                              \
6288 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6289                                         int ismin STATUS_PARAM )        \
6290 {                                                                       \
6291     flag aSign, bSign;                                                  \
6292     uint ## s ## _t av, bv;                                             \
6293     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6294     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6295     if (float ## s ## _is_any_nan(a) ||                                 \
6296         float ## s ## _is_any_nan(b)) {                                 \
6297         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6298     }                                                                   \
6299     aSign = extractFloat ## s ## Sign(a);                               \
6300     bSign = extractFloat ## s ## Sign(b);                               \
6301     av = float ## s ## _val(a);                                         \
6302     bv = float ## s ## _val(b);                                         \
6303     if (aSign != bSign) {                                               \
6304         if (ismin) {                                                    \
6305             return aSign ? a : b;                                       \
6306         } else {                                                        \
6307             return aSign ? b : a;                                       \
6308         }                                                               \
6309     } else {                                                            \
6310         if (ismin) {                                                    \
6311             return (aSign ^ (av < bv)) ? a : b;                         \
6312         } else {                                                        \
6313             return (aSign ^ (av < bv)) ? b : a;                         \
6314         }                                                               \
6315     }                                                                   \
6316 }                                                                       \
6317                                                                         \
6318 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6319 {                                                                       \
6320     return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6321 }                                                                       \
6322                                                                         \
6323 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6324 {                                                                       \
6325     return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6326 }
6327
6328 MINMAX(32, 0xff)
6329 MINMAX(64, 0x7ff)
6330
6331
6332 /* Multiply A by 2 raised to the power N.  */
6333 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6334 {
6335     flag aSign;
6336     int16_t aExp;
6337     uint32_t aSig;
6338
6339     a = float32_squash_input_denormal(a STATUS_VAR);
6340     aSig = extractFloat32Frac( a );
6341     aExp = extractFloat32Exp( a );
6342     aSign = extractFloat32Sign( a );
6343
6344     if ( aExp == 0xFF ) {
6345         if ( aSig ) {
6346             return propagateFloat32NaN( a, a STATUS_VAR );
6347         }
6348         return a;
6349     }
6350     if ( aExp != 0 )
6351         aSig |= 0x00800000;
6352     else if ( aSig == 0 )
6353         return a;
6354
6355     if (n > 0x200) {
6356         n = 0x200;
6357     } else if (n < -0x200) {
6358         n = -0x200;
6359     }
6360
6361     aExp += n - 1;
6362     aSig <<= 7;
6363     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6364 }
6365
6366 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6367 {
6368     flag aSign;
6369     int16_t aExp;
6370     uint64_t aSig;
6371
6372     a = float64_squash_input_denormal(a STATUS_VAR);
6373     aSig = extractFloat64Frac( a );
6374     aExp = extractFloat64Exp( a );
6375     aSign = extractFloat64Sign( a );
6376
6377     if ( aExp == 0x7FF ) {
6378         if ( aSig ) {
6379             return propagateFloat64NaN( a, a STATUS_VAR );
6380         }
6381         return a;
6382     }
6383     if ( aExp != 0 )
6384         aSig |= LIT64( 0x0010000000000000 );
6385     else if ( aSig == 0 )
6386         return a;
6387
6388     if (n > 0x1000) {
6389         n = 0x1000;
6390     } else if (n < -0x1000) {
6391         n = -0x1000;
6392     }
6393
6394     aExp += n - 1;
6395     aSig <<= 10;
6396     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6397 }
6398
6399 #ifdef FLOATX80
6400 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6401 {
6402     flag aSign;
6403     int32_t aExp;
6404     uint64_t aSig;
6405
6406     aSig = extractFloatx80Frac( a );
6407     aExp = extractFloatx80Exp( a );
6408     aSign = extractFloatx80Sign( a );
6409
6410     if ( aExp == 0x7FFF ) {
6411         if ( aSig<<1 ) {
6412             return propagateFloatx80NaN( a, a STATUS_VAR );
6413         }
6414         return a;
6415     }
6416
6417     if (aExp == 0 && aSig == 0)
6418         return a;
6419
6420     if (n > 0x10000) {
6421         n = 0x10000;
6422     } else if (n < -0x10000) {
6423         n = -0x10000;
6424     }
6425
6426     aExp += n;
6427     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6428                                           aSign, aExp, aSig, 0 STATUS_VAR );
6429 }
6430 #endif
6431
6432 #ifdef FLOAT128
6433 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6434 {
6435     flag aSign;
6436     int32_t aExp;
6437     uint64_t aSig0, aSig1;
6438
6439     aSig1 = extractFloat128Frac1( a );
6440     aSig0 = extractFloat128Frac0( a );
6441     aExp = extractFloat128Exp( a );
6442     aSign = extractFloat128Sign( a );
6443     if ( aExp == 0x7FFF ) {
6444         if ( aSig0 | aSig1 ) {
6445             return propagateFloat128NaN( a, a STATUS_VAR );
6446         }
6447         return a;
6448     }
6449     if ( aExp != 0 )
6450         aSig0 |= LIT64( 0x0001000000000000 );
6451     else if ( aSig0 == 0 && aSig1 == 0 )
6452         return a;
6453
6454     if (n > 0x10000) {
6455         n = 0x10000;
6456     } else if (n < -0x10000) {
6457         n = -0x10000;
6458     }
6459
6460     aExp += n - 1;
6461     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6462                                           STATUS_VAR );
6463
6464 }
6465 #endif