fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * Derived from SoftFloat.
   5  */
   6
   7 /*============================================================================
   8
   9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  10 Package, Release 2b.
  11
  12 Written by John R. Hauser.  This work was made possible in part by the
  13 International Computer Science Institute, located at Suite 600, 1947 Center
  14 Street, Berkeley, California 94704.  Funding was partially provided by the
  15 National Science Foundation under grant MIP-9311980.  The original version
  16 of this code was written as part of a project to build a fixed-point vector
  17 processor in collaboration with the University of California at Berkeley,
  18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  20 arithmetic/SoftFloat.html'.
  21
  22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  30
  31 Derivative works are acceptable, even for commercial purposes, so long as
  32 (1) the source code for the derivative work includes prominent notice that
  33 the work is derivative, and (2) the source code includes prominent notice with
  34 these four paragraphs for those parts of this code that are retained.
  35
  36 =============================================================================*/
  37
  38 #include "softfloat.h"
  39
  40 /*----------------------------------------------------------------------------
  41 | Primitive arithmetic functions, including multi-word arithmetic, and
  42 | division and square root approximations.  (Can be specialized to target if
  43 | desired.)
  44 *----------------------------------------------------------------------------*/
  45 #include "softfloat-macros.h"
  46
  47 /*----------------------------------------------------------------------------
  48 | Functions and definitions to determine:  (1) whether tininess for underflow
  49 | is detected before or after rounding by default, (2) what (if anything)
  50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  52 | are propagated from function inputs to output.  These details are target-
  53 | specific.
  54 *----------------------------------------------------------------------------*/
  55 #include "softfloat-specialize.h"
  56
  57 void set_float_rounding_mode(int val STATUS_PARAM)
  58 {
  59     STATUS(float_rounding_mode) = val;
  60 }
  61
  62 void set_float_exception_flags(int val STATUS_PARAM)
  63 {
  64     STATUS(float_exception_flags) = val;
  65 }
  66
  67 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  68 {
  69     STATUS(floatx80_rounding_precision) = val;
  70 }
  71
  72 /*----------------------------------------------------------------------------
  73 | Returns the fraction bits of the half-precision floating-point value `a'.
  74 *----------------------------------------------------------------------------*/
  75
  76 INLINE uint32_t extractFloat16Frac(float16 a)
  77 {
  78     return float16_val(a) & 0x3ff;
  79 }
  80
  81 /*----------------------------------------------------------------------------
  82 | Returns the exponent bits of the half-precision floating-point value `a'.
  83 *----------------------------------------------------------------------------*/
  84
  85 INLINE int16 extractFloat16Exp(float16 a)
  86 {
  87     return (float16_val(a) >> 10) & 0x1f;
  88 }
  89
  90 /*----------------------------------------------------------------------------
  91 | Returns the sign bit of the single-precision floating-point value `a'.
  92 *----------------------------------------------------------------------------*/
  93
  94 INLINE flag extractFloat16Sign(float16 a)
  95 {
  96     return float16_val(a)>>15;
  97 }
  98
  99 /*----------------------------------------------------------------------------
 100 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 101 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 102 | input.  If `zSign' is 1, the input is negated before being converted to an
 103 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 104 | is simply rounded to an integer, with the inexact exception raised if the
 105 | input cannot be represented exactly as an integer.  However, if the fixed-
 106 | point input is too large, the invalid exception is raised and the largest
 107 | positive or negative integer is returned.
 108 *----------------------------------------------------------------------------*/
 109
 110 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
 111 {
 112     int8 roundingMode;
 113     flag roundNearestEven;
 114     int8 roundIncrement, roundBits;
 115     int32 z;
 116
 117     roundingMode = STATUS(float_rounding_mode);
 118     roundNearestEven = ( roundingMode == float_round_nearest_even );
 119     roundIncrement = 0x40;
 120     if ( ! roundNearestEven ) {
 121         if ( roundingMode == float_round_to_zero ) {
 122             roundIncrement = 0;
 123         }
 124         else {
 125             roundIncrement = 0x7F;
 126             if ( zSign ) {
 127                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 128             }
 129             else {
 130                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 131             }
 132         }
 133     }
 134     roundBits = absZ & 0x7F;
 135     absZ = ( absZ + roundIncrement )>>7;
 136     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 137     z = absZ;
 138     if ( zSign ) z = - z;
 139     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 140         float_raise( float_flag_invalid STATUS_VAR);
 141         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 142     }
 143     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 144     return z;
 145
 146 }
 147
 148 /*----------------------------------------------------------------------------
 149 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 150 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 151 | and returns the properly rounded 64-bit integer corresponding to the input.
 152 | If `zSign' is 1, the input is negated before being converted to an integer.
 153 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 154 | the inexact exception raised if the input cannot be represented exactly as
 155 | an integer.  However, if the fixed-point input is too large, the invalid
 156 | exception is raised and the largest positive or negative integer is
 157 | returned.
 158 *----------------------------------------------------------------------------*/
 159
 160 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
 161 {
 162     int8 roundingMode;
 163     flag roundNearestEven, increment;
 164     int64 z;
 165
 166     roundingMode = STATUS(float_rounding_mode);
 167     roundNearestEven = ( roundingMode == float_round_nearest_even );
 168     increment = ( (int64_t) absZ1 < 0 );
 169     if ( ! roundNearestEven ) {
 170         if ( roundingMode == float_round_to_zero ) {
 171             increment = 0;
 172         }
 173         else {
 174             if ( zSign ) {
 175                 increment = ( roundingMode == float_round_down ) && absZ1;
 176             }
 177             else {
 178                 increment = ( roundingMode == float_round_up ) && absZ1;
 179             }
 180         }
 181     }
 182     if ( increment ) {
 183         ++absZ0;
 184         if ( absZ0 == 0 ) goto overflow;
 185         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 186     }
 187     z = absZ0;
 188     if ( zSign ) z = - z;
 189     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 190  overflow:
 191         float_raise( float_flag_invalid STATUS_VAR);
 192         return
 193               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 194             : LIT64( 0x7FFFFFFFFFFFFFFF );
 195     }
 196     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 197     return z;
 198
 199 }
 200
 201 /*----------------------------------------------------------------------------
 202 | Returns the fraction bits of the single-precision floating-point value `a'.
 203 *----------------------------------------------------------------------------*/
 204
 205 INLINE uint32_t extractFloat32Frac( float32 a )
 206 {
 207
 208     return float32_val(a) & 0x007FFFFF;
 209
 210 }
 211
 212 /*----------------------------------------------------------------------------
 213 | Returns the exponent bits of the single-precision floating-point value `a'.
 214 *----------------------------------------------------------------------------*/
 215
 216 INLINE int16 extractFloat32Exp( float32 a )
 217 {
 218
 219     return ( float32_val(a)>>23 ) & 0xFF;
 220
 221 }
 222
 223 /*----------------------------------------------------------------------------
 224 | Returns the sign bit of the single-precision floating-point value `a'.
 225 *----------------------------------------------------------------------------*/
 226
 227 INLINE flag extractFloat32Sign( float32 a )
 228 {
 229
 230     return float32_val(a)>>31;
 231
 232 }
 233
 234 /*----------------------------------------------------------------------------
 235 | If `a' is denormal and we are in flush-to-zero mode then set the
 236 | input-denormal exception and return zero. Otherwise just return the value.
 237 *----------------------------------------------------------------------------*/
 238 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 239 {
 240     if (STATUS(flush_inputs_to_zero)) {
 241         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 242             float_raise(float_flag_input_denormal STATUS_VAR);
 243             return make_float32(float32_val(a) & 0x80000000);
 244         }
 245     }
 246     return a;
 247 }
 248
 249 /*----------------------------------------------------------------------------
 250 | Normalizes the subnormal single-precision floating-point value represented
 251 | by the denormalized significand `aSig'.  The normalized exponent and
 252 | significand are stored at the locations pointed to by `zExpPtr' and
 253 | `zSigPtr', respectively.
 254 *----------------------------------------------------------------------------*/
 255
 256 static void
 257  normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
 258 {
 259     int8 shiftCount;
 260
 261     shiftCount = countLeadingZeros32( aSig ) - 8;
 262     *zSigPtr = aSig<<shiftCount;
 263     *zExpPtr = 1 - shiftCount;
 264
 265 }
 266
 267 /*----------------------------------------------------------------------------
 268 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 269 | single-precision floating-point value, returning the result.  After being
 270 | shifted into the proper positions, the three fields are simply added
 271 | together to form the result.  This means that any integer portion of `zSig'
 272 | will be added into the exponent.  Since a properly normalized significand
 273 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 274 | than the desired result exponent whenever `zSig' is a complete, normalized
 275 | significand.
 276 *----------------------------------------------------------------------------*/
 277
 278 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
 279 {
 280
 281     return make_float32(
 282           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 283
 284 }
 285
 286 /*----------------------------------------------------------------------------
 287 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 288 | and significand `zSig', and returns the proper single-precision floating-
 289 | point value corresponding to the abstract input.  Ordinarily, the abstract
 290 | value is simply rounded and packed into the single-precision format, with
 291 | the inexact exception raised if the abstract input cannot be represented
 292 | exactly.  However, if the abstract value is too large, the overflow and
 293 | inexact exceptions are raised and an infinity or maximal finite value is
 294 | returned.  If the abstract value is too small, the input value is rounded to
 295 | a subnormal number, and the underflow and inexact exceptions are raised if
 296 | the abstract input cannot be represented exactly as a subnormal single-
 297 | precision floating-point number.
 298 |     The input significand `zSig' has its binary point between bits 30
 299 | and 29, which is 7 bits to the left of the usual location.  This shifted
 300 | significand must be normalized or smaller.  If `zSig' is not normalized,
 301 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 302 | and it must not require rounding.  In the usual case that `zSig' is
 303 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 304 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 305 | Binary Floating-Point Arithmetic.
 306 *----------------------------------------------------------------------------*/
 307
 308 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
 309 {
 310     int8 roundingMode;
 311     flag roundNearestEven;
 312     int8 roundIncrement, roundBits;
 313     flag isTiny;
 314
 315     roundingMode = STATUS(float_rounding_mode);
 316     roundNearestEven = ( roundingMode == float_round_nearest_even );
 317     roundIncrement = 0x40;
 318     if ( ! roundNearestEven ) {
 319         if ( roundingMode == float_round_to_zero ) {
 320             roundIncrement = 0;
 321         }
 322         else {
 323             roundIncrement = 0x7F;
 324             if ( zSign ) {
 325                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 326             }
 327             else {
 328                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 329             }
 330         }
 331     }
 332     roundBits = zSig & 0x7F;
 333     if ( 0xFD <= (uint16_t) zExp ) {
 334         if (    ( 0xFD < zExp )
 335              || (    ( zExp == 0xFD )
 336                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 337            ) {
 338             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 339             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 340         }
 341         if ( zExp < 0 ) {
 342             if (STATUS(flush_to_zero)) {
 343                 float_raise(float_flag_output_denormal STATUS_VAR);
 344                 return packFloat32(zSign, 0, 0);
 345             }
 346             isTiny =
 347                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 348                 || ( zExp < -1 )
 349                 || ( zSig + roundIncrement < 0x80000000 );
 350             shift32RightJamming( zSig, - zExp, &zSig );
 351             zExp = 0;
 352             roundBits = zSig & 0x7F;
 353             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 354         }
 355     }
 356     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 357     zSig = ( zSig + roundIncrement )>>7;
 358     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 359     if ( zSig == 0 ) zExp = 0;
 360     return packFloat32( zSign, zExp, zSig );
 361
 362 }
 363
 364 /*----------------------------------------------------------------------------
 365 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 366 | and significand `zSig', and returns the proper single-precision floating-
 367 | point value corresponding to the abstract input.  This routine is just like
 368 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 369 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 370 | floating-point exponent.
 371 *----------------------------------------------------------------------------*/
 372
 373 static float32
 374  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
 375 {
 376     int8 shiftCount;
 377
 378     shiftCount = countLeadingZeros32( zSig ) - 1;
 379     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 380
 381 }
 382
 383 /*----------------------------------------------------------------------------
 384 | Returns the fraction bits of the double-precision floating-point value `a'.
 385 *----------------------------------------------------------------------------*/
 386
 387 INLINE uint64_t extractFloat64Frac( float64 a )
 388 {
 389
 390     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 391
 392 }
 393
 394 /*----------------------------------------------------------------------------
 395 | Returns the exponent bits of the double-precision floating-point value `a'.
 396 *----------------------------------------------------------------------------*/
 397
 398 INLINE int16 extractFloat64Exp( float64 a )
 399 {
 400
 401     return ( float64_val(a)>>52 ) & 0x7FF;
 402
 403 }
 404
 405 /*----------------------------------------------------------------------------
 406 | Returns the sign bit of the double-precision floating-point value `a'.
 407 *----------------------------------------------------------------------------*/
 408
 409 INLINE flag extractFloat64Sign( float64 a )
 410 {
 411
 412     return float64_val(a)>>63;
 413
 414 }
 415
 416 /*----------------------------------------------------------------------------
 417 | If `a' is denormal and we are in flush-to-zero mode then set the
 418 | input-denormal exception and return zero. Otherwise just return the value.
 419 *----------------------------------------------------------------------------*/
 420 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 421 {
 422     if (STATUS(flush_inputs_to_zero)) {
 423         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 424             float_raise(float_flag_input_denormal STATUS_VAR);
 425             return make_float64(float64_val(a) & (1ULL << 63));
 426         }
 427     }
 428     return a;
 429 }
 430
 431 /*----------------------------------------------------------------------------
 432 | Normalizes the subnormal double-precision floating-point value represented
 433 | by the denormalized significand `aSig'.  The normalized exponent and
 434 | significand are stored at the locations pointed to by `zExpPtr' and
 435 | `zSigPtr', respectively.
 436 *----------------------------------------------------------------------------*/
 437
 438 static void
 439  normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
 440 {
 441     int8 shiftCount;
 442
 443     shiftCount = countLeadingZeros64( aSig ) - 11;
 444     *zSigPtr = aSig<<shiftCount;
 445     *zExpPtr = 1 - shiftCount;
 446
 447 }
 448
 449 /*----------------------------------------------------------------------------
 450 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 451 | double-precision floating-point value, returning the result.  After being
 452 | shifted into the proper positions, the three fields are simply added
 453 | together to form the result.  This means that any integer portion of `zSig'
 454 | will be added into the exponent.  Since a properly normalized significand
 455 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 456 | than the desired result exponent whenever `zSig' is a complete, normalized
 457 | significand.
 458 *----------------------------------------------------------------------------*/
 459
 460 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
 461 {
 462
 463     return make_float64(
 464         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 465
 466 }
 467
 468 /*----------------------------------------------------------------------------
 469 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 470 | and significand `zSig', and returns the proper double-precision floating-
 471 | point value corresponding to the abstract input.  Ordinarily, the abstract
 472 | value is simply rounded and packed into the double-precision format, with
 473 | the inexact exception raised if the abstract input cannot be represented
 474 | exactly.  However, if the abstract value is too large, the overflow and
 475 | inexact exceptions are raised and an infinity or maximal finite value is
 476 | returned.  If the abstract value is too small, the input value is rounded
 477 | to a subnormal number, and the underflow and inexact exceptions are raised
 478 | if the abstract input cannot be represented exactly as a subnormal double-
 479 | precision floating-point number.
 480 |     The input significand `zSig' has its binary point between bits 62
 481 | and 61, which is 10 bits to the left of the usual location.  This shifted
 482 | significand must be normalized or smaller.  If `zSig' is not normalized,
 483 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 484 | and it must not require rounding.  In the usual case that `zSig' is
 485 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 486 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 487 | Binary Floating-Point Arithmetic.
 488 *----------------------------------------------------------------------------*/
 489
 490 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
 491 {
 492     int8 roundingMode;
 493     flag roundNearestEven;
 494     int16 roundIncrement, roundBits;
 495     flag isTiny;
 496
 497     roundingMode = STATUS(float_rounding_mode);
 498     roundNearestEven = ( roundingMode == float_round_nearest_even );
 499     roundIncrement = 0x200;
 500     if ( ! roundNearestEven ) {
 501         if ( roundingMode == float_round_to_zero ) {
 502             roundIncrement = 0;
 503         }
 504         else {
 505             roundIncrement = 0x3FF;
 506             if ( zSign ) {
 507                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 508             }
 509             else {
 510                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 511             }
 512         }
 513     }
 514     roundBits = zSig & 0x3FF;
 515     if ( 0x7FD <= (uint16_t) zExp ) {
 516         if (    ( 0x7FD < zExp )
 517              || (    ( zExp == 0x7FD )
 518                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 519            ) {
 520             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 521             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 522         }
 523         if ( zExp < 0 ) {
 524             if (STATUS(flush_to_zero)) {
 525                 float_raise(float_flag_output_denormal STATUS_VAR);
 526                 return packFloat64(zSign, 0, 0);
 527             }
 528             isTiny =
 529                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 530                 || ( zExp < -1 )
 531                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 532             shift64RightJamming( zSig, - zExp, &zSig );
 533             zExp = 0;
 534             roundBits = zSig & 0x3FF;
 535             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 536         }
 537     }
 538     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 539     zSig = ( zSig + roundIncrement )>>10;
 540     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 541     if ( zSig == 0 ) zExp = 0;
 542     return packFloat64( zSign, zExp, zSig );
 543
 544 }
 545
 546 /*----------------------------------------------------------------------------
 547 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 548 | and significand `zSig', and returns the proper double-precision floating-
 549 | point value corresponding to the abstract input.  This routine is just like
 550 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 551 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 552 | floating-point exponent.
 553 *----------------------------------------------------------------------------*/
 554
 555 static float64
 556  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
 557 {
 558     int8 shiftCount;
 559
 560     shiftCount = countLeadingZeros64( zSig ) - 1;
 561     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 562
 563 }
 564
 565 /*----------------------------------------------------------------------------
 566 | Returns the fraction bits of the extended double-precision floating-point
 567 | value `a'.
 568 *----------------------------------------------------------------------------*/
 569
 570 INLINE uint64_t extractFloatx80Frac( floatx80 a )
 571 {
 572
 573     return a.low;
 574
 575 }
 576
 577 /*----------------------------------------------------------------------------
 578 | Returns the exponent bits of the extended double-precision floating-point
 579 | value `a'.
 580 *----------------------------------------------------------------------------*/
 581
 582 INLINE int32 extractFloatx80Exp( floatx80 a )
 583 {
 584
 585     return a.high & 0x7FFF;
 586
 587 }
 588
 589 /*----------------------------------------------------------------------------
 590 | Returns the sign bit of the extended double-precision floating-point value
 591 | `a'.
 592 *----------------------------------------------------------------------------*/
 593
 594 INLINE flag extractFloatx80Sign( floatx80 a )
 595 {
 596
 597     return a.high>>15;
 598
 599 }
 600
 601 /*----------------------------------------------------------------------------
 602 | Normalizes the subnormal extended double-precision floating-point value
 603 | represented by the denormalized significand `aSig'.  The normalized exponent
 604 | and significand are stored at the locations pointed to by `zExpPtr' and
 605 | `zSigPtr', respectively.
 606 *----------------------------------------------------------------------------*/
 607
 608 static void
 609  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 610 {
 611     int8 shiftCount;
 612
 613     shiftCount = countLeadingZeros64( aSig );
 614     *zSigPtr = aSig<<shiftCount;
 615     *zExpPtr = 1 - shiftCount;
 616
 617 }
 618
 619 /*----------------------------------------------------------------------------
 620 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 621 | extended double-precision floating-point value, returning the result.
 622 *----------------------------------------------------------------------------*/
 623
 624 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 625 {
 626     floatx80 z;
 627
 628     z.low = zSig;
 629     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 630     return z;
 631
 632 }
 633
 634 /*----------------------------------------------------------------------------
 635 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 636 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 637 | and returns the proper extended double-precision floating-point value
 638 | corresponding to the abstract input.  Ordinarily, the abstract value is
 639 | rounded and packed into the extended double-precision format, with the
 640 | inexact exception raised if the abstract input cannot be represented
 641 | exactly.  However, if the abstract value is too large, the overflow and
 642 | inexact exceptions are raised and an infinity or maximal finite value is
 643 | returned.  If the abstract value is too small, the input value is rounded to
 644 | a subnormal number, and the underflow and inexact exceptions are raised if
 645 | the abstract input cannot be represented exactly as a subnormal extended
 646 | double-precision floating-point number.
 647 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 648 | number of bits as single or double precision, respectively.  Otherwise, the
 649 | result is rounded to the full precision of the extended double-precision
 650 | format.
 651 |     The input significand must be normalized or smaller.  If the input
 652 | significand is not normalized, `zExp' must be 0; in that case, the result
 653 | returned is a subnormal number, and it must not require rounding.  The
 654 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 655 | Floating-Point Arithmetic.
 656 *----------------------------------------------------------------------------*/
 657
 658 static floatx80
 659  roundAndPackFloatx80(
 660      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 661  STATUS_PARAM)
 662 {
 663     int8 roundingMode;
 664     flag roundNearestEven, increment, isTiny;
 665     int64 roundIncrement, roundMask, roundBits;
 666
 667     roundingMode = STATUS(float_rounding_mode);
 668     roundNearestEven = ( roundingMode == float_round_nearest_even );
 669     if ( roundingPrecision == 80 ) goto precision80;
 670     if ( roundingPrecision == 64 ) {
 671         roundIncrement = LIT64( 0x0000000000000400 );
 672         roundMask = LIT64( 0x00000000000007FF );
 673     }
 674     else if ( roundingPrecision == 32 ) {
 675         roundIncrement = LIT64( 0x0000008000000000 );
 676         roundMask = LIT64( 0x000000FFFFFFFFFF );
 677     }
 678     else {
 679         goto precision80;
 680     }
 681     zSig0 |= ( zSig1 != 0 );
 682     if ( ! roundNearestEven ) {
 683         if ( roundingMode == float_round_to_zero ) {
 684             roundIncrement = 0;
 685         }
 686         else {
 687             roundIncrement = roundMask;
 688             if ( zSign ) {
 689                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 690             }
 691             else {
 692                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 693             }
 694         }
 695     }
 696     roundBits = zSig0 & roundMask;
 697     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 698         if (    ( 0x7FFE < zExp )
 699              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 700            ) {
 701             goto overflow;
 702         }
 703         if ( zExp <= 0 ) {
 704             if (STATUS(flush_to_zero)) {
 705                 float_raise(float_flag_output_denormal STATUS_VAR);
 706                 return packFloatx80(zSign, 0, 0);
 707             }
 708             isTiny =
 709                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 710                 || ( zExp < 0 )
 711                 || ( zSig0 <= zSig0 + roundIncrement );
 712             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 713             zExp = 0;
 714             roundBits = zSig0 & roundMask;
 715             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 716             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 717             zSig0 += roundIncrement;
 718             if ( (int64_t) zSig0 < 0 ) zExp = 1;
 719             roundIncrement = roundMask + 1;
 720             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 721                 roundMask |= roundIncrement;
 722             }
 723             zSig0 &= ~ roundMask;
 724             return packFloatx80( zSign, zExp, zSig0 );
 725         }
 726     }
 727     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 728     zSig0 += roundIncrement;
 729     if ( zSig0 < roundIncrement ) {
 730         ++zExp;
 731         zSig0 = LIT64( 0x8000000000000000 );
 732     }
 733     roundIncrement = roundMask + 1;
 734     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 735         roundMask |= roundIncrement;
 736     }
 737     zSig0 &= ~ roundMask;
 738     if ( zSig0 == 0 ) zExp = 0;
 739     return packFloatx80( zSign, zExp, zSig0 );
 740  precision80:
 741     increment = ( (int64_t) zSig1 < 0 );
 742     if ( ! roundNearestEven ) {
 743         if ( roundingMode == float_round_to_zero ) {
 744             increment = 0;
 745         }
 746         else {
 747             if ( zSign ) {
 748                 increment = ( roundingMode == float_round_down ) && zSig1;
 749             }
 750             else {
 751                 increment = ( roundingMode == float_round_up ) && zSig1;
 752             }
 753         }
 754     }
 755     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 756         if (    ( 0x7FFE < zExp )
 757              || (    ( zExp == 0x7FFE )
 758                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 759                   && increment
 760                 )
 761            ) {
 762             roundMask = 0;
 763  overflow:
 764             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 765             if (    ( roundingMode == float_round_to_zero )
 766                  || ( zSign && ( roundingMode == float_round_up ) )
 767                  || ( ! zSign && ( roundingMode == float_round_down ) )
 768                ) {
 769                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 770             }
 771             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 772         }
 773         if ( zExp <= 0 ) {
 774             isTiny =
 775                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 776                 || ( zExp < 0 )
 777                 || ! increment
 778                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 779             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 780             zExp = 0;
 781             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 782             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 783             if ( roundNearestEven ) {
 784                 increment = ( (int64_t) zSig1 < 0 );
 785             }
 786             else {
 787                 if ( zSign ) {
 788                     increment = ( roundingMode == float_round_down ) && zSig1;
 789                 }
 790                 else {
 791                     increment = ( roundingMode == float_round_up ) && zSig1;
 792                 }
 793             }
 794             if ( increment ) {
 795                 ++zSig0;
 796                 zSig0 &=
 797                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 798                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
 799             }
 800             return packFloatx80( zSign, zExp, zSig0 );
 801         }
 802     }
 803     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 804     if ( increment ) {
 805         ++zSig0;
 806         if ( zSig0 == 0 ) {
 807             ++zExp;
 808             zSig0 = LIT64( 0x8000000000000000 );
 809         }
 810         else {
 811             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 812         }
 813     }
 814     else {
 815         if ( zSig0 == 0 ) zExp = 0;
 816     }
 817     return packFloatx80( zSign, zExp, zSig0 );
 818
 819 }
 820
 821 /*----------------------------------------------------------------------------
 822 | Takes an abstract floating-point value having sign `zSign', exponent
 823 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 824 | and returns the proper extended double-precision floating-point value
 825 | corresponding to the abstract input.  This routine is just like
 826 | `roundAndPackFloatx80' except that the input significand does not have to be
 827 | normalized.
 828 *----------------------------------------------------------------------------*/
 829
 830 static floatx80
 831  normalizeRoundAndPackFloatx80(
 832      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 833  STATUS_PARAM)
 834 {
 835     int8 shiftCount;
 836
 837     if ( zSig0 == 0 ) {
 838         zSig0 = zSig1;
 839         zSig1 = 0;
 840         zExp -= 64;
 841     }
 842     shiftCount = countLeadingZeros64( zSig0 );
 843     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 844     zExp -= shiftCount;
 845     return
 846         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 847
 848 }
 849
 850 /*----------------------------------------------------------------------------
 851 | Returns the least-significant 64 fraction bits of the quadruple-precision
 852 | floating-point value `a'.
 853 *----------------------------------------------------------------------------*/
 854
 855 INLINE uint64_t extractFloat128Frac1( float128 a )
 856 {
 857
 858     return a.low;
 859
 860 }
 861
 862 /*----------------------------------------------------------------------------
 863 | Returns the most-significant 48 fraction bits of the quadruple-precision
 864 | floating-point value `a'.
 865 *----------------------------------------------------------------------------*/
 866
 867 INLINE uint64_t extractFloat128Frac0( float128 a )
 868 {
 869
 870     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 871
 872 }
 873
 874 /*----------------------------------------------------------------------------
 875 | Returns the exponent bits of the quadruple-precision floating-point value
 876 | `a'.
 877 *----------------------------------------------------------------------------*/
 878
 879 INLINE int32 extractFloat128Exp( float128 a )
 880 {
 881
 882     return ( a.high>>48 ) & 0x7FFF;
 883
 884 }
 885
 886 /*----------------------------------------------------------------------------
 887 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 888 *----------------------------------------------------------------------------*/
 889
 890 INLINE flag extractFloat128Sign( float128 a )
 891 {
 892
 893     return a.high>>63;
 894
 895 }
 896
 897 /*----------------------------------------------------------------------------
 898 | Normalizes the subnormal quadruple-precision floating-point value
 899 | represented by the denormalized significand formed by the concatenation of
 900 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 901 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 902 | significand are stored at the location pointed to by `zSig0Ptr', and the
 903 | least significant 64 bits of the normalized significand are stored at the
 904 | location pointed to by `zSig1Ptr'.
 905 *----------------------------------------------------------------------------*/
 906
 907 static void
 908  normalizeFloat128Subnormal(
 909      uint64_t aSig0,
 910      uint64_t aSig1,
 911      int32 *zExpPtr,
 912      uint64_t *zSig0Ptr,
 913      uint64_t *zSig1Ptr
 914  )
 915 {
 916     int8 shiftCount;
 917
 918     if ( aSig0 == 0 ) {
 919         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 920         if ( shiftCount < 0 ) {
 921             *zSig0Ptr = aSig1>>( - shiftCount );
 922             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 923         }
 924         else {
 925             *zSig0Ptr = aSig1<<shiftCount;
 926             *zSig1Ptr = 0;
 927         }
 928         *zExpPtr = - shiftCount - 63;
 929     }
 930     else {
 931         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 932         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 933         *zExpPtr = 1 - shiftCount;
 934     }
 935
 936 }
 937
 938 /*----------------------------------------------------------------------------
 939 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 940 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 941 | floating-point value, returning the result.  After being shifted into the
 942 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 943 | added together to form the most significant 32 bits of the result.  This
 944 | means that any integer portion of `zSig0' will be added into the exponent.
 945 | Since a properly normalized significand will have an integer portion equal
 946 | to 1, the `zExp' input should be 1 less than the desired result exponent
 947 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 948 | significand.
 949 *----------------------------------------------------------------------------*/
 950
 951 INLINE float128
 952  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
 953 {
 954     float128 z;
 955
 956     z.low = zSig1;
 957     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
 958     return z;
 959
 960 }
 961
 962 /*----------------------------------------------------------------------------
 963 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 964 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 965 | and `zSig2', and returns the proper quadruple-precision floating-point value
 966 | corresponding to the abstract input.  Ordinarily, the abstract value is
 967 | simply rounded and packed into the quadruple-precision format, with the
 968 | inexact exception raised if the abstract input cannot be represented
 969 | exactly.  However, if the abstract value is too large, the overflow and
 970 | inexact exceptions are raised and an infinity or maximal finite value is
 971 | returned.  If the abstract value is too small, the input value is rounded to
 972 | a subnormal number, and the underflow and inexact exceptions are raised if
 973 | the abstract input cannot be represented exactly as a subnormal quadruple-
 974 | precision floating-point number.
 975 |     The input significand must be normalized or smaller.  If the input
 976 | significand is not normalized, `zExp' must be 0; in that case, the result
 977 | returned is a subnormal number, and it must not require rounding.  In the
 978 | usual case that the input significand is normalized, `zExp' must be 1 less
 979 | than the ``true'' floating-point exponent.  The handling of underflow and
 980 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 981 *----------------------------------------------------------------------------*/
 982
 983 static float128
 984  roundAndPackFloat128(
 985      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
 986 {
 987     int8 roundingMode;
 988     flag roundNearestEven, increment, isTiny;
 989
 990     roundingMode = STATUS(float_rounding_mode);
 991     roundNearestEven = ( roundingMode == float_round_nearest_even );
 992     increment = ( (int64_t) zSig2 < 0 );
 993     if ( ! roundNearestEven ) {
 994         if ( roundingMode == float_round_to_zero ) {
 995             increment = 0;
 996         }
 997         else {
 998             if ( zSign ) {
 999                 increment = ( roundingMode == float_round_down ) && zSig2;
1000             }
1001             else {
1002                 increment = ( roundingMode == float_round_up ) && zSig2;
1003             }
1004         }
1005     }
1006     if ( 0x7FFD <= (uint32_t) zExp ) {
1007         if (    ( 0x7FFD < zExp )
1008              || (    ( zExp == 0x7FFD )
1009                   && eq128(
1010                          LIT64( 0x0001FFFFFFFFFFFF ),
1011                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1012                          zSig0,
1013                          zSig1
1014                      )
1015                   && increment
1016                 )
1017            ) {
1018             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1019             if (    ( roundingMode == float_round_to_zero )
1020                  || ( zSign && ( roundingMode == float_round_up ) )
1021                  || ( ! zSign && ( roundingMode == float_round_down ) )
1022                ) {
1023                 return
1024                     packFloat128(
1025                         zSign,
1026                         0x7FFE,
1027                         LIT64( 0x0000FFFFFFFFFFFF ),
1028                         LIT64( 0xFFFFFFFFFFFFFFFF )
1029                     );
1030             }
1031             return packFloat128( zSign, 0x7FFF, 0, 0 );
1032         }
1033         if ( zExp < 0 ) {
1034             if (STATUS(flush_to_zero)) {
1035                 float_raise(float_flag_output_denormal STATUS_VAR);
1036                 return packFloat128(zSign, 0, 0, 0);
1037             }
1038             isTiny =
1039                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1040                 || ( zExp < -1 )
1041                 || ! increment
1042                 || lt128(
1043                        zSig0,
1044                        zSig1,
1045                        LIT64( 0x0001FFFFFFFFFFFF ),
1046                        LIT64( 0xFFFFFFFFFFFFFFFF )
1047                    );
1048             shift128ExtraRightJamming(
1049                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1050             zExp = 0;
1051             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1052             if ( roundNearestEven ) {
1053                 increment = ( (int64_t) zSig2 < 0 );
1054             }
1055             else {
1056                 if ( zSign ) {
1057                     increment = ( roundingMode == float_round_down ) && zSig2;
1058                 }
1059                 else {
1060                     increment = ( roundingMode == float_round_up ) && zSig2;
1061                 }
1062             }
1063         }
1064     }
1065     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1066     if ( increment ) {
1067         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1068         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1069     }
1070     else {
1071         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1072     }
1073     return packFloat128( zSign, zExp, zSig0, zSig1 );
1074
1075 }
1076
1077 /*----------------------------------------------------------------------------
1078 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1079 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1080 | returns the proper quadruple-precision floating-point value corresponding
1081 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1082 | except that the input significand has fewer bits and does not have to be
1083 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1084 | point exponent.
1085 *----------------------------------------------------------------------------*/
1086
1087 static float128
1088  normalizeRoundAndPackFloat128(
1089      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1090 {
1091     int8 shiftCount;
1092     uint64_t zSig2;
1093
1094     if ( zSig0 == 0 ) {
1095         zSig0 = zSig1;
1096         zSig1 = 0;
1097         zExp -= 64;
1098     }
1099     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1100     if ( 0 <= shiftCount ) {
1101         zSig2 = 0;
1102         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1103     }
1104     else {
1105         shift128ExtraRightJamming(
1106             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1107     }
1108     zExp -= shiftCount;
1109     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1110
1111 }
1112
1113 /*----------------------------------------------------------------------------
1114 | Returns the result of converting the 32-bit two's complement integer `a'
1115 | to the single-precision floating-point format.  The conversion is performed
1116 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1117 *----------------------------------------------------------------------------*/
1118
1119 float32 int32_to_float32( int32 a STATUS_PARAM )
1120 {
1121     flag zSign;
1122
1123     if ( a == 0 ) return float32_zero;
1124     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1125     zSign = ( a < 0 );
1126     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1127
1128 }
1129
1130 /*----------------------------------------------------------------------------
1131 | Returns the result of converting the 32-bit two's complement integer `a'
1132 | to the double-precision floating-point format.  The conversion is performed
1133 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1134 *----------------------------------------------------------------------------*/
1135
1136 float64 int32_to_float64( int32 a STATUS_PARAM )
1137 {
1138     flag zSign;
1139     uint32 absA;
1140     int8 shiftCount;
1141     uint64_t zSig;
1142
1143     if ( a == 0 ) return float64_zero;
1144     zSign = ( a < 0 );
1145     absA = zSign ? - a : a;
1146     shiftCount = countLeadingZeros32( absA ) + 21;
1147     zSig = absA;
1148     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1149
1150 }
1151
1152 /*----------------------------------------------------------------------------
1153 | Returns the result of converting the 32-bit two's complement integer `a'
1154 | to the extended double-precision floating-point format.  The conversion
1155 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156 | Arithmetic.
1157 *----------------------------------------------------------------------------*/
1158
1159 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1160 {
1161     flag zSign;
1162     uint32 absA;
1163     int8 shiftCount;
1164     uint64_t zSig;
1165
1166     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167     zSign = ( a < 0 );
1168     absA = zSign ? - a : a;
1169     shiftCount = countLeadingZeros32( absA ) + 32;
1170     zSig = absA;
1171     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1172
1173 }
1174
1175 /*----------------------------------------------------------------------------
1176 | Returns the result of converting the 32-bit two's complement integer `a' to
1177 | the quadruple-precision floating-point format.  The conversion is performed
1178 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1179 *----------------------------------------------------------------------------*/
1180
1181 float128 int32_to_float128( int32 a STATUS_PARAM )
1182 {
1183     flag zSign;
1184     uint32 absA;
1185     int8 shiftCount;
1186     uint64_t zSig0;
1187
1188     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1189     zSign = ( a < 0 );
1190     absA = zSign ? - a : a;
1191     shiftCount = countLeadingZeros32( absA ) + 17;
1192     zSig0 = absA;
1193     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1194
1195 }
1196
1197 /*----------------------------------------------------------------------------
1198 | Returns the result of converting the 64-bit two's complement integer `a'
1199 | to the single-precision floating-point format.  The conversion is performed
1200 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1201 *----------------------------------------------------------------------------*/
1202
1203 float32 int64_to_float32( int64 a STATUS_PARAM )
1204 {
1205     flag zSign;
1206     uint64 absA;
1207     int8 shiftCount;
1208
1209     if ( a == 0 ) return float32_zero;
1210     zSign = ( a < 0 );
1211     absA = zSign ? - a : a;
1212     shiftCount = countLeadingZeros64( absA ) - 40;
1213     if ( 0 <= shiftCount ) {
1214         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1215     }
1216     else {
1217         shiftCount += 7;
1218         if ( shiftCount < 0 ) {
1219             shift64RightJamming( absA, - shiftCount, &absA );
1220         }
1221         else {
1222             absA <<= shiftCount;
1223         }
1224         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1225     }
1226
1227 }
1228
1229 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1230 {
1231     int8 shiftCount;
1232
1233     if ( a == 0 ) return float32_zero;
1234     shiftCount = countLeadingZeros64( a ) - 40;
1235     if ( 0 <= shiftCount ) {
1236         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1237     }
1238     else {
1239         shiftCount += 7;
1240         if ( shiftCount < 0 ) {
1241             shift64RightJamming( a, - shiftCount, &a );
1242         }
1243         else {
1244             a <<= shiftCount;
1245         }
1246         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1247     }
1248 }
1249
1250 /*----------------------------------------------------------------------------
1251 | Returns the result of converting the 64-bit two's complement integer `a'
1252 | to the double-precision floating-point format.  The conversion is performed
1253 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1254 *----------------------------------------------------------------------------*/
1255
1256 float64 int64_to_float64( int64 a STATUS_PARAM )
1257 {
1258     flag zSign;
1259
1260     if ( a == 0 ) return float64_zero;
1261     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1262         return packFloat64( 1, 0x43E, 0 );
1263     }
1264     zSign = ( a < 0 );
1265     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1266
1267 }
1268
1269 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1270 {
1271     if ( a == 0 ) return float64_zero;
1272     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1273
1274 }
1275
1276 /*----------------------------------------------------------------------------
1277 | Returns the result of converting the 64-bit two's complement integer `a'
1278 | to the extended double-precision floating-point format.  The conversion
1279 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1280 | Arithmetic.
1281 *----------------------------------------------------------------------------*/
1282
1283 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1284 {
1285     flag zSign;
1286     uint64 absA;
1287     int8 shiftCount;
1288
1289     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1290     zSign = ( a < 0 );
1291     absA = zSign ? - a : a;
1292     shiftCount = countLeadingZeros64( absA );
1293     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1294
1295 }
1296
1297 /*----------------------------------------------------------------------------
1298 | Returns the result of converting the 64-bit two's complement integer `a' to
1299 | the quadruple-precision floating-point format.  The conversion is performed
1300 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1301 *----------------------------------------------------------------------------*/
1302
1303 float128 int64_to_float128( int64 a STATUS_PARAM )
1304 {
1305     flag zSign;
1306     uint64 absA;
1307     int8 shiftCount;
1308     int32 zExp;
1309     uint64_t zSig0, zSig1;
1310
1311     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1312     zSign = ( a < 0 );
1313     absA = zSign ? - a : a;
1314     shiftCount = countLeadingZeros64( absA ) + 49;
1315     zExp = 0x406E - shiftCount;
1316     if ( 64 <= shiftCount ) {
1317         zSig1 = 0;
1318         zSig0 = absA;
1319         shiftCount -= 64;
1320     }
1321     else {
1322         zSig1 = absA;
1323         zSig0 = 0;
1324     }
1325     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1326     return packFloat128( zSign, zExp, zSig0, zSig1 );
1327
1328 }
1329
1330 /*----------------------------------------------------------------------------
1331 | Returns the result of converting the single-precision floating-point value
1332 | `a' to the 32-bit two's complement integer format.  The conversion is
1333 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1334 | Arithmetic---which means in particular that the conversion is rounded
1335 | according to the current rounding mode.  If `a' is a NaN, the largest
1336 | positive integer is returned.  Otherwise, if the conversion overflows, the
1337 | largest integer with the same sign as `a' is returned.
1338 *----------------------------------------------------------------------------*/
1339
1340 int32 float32_to_int32( float32 a STATUS_PARAM )
1341 {
1342     flag aSign;
1343     int16 aExp, shiftCount;
1344     uint32_t aSig;
1345     uint64_t aSig64;
1346
1347     a = float32_squash_input_denormal(a STATUS_VAR);
1348     aSig = extractFloat32Frac( a );
1349     aExp = extractFloat32Exp( a );
1350     aSign = extractFloat32Sign( a );
1351     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1352     if ( aExp ) aSig |= 0x00800000;
1353     shiftCount = 0xAF - aExp;
1354     aSig64 = aSig;
1355     aSig64 <<= 32;
1356     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1357     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1358
1359 }
1360
1361 /*----------------------------------------------------------------------------
1362 | Returns the result of converting the single-precision floating-point value
1363 | `a' to the 32-bit two's complement integer format.  The conversion is
1364 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1365 | Arithmetic, except that the conversion is always rounded toward zero.
1366 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1367 | the conversion overflows, the largest integer with the same sign as `a' is
1368 | returned.
1369 *----------------------------------------------------------------------------*/
1370
1371 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1372 {
1373     flag aSign;
1374     int16 aExp, shiftCount;
1375     uint32_t aSig;
1376     int32 z;
1377     a = float32_squash_input_denormal(a STATUS_VAR);
1378
1379     aSig = extractFloat32Frac( a );
1380     aExp = extractFloat32Exp( a );
1381     aSign = extractFloat32Sign( a );
1382     shiftCount = aExp - 0x9E;
1383     if ( 0 <= shiftCount ) {
1384         if ( float32_val(a) != 0xCF000000 ) {
1385             float_raise( float_flag_invalid STATUS_VAR);
1386             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1387         }
1388         return (int32_t) 0x80000000;
1389     }
1390     else if ( aExp <= 0x7E ) {
1391         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1392         return 0;
1393     }
1394     aSig = ( aSig | 0x00800000 )<<8;
1395     z = aSig>>( - shiftCount );
1396     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1397         STATUS(float_exception_flags) |= float_flag_inexact;
1398     }
1399     if ( aSign ) z = - z;
1400     return z;
1401
1402 }
1403
1404 /*----------------------------------------------------------------------------
1405 | Returns the result of converting the single-precision floating-point value
1406 | `a' to the 16-bit two's complement integer format.  The conversion is
1407 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1408 | Arithmetic, except that the conversion is always rounded toward zero.
1409 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1410 | the conversion overflows, the largest integer with the same sign as `a' is
1411 | returned.
1412 *----------------------------------------------------------------------------*/
1413
1414 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1415 {
1416     flag aSign;
1417     int16 aExp, shiftCount;
1418     uint32_t aSig;
1419     int32 z;
1420
1421     aSig = extractFloat32Frac( a );
1422     aExp = extractFloat32Exp( a );
1423     aSign = extractFloat32Sign( a );
1424     shiftCount = aExp - 0x8E;
1425     if ( 0 <= shiftCount ) {
1426         if ( float32_val(a) != 0xC7000000 ) {
1427             float_raise( float_flag_invalid STATUS_VAR);
1428             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1429                 return 0x7FFF;
1430             }
1431         }
1432         return (int32_t) 0xffff8000;
1433     }
1434     else if ( aExp <= 0x7E ) {
1435         if ( aExp | aSig ) {
1436             STATUS(float_exception_flags) |= float_flag_inexact;
1437         }
1438         return 0;
1439     }
1440     shiftCount -= 0x10;
1441     aSig = ( aSig | 0x00800000 )<<8;
1442     z = aSig>>( - shiftCount );
1443     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1444         STATUS(float_exception_flags) |= float_flag_inexact;
1445     }
1446     if ( aSign ) {
1447         z = - z;
1448     }
1449     return z;
1450
1451 }
1452
1453 /*----------------------------------------------------------------------------
1454 | Returns the result of converting the single-precision floating-point value
1455 | `a' to the 64-bit two's complement integer format.  The conversion is
1456 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1457 | Arithmetic---which means in particular that the conversion is rounded
1458 | according to the current rounding mode.  If `a' is a NaN, the largest
1459 | positive integer is returned.  Otherwise, if the conversion overflows, the
1460 | largest integer with the same sign as `a' is returned.
1461 *----------------------------------------------------------------------------*/
1462
1463 int64 float32_to_int64( float32 a STATUS_PARAM )
1464 {
1465     flag aSign;
1466     int16 aExp, shiftCount;
1467     uint32_t aSig;
1468     uint64_t aSig64, aSigExtra;
1469     a = float32_squash_input_denormal(a STATUS_VAR);
1470
1471     aSig = extractFloat32Frac( a );
1472     aExp = extractFloat32Exp( a );
1473     aSign = extractFloat32Sign( a );
1474     shiftCount = 0xBE - aExp;
1475     if ( shiftCount < 0 ) {
1476         float_raise( float_flag_invalid STATUS_VAR);
1477         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1478             return LIT64( 0x7FFFFFFFFFFFFFFF );
1479         }
1480         return (int64_t) LIT64( 0x8000000000000000 );
1481     }
1482     if ( aExp ) aSig |= 0x00800000;
1483     aSig64 = aSig;
1484     aSig64 <<= 40;
1485     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1486     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1487
1488 }
1489
1490 /*----------------------------------------------------------------------------
1491 | Returns the result of converting the single-precision floating-point value
1492 | `a' to the 64-bit two's complement integer format.  The conversion is
1493 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1494 | Arithmetic, except that the conversion is always rounded toward zero.  If
1495 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1496 | conversion overflows, the largest integer with the same sign as `a' is
1497 | returned.
1498 *----------------------------------------------------------------------------*/
1499
1500 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1501 {
1502     flag aSign;
1503     int16 aExp, shiftCount;
1504     uint32_t aSig;
1505     uint64_t aSig64;
1506     int64 z;
1507     a = float32_squash_input_denormal(a STATUS_VAR);
1508
1509     aSig = extractFloat32Frac( a );
1510     aExp = extractFloat32Exp( a );
1511     aSign = extractFloat32Sign( a );
1512     shiftCount = aExp - 0xBE;
1513     if ( 0 <= shiftCount ) {
1514         if ( float32_val(a) != 0xDF000000 ) {
1515             float_raise( float_flag_invalid STATUS_VAR);
1516             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1517                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1518             }
1519         }
1520         return (int64_t) LIT64( 0x8000000000000000 );
1521     }
1522     else if ( aExp <= 0x7E ) {
1523         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1524         return 0;
1525     }
1526     aSig64 = aSig | 0x00800000;
1527     aSig64 <<= 40;
1528     z = aSig64>>( - shiftCount );
1529     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1530         STATUS(float_exception_flags) |= float_flag_inexact;
1531     }
1532     if ( aSign ) z = - z;
1533     return z;
1534
1535 }
1536
1537 /*----------------------------------------------------------------------------
1538 | Returns the result of converting the single-precision floating-point value
1539 | `a' to the double-precision floating-point format.  The conversion is
1540 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1541 | Arithmetic.
1542 *----------------------------------------------------------------------------*/
1543
1544 float64 float32_to_float64( float32 a STATUS_PARAM )
1545 {
1546     flag aSign;
1547     int16 aExp;
1548     uint32_t aSig;
1549     a = float32_squash_input_denormal(a STATUS_VAR);
1550
1551     aSig = extractFloat32Frac( a );
1552     aExp = extractFloat32Exp( a );
1553     aSign = extractFloat32Sign( a );
1554     if ( aExp == 0xFF ) {
1555         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1556         return packFloat64( aSign, 0x7FF, 0 );
1557     }
1558     if ( aExp == 0 ) {
1559         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1560         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1561         --aExp;
1562     }
1563     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1564
1565 }
1566
1567 /*----------------------------------------------------------------------------
1568 | Returns the result of converting the single-precision floating-point value
1569 | `a' to the extended double-precision floating-point format.  The conversion
1570 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1571 | Arithmetic.
1572 *----------------------------------------------------------------------------*/
1573
1574 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1575 {
1576     flag aSign;
1577     int16 aExp;
1578     uint32_t aSig;
1579
1580     a = float32_squash_input_denormal(a STATUS_VAR);
1581     aSig = extractFloat32Frac( a );
1582     aExp = extractFloat32Exp( a );
1583     aSign = extractFloat32Sign( a );
1584     if ( aExp == 0xFF ) {
1585         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1586         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1587     }
1588     if ( aExp == 0 ) {
1589         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1590         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1591     }
1592     aSig |= 0x00800000;
1593     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1594
1595 }
1596
1597 /*----------------------------------------------------------------------------
1598 | Returns the result of converting the single-precision floating-point value
1599 | `a' to the double-precision floating-point format.  The conversion is
1600 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1601 | Arithmetic.
1602 *----------------------------------------------------------------------------*/
1603
1604 float128 float32_to_float128( float32 a STATUS_PARAM )
1605 {
1606     flag aSign;
1607     int16 aExp;
1608     uint32_t aSig;
1609
1610     a = float32_squash_input_denormal(a STATUS_VAR);
1611     aSig = extractFloat32Frac( a );
1612     aExp = extractFloat32Exp( a );
1613     aSign = extractFloat32Sign( a );
1614     if ( aExp == 0xFF ) {
1615         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1616         return packFloat128( aSign, 0x7FFF, 0, 0 );
1617     }
1618     if ( aExp == 0 ) {
1619         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1620         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1621         --aExp;
1622     }
1623     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1624
1625 }
1626
1627 /*----------------------------------------------------------------------------
1628 | Rounds the single-precision floating-point value `a' to an integer, and
1629 | returns the result as a single-precision floating-point value.  The
1630 | operation is performed according to the IEC/IEEE Standard for Binary
1631 | Floating-Point Arithmetic.
1632 *----------------------------------------------------------------------------*/
1633
1634 float32 float32_round_to_int( float32 a STATUS_PARAM)
1635 {
1636     flag aSign;
1637     int16 aExp;
1638     uint32_t lastBitMask, roundBitsMask;
1639     int8 roundingMode;
1640     uint32_t z;
1641     a = float32_squash_input_denormal(a STATUS_VAR);
1642
1643     aExp = extractFloat32Exp( a );
1644     if ( 0x96 <= aExp ) {
1645         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1646             return propagateFloat32NaN( a, a STATUS_VAR );
1647         }
1648         return a;
1649     }
1650     if ( aExp <= 0x7E ) {
1651         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1652         STATUS(float_exception_flags) |= float_flag_inexact;
1653         aSign = extractFloat32Sign( a );
1654         switch ( STATUS(float_rounding_mode) ) {
1655          case float_round_nearest_even:
1656             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1657                 return packFloat32( aSign, 0x7F, 0 );
1658             }
1659             break;
1660          case float_round_down:
1661             return make_float32(aSign ? 0xBF800000 : 0);
1662          case float_round_up:
1663             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1664         }
1665         return packFloat32( aSign, 0, 0 );
1666     }
1667     lastBitMask = 1;
1668     lastBitMask <<= 0x96 - aExp;
1669     roundBitsMask = lastBitMask - 1;
1670     z = float32_val(a);
1671     roundingMode = STATUS(float_rounding_mode);
1672     if ( roundingMode == float_round_nearest_even ) {
1673         z += lastBitMask>>1;
1674         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1675     }
1676     else if ( roundingMode != float_round_to_zero ) {
1677         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1678             z += roundBitsMask;
1679         }
1680     }
1681     z &= ~ roundBitsMask;
1682     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1683     return make_float32(z);
1684
1685 }
1686
1687 /*----------------------------------------------------------------------------
1688 | Returns the result of adding the absolute values of the single-precision
1689 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1690 | before being returned.  `zSign' is ignored if the result is a NaN.
1691 | The addition is performed according to the IEC/IEEE Standard for Binary
1692 | Floating-Point Arithmetic.
1693 *----------------------------------------------------------------------------*/
1694
1695 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1696 {
1697     int16 aExp, bExp, zExp;
1698     uint32_t aSig, bSig, zSig;
1699     int16 expDiff;
1700
1701     aSig = extractFloat32Frac( a );
1702     aExp = extractFloat32Exp( a );
1703     bSig = extractFloat32Frac( b );
1704     bExp = extractFloat32Exp( b );
1705     expDiff = aExp - bExp;
1706     aSig <<= 6;
1707     bSig <<= 6;
1708     if ( 0 < expDiff ) {
1709         if ( aExp == 0xFF ) {
1710             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1711             return a;
1712         }
1713         if ( bExp == 0 ) {
1714             --expDiff;
1715         }
1716         else {
1717             bSig |= 0x20000000;
1718         }
1719         shift32RightJamming( bSig, expDiff, &bSig );
1720         zExp = aExp;
1721     }
1722     else if ( expDiff < 0 ) {
1723         if ( bExp == 0xFF ) {
1724             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1725             return packFloat32( zSign, 0xFF, 0 );
1726         }
1727         if ( aExp == 0 ) {
1728             ++expDiff;
1729         }
1730         else {
1731             aSig |= 0x20000000;
1732         }
1733         shift32RightJamming( aSig, - expDiff, &aSig );
1734         zExp = bExp;
1735     }
1736     else {
1737         if ( aExp == 0xFF ) {
1738             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1739             return a;
1740         }
1741         if ( aExp == 0 ) {
1742             if (STATUS(flush_to_zero)) {
1743                 if (aSig | bSig) {
1744                     float_raise(float_flag_output_denormal STATUS_VAR);
1745                 }
1746                 return packFloat32(zSign, 0, 0);
1747             }
1748             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1749         }
1750         zSig = 0x40000000 + aSig + bSig;
1751         zExp = aExp;
1752         goto roundAndPack;
1753     }
1754     aSig |= 0x20000000;
1755     zSig = ( aSig + bSig )<<1;
1756     --zExp;
1757     if ( (int32_t) zSig < 0 ) {
1758         zSig = aSig + bSig;
1759         ++zExp;
1760     }
1761  roundAndPack:
1762     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1763
1764 }
1765
1766 /*----------------------------------------------------------------------------
1767 | Returns the result of subtracting the absolute values of the single-
1768 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1769 | difference is negated before being returned.  `zSign' is ignored if the
1770 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1771 | Standard for Binary Floating-Point Arithmetic.
1772 *----------------------------------------------------------------------------*/
1773
1774 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1775 {
1776     int16 aExp, bExp, zExp;
1777     uint32_t aSig, bSig, zSig;
1778     int16 expDiff;
1779
1780     aSig = extractFloat32Frac( a );
1781     aExp = extractFloat32Exp( a );
1782     bSig = extractFloat32Frac( b );
1783     bExp = extractFloat32Exp( b );
1784     expDiff = aExp - bExp;
1785     aSig <<= 7;
1786     bSig <<= 7;
1787     if ( 0 < expDiff ) goto aExpBigger;
1788     if ( expDiff < 0 ) goto bExpBigger;
1789     if ( aExp == 0xFF ) {
1790         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1791         float_raise( float_flag_invalid STATUS_VAR);
1792         return float32_default_nan;
1793     }
1794     if ( aExp == 0 ) {
1795         aExp = 1;
1796         bExp = 1;
1797     }
1798     if ( bSig < aSig ) goto aBigger;
1799     if ( aSig < bSig ) goto bBigger;
1800     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1801  bExpBigger:
1802     if ( bExp == 0xFF ) {
1803         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1804         return packFloat32( zSign ^ 1, 0xFF, 0 );
1805     }
1806     if ( aExp == 0 ) {
1807         ++expDiff;
1808     }
1809     else {
1810         aSig |= 0x40000000;
1811     }
1812     shift32RightJamming( aSig, - expDiff, &aSig );
1813     bSig |= 0x40000000;
1814  bBigger:
1815     zSig = bSig - aSig;
1816     zExp = bExp;
1817     zSign ^= 1;
1818     goto normalizeRoundAndPack;
1819  aExpBigger:
1820     if ( aExp == 0xFF ) {
1821         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1822         return a;
1823     }
1824     if ( bExp == 0 ) {
1825         --expDiff;
1826     }
1827     else {
1828         bSig |= 0x40000000;
1829     }
1830     shift32RightJamming( bSig, expDiff, &bSig );
1831     aSig |= 0x40000000;
1832  aBigger:
1833     zSig = aSig - bSig;
1834     zExp = aExp;
1835  normalizeRoundAndPack:
1836     --zExp;
1837     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1838
1839 }
1840
1841 /*----------------------------------------------------------------------------
1842 | Returns the result of adding the single-precision floating-point values `a'
1843 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1844 | Binary Floating-Point Arithmetic.
1845 *----------------------------------------------------------------------------*/
1846
1847 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1848 {
1849     flag aSign, bSign;
1850     a = float32_squash_input_denormal(a STATUS_VAR);
1851     b = float32_squash_input_denormal(b STATUS_VAR);
1852
1853     aSign = extractFloat32Sign( a );
1854     bSign = extractFloat32Sign( b );
1855     if ( aSign == bSign ) {
1856         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1857     }
1858     else {
1859         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1860     }
1861
1862 }
1863
1864 /*----------------------------------------------------------------------------
1865 | Returns the result of subtracting the single-precision floating-point values
1866 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1867 | for Binary Floating-Point Arithmetic.
1868 *----------------------------------------------------------------------------*/
1869
1870 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1871 {
1872     flag aSign, bSign;
1873     a = float32_squash_input_denormal(a STATUS_VAR);
1874     b = float32_squash_input_denormal(b STATUS_VAR);
1875
1876     aSign = extractFloat32Sign( a );
1877     bSign = extractFloat32Sign( b );
1878     if ( aSign == bSign ) {
1879         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1880     }
1881     else {
1882         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1883     }
1884
1885 }
1886
1887 /*----------------------------------------------------------------------------
1888 | Returns the result of multiplying the single-precision floating-point values
1889 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1890 | for Binary Floating-Point Arithmetic.
1891 *----------------------------------------------------------------------------*/
1892
1893 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1894 {
1895     flag aSign, bSign, zSign;
1896     int16 aExp, bExp, zExp;
1897     uint32_t aSig, bSig;
1898     uint64_t zSig64;
1899     uint32_t zSig;
1900
1901     a = float32_squash_input_denormal(a STATUS_VAR);
1902     b = float32_squash_input_denormal(b STATUS_VAR);
1903
1904     aSig = extractFloat32Frac( a );
1905     aExp = extractFloat32Exp( a );
1906     aSign = extractFloat32Sign( a );
1907     bSig = extractFloat32Frac( b );
1908     bExp = extractFloat32Exp( b );
1909     bSign = extractFloat32Sign( b );
1910     zSign = aSign ^ bSign;
1911     if ( aExp == 0xFF ) {
1912         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1913             return propagateFloat32NaN( a, b STATUS_VAR );
1914         }
1915         if ( ( bExp | bSig ) == 0 ) {
1916             float_raise( float_flag_invalid STATUS_VAR);
1917             return float32_default_nan;
1918         }
1919         return packFloat32( zSign, 0xFF, 0 );
1920     }
1921     if ( bExp == 0xFF ) {
1922         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1923         if ( ( aExp | aSig ) == 0 ) {
1924             float_raise( float_flag_invalid STATUS_VAR);
1925             return float32_default_nan;
1926         }
1927         return packFloat32( zSign, 0xFF, 0 );
1928     }
1929     if ( aExp == 0 ) {
1930         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1931         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1932     }
1933     if ( bExp == 0 ) {
1934         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1935         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1936     }
1937     zExp = aExp + bExp - 0x7F;
1938     aSig = ( aSig | 0x00800000 )<<7;
1939     bSig = ( bSig | 0x00800000 )<<8;
1940     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1941     zSig = zSig64;
1942     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1943         zSig <<= 1;
1944         --zExp;
1945     }
1946     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1947
1948 }
1949
1950 /*----------------------------------------------------------------------------
1951 | Returns the result of dividing the single-precision floating-point value `a'
1952 | by the corresponding value `b'.  The operation is performed according to the
1953 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1954 *----------------------------------------------------------------------------*/
1955
1956 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1957 {
1958     flag aSign, bSign, zSign;
1959     int16 aExp, bExp, zExp;
1960     uint32_t aSig, bSig, zSig;
1961     a = float32_squash_input_denormal(a STATUS_VAR);
1962     b = float32_squash_input_denormal(b STATUS_VAR);
1963
1964     aSig = extractFloat32Frac( a );
1965     aExp = extractFloat32Exp( a );
1966     aSign = extractFloat32Sign( a );
1967     bSig = extractFloat32Frac( b );
1968     bExp = extractFloat32Exp( b );
1969     bSign = extractFloat32Sign( b );
1970     zSign = aSign ^ bSign;
1971     if ( aExp == 0xFF ) {
1972         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1973         if ( bExp == 0xFF ) {
1974             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1975             float_raise( float_flag_invalid STATUS_VAR);
1976             return float32_default_nan;
1977         }
1978         return packFloat32( zSign, 0xFF, 0 );
1979     }
1980     if ( bExp == 0xFF ) {
1981         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1982         return packFloat32( zSign, 0, 0 );
1983     }
1984     if ( bExp == 0 ) {
1985         if ( bSig == 0 ) {
1986             if ( ( aExp | aSig ) == 0 ) {
1987                 float_raise( float_flag_invalid STATUS_VAR);
1988                 return float32_default_nan;
1989             }
1990             float_raise( float_flag_divbyzero STATUS_VAR);
1991             return packFloat32( zSign, 0xFF, 0 );
1992         }
1993         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1994     }
1995     if ( aExp == 0 ) {
1996         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1997         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1998     }
1999     zExp = aExp - bExp + 0x7D;
2000     aSig = ( aSig | 0x00800000 )<<7;
2001     bSig = ( bSig | 0x00800000 )<<8;
2002     if ( bSig <= ( aSig + aSig ) ) {
2003         aSig >>= 1;
2004         ++zExp;
2005     }
2006     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2007     if ( ( zSig & 0x3F ) == 0 ) {
2008         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2009     }
2010     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2011
2012 }
2013
2014 /*----------------------------------------------------------------------------
2015 | Returns the remainder of the single-precision floating-point value `a'
2016 | with respect to the corresponding value `b'.  The operation is performed
2017 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2018 *----------------------------------------------------------------------------*/
2019
2020 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2021 {
2022     flag aSign, zSign;
2023     int16 aExp, bExp, expDiff;
2024     uint32_t aSig, bSig;
2025     uint32_t q;
2026     uint64_t aSig64, bSig64, q64;
2027     uint32_t alternateASig;
2028     int32_t sigMean;
2029     a = float32_squash_input_denormal(a STATUS_VAR);
2030     b = float32_squash_input_denormal(b STATUS_VAR);
2031
2032     aSig = extractFloat32Frac( a );
2033     aExp = extractFloat32Exp( a );
2034     aSign = extractFloat32Sign( a );
2035     bSig = extractFloat32Frac( b );
2036     bExp = extractFloat32Exp( b );
2037     if ( aExp == 0xFF ) {
2038         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2039             return propagateFloat32NaN( a, b STATUS_VAR );
2040         }
2041         float_raise( float_flag_invalid STATUS_VAR);
2042         return float32_default_nan;
2043     }
2044     if ( bExp == 0xFF ) {
2045         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2046         return a;
2047     }
2048     if ( bExp == 0 ) {
2049         if ( bSig == 0 ) {
2050             float_raise( float_flag_invalid STATUS_VAR);
2051             return float32_default_nan;
2052         }
2053         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2054     }
2055     if ( aExp == 0 ) {
2056         if ( aSig == 0 ) return a;
2057         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2058     }
2059     expDiff = aExp - bExp;
2060     aSig |= 0x00800000;
2061     bSig |= 0x00800000;
2062     if ( expDiff < 32 ) {
2063         aSig <<= 8;
2064         bSig <<= 8;
2065         if ( expDiff < 0 ) {
2066             if ( expDiff < -1 ) return a;
2067             aSig >>= 1;
2068         }
2069         q = ( bSig <= aSig );
2070         if ( q ) aSig -= bSig;
2071         if ( 0 < expDiff ) {
2072             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2073             q >>= 32 - expDiff;
2074             bSig >>= 2;
2075             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2076         }
2077         else {
2078             aSig >>= 2;
2079             bSig >>= 2;
2080         }
2081     }
2082     else {
2083         if ( bSig <= aSig ) aSig -= bSig;
2084         aSig64 = ( (uint64_t) aSig )<<40;
2085         bSig64 = ( (uint64_t) bSig )<<40;
2086         expDiff -= 64;
2087         while ( 0 < expDiff ) {
2088             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2089             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2090             aSig64 = - ( ( bSig * q64 )<<38 );
2091             expDiff -= 62;
2092         }
2093         expDiff += 64;
2094         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2095         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2096         q = q64>>( 64 - expDiff );
2097         bSig <<= 6;
2098         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2099     }
2100     do {
2101         alternateASig = aSig;
2102         ++q;
2103         aSig -= bSig;
2104     } while ( 0 <= (int32_t) aSig );
2105     sigMean = aSig + alternateASig;
2106     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2107         aSig = alternateASig;
2108     }
2109     zSign = ( (int32_t) aSig < 0 );
2110     if ( zSign ) aSig = - aSig;
2111     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2112
2113 }
2114
2115 /*----------------------------------------------------------------------------
2116 | Returns the square root of the single-precision floating-point value `a'.
2117 | The operation is performed according to the IEC/IEEE Standard for Binary
2118 | Floating-Point Arithmetic.
2119 *----------------------------------------------------------------------------*/
2120
2121 float32 float32_sqrt( float32 a STATUS_PARAM )
2122 {
2123     flag aSign;
2124     int16 aExp, zExp;
2125     uint32_t aSig, zSig;
2126     uint64_t rem, term;
2127     a = float32_squash_input_denormal(a STATUS_VAR);
2128
2129     aSig = extractFloat32Frac( a );
2130     aExp = extractFloat32Exp( a );
2131     aSign = extractFloat32Sign( a );
2132     if ( aExp == 0xFF ) {
2133         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2134         if ( ! aSign ) return a;
2135         float_raise( float_flag_invalid STATUS_VAR);
2136         return float32_default_nan;
2137     }
2138     if ( aSign ) {
2139         if ( ( aExp | aSig ) == 0 ) return a;
2140         float_raise( float_flag_invalid STATUS_VAR);
2141         return float32_default_nan;
2142     }
2143     if ( aExp == 0 ) {
2144         if ( aSig == 0 ) return float32_zero;
2145         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2146     }
2147     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2148     aSig = ( aSig | 0x00800000 )<<8;
2149     zSig = estimateSqrt32( aExp, aSig ) + 2;
2150     if ( ( zSig & 0x7F ) <= 5 ) {
2151         if ( zSig < 2 ) {
2152             zSig = 0x7FFFFFFF;
2153             goto roundAndPack;
2154         }
2155         aSig >>= aExp & 1;
2156         term = ( (uint64_t) zSig ) * zSig;
2157         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2158         while ( (int64_t) rem < 0 ) {
2159             --zSig;
2160             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2161         }
2162         zSig |= ( rem != 0 );
2163     }
2164     shift32RightJamming( zSig, 1, &zSig );
2165  roundAndPack:
2166     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2167
2168 }
2169
2170 /*----------------------------------------------------------------------------
2171 | Returns the binary exponential of the single-precision floating-point value
2172 | `a'. The operation is performed according to the IEC/IEEE Standard for
2173 | Binary Floating-Point Arithmetic.
2174 |
2175 | Uses the following identities:
2176 |
2177 | 1. -------------------------------------------------------------------------
2178 |      x    x*ln(2)
2179 |     2  = e
2180 |
2181 | 2. -------------------------------------------------------------------------
2182 |                      2     3     4     5           n
2183 |      x        x     x     x     x     x           x
2184 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2185 |               1!    2!    3!    4!    5!          n!
2186 *----------------------------------------------------------------------------*/
2187
2188 static const float64 float32_exp2_coefficients[15] =
2189 {
2190     const_float64( 0x3ff0000000000000ll ), /*  1 */
2191     const_float64( 0x3fe0000000000000ll ), /*  2 */
2192     const_float64( 0x3fc5555555555555ll ), /*  3 */
2193     const_float64( 0x3fa5555555555555ll ), /*  4 */
2194     const_float64( 0x3f81111111111111ll ), /*  5 */
2195     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2196     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2197     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2198     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2199     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2200     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2201     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2202     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2203     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2204     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2205 };
2206
2207 float32 float32_exp2( float32 a STATUS_PARAM )
2208 {
2209     flag aSign;
2210     int16 aExp;
2211     uint32_t aSig;
2212     float64 r, x, xn;
2213     int i;
2214     a = float32_squash_input_denormal(a STATUS_VAR);
2215
2216     aSig = extractFloat32Frac( a );
2217     aExp = extractFloat32Exp( a );
2218     aSign = extractFloat32Sign( a );
2219
2220     if ( aExp == 0xFF) {
2221         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2222         return (aSign) ? float32_zero : a;
2223     }
2224     if (aExp == 0) {
2225         if (aSig == 0) return float32_one;
2226     }
2227
2228     float_raise( float_flag_inexact STATUS_VAR);
2229
2230     /* ******************************* */
2231     /* using float64 for approximation */
2232     /* ******************************* */
2233     x = float32_to_float64(a STATUS_VAR);
2234     x = float64_mul(x, float64_ln2 STATUS_VAR);
2235
2236     xn = x;
2237     r = float64_one;
2238     for (i = 0 ; i < 15 ; i++) {
2239         float64 f;
2240
2241         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2242         r = float64_add(r, f STATUS_VAR);
2243
2244         xn = float64_mul(xn, x STATUS_VAR);
2245     }
2246
2247     return float64_to_float32(r, status);
2248 }
2249
2250 /*----------------------------------------------------------------------------
2251 | Returns the binary log of the single-precision floating-point value `a'.
2252 | The operation is performed according to the IEC/IEEE Standard for Binary
2253 | Floating-Point Arithmetic.
2254 *----------------------------------------------------------------------------*/
2255 float32 float32_log2( float32 a STATUS_PARAM )
2256 {
2257     flag aSign, zSign;
2258     int16 aExp;
2259     uint32_t aSig, zSig, i;
2260
2261     a = float32_squash_input_denormal(a STATUS_VAR);
2262     aSig = extractFloat32Frac( a );
2263     aExp = extractFloat32Exp( a );
2264     aSign = extractFloat32Sign( a );
2265
2266     if ( aExp == 0 ) {
2267         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2268         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2269     }
2270     if ( aSign ) {
2271         float_raise( float_flag_invalid STATUS_VAR);
2272         return float32_default_nan;
2273     }
2274     if ( aExp == 0xFF ) {
2275         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2276         return a;
2277     }
2278
2279     aExp -= 0x7F;
2280     aSig |= 0x00800000;
2281     zSign = aExp < 0;
2282     zSig = aExp << 23;
2283
2284     for (i = 1 << 22; i > 0; i >>= 1) {
2285         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2286         if ( aSig & 0x01000000 ) {
2287             aSig >>= 1;
2288             zSig |= i;
2289         }
2290     }
2291
2292     if ( zSign )
2293         zSig = -zSig;
2294
2295     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2296 }
2297
2298 /*----------------------------------------------------------------------------
2299 | Returns 1 if the single-precision floating-point value `a' is equal to
2300 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2301 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2302 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2303 *----------------------------------------------------------------------------*/
2304
2305 int float32_eq( float32 a, float32 b STATUS_PARAM )
2306 {
2307     uint32_t av, bv;
2308     a = float32_squash_input_denormal(a STATUS_VAR);
2309     b = float32_squash_input_denormal(b STATUS_VAR);
2310
2311     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2312          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2313        ) {
2314         float_raise( float_flag_invalid STATUS_VAR);
2315         return 0;
2316     }
2317     av = float32_val(a);
2318     bv = float32_val(b);
2319     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2320 }
2321
2322 /*----------------------------------------------------------------------------
2323 | Returns 1 if the single-precision floating-point value `a' is less than
2324 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2325 | exception is raised if either operand is a NaN.  The comparison is performed
2326 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2327 *----------------------------------------------------------------------------*/
2328
2329 int float32_le( float32 a, float32 b STATUS_PARAM )
2330 {
2331     flag aSign, bSign;
2332     uint32_t av, bv;
2333     a = float32_squash_input_denormal(a STATUS_VAR);
2334     b = float32_squash_input_denormal(b STATUS_VAR);
2335
2336     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2337          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2338        ) {
2339         float_raise( float_flag_invalid STATUS_VAR);
2340         return 0;
2341     }
2342     aSign = extractFloat32Sign( a );
2343     bSign = extractFloat32Sign( b );
2344     av = float32_val(a);
2345     bv = float32_val(b);
2346     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2347     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2348
2349 }
2350
2351 /*----------------------------------------------------------------------------
2352 | Returns 1 if the single-precision floating-point value `a' is less than
2353 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2354 | raised if either operand is a NaN.  The comparison is performed according
2355 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2356 *----------------------------------------------------------------------------*/
2357
2358 int float32_lt( float32 a, float32 b STATUS_PARAM )
2359 {
2360     flag aSign, bSign;
2361     uint32_t av, bv;
2362     a = float32_squash_input_denormal(a STATUS_VAR);
2363     b = float32_squash_input_denormal(b STATUS_VAR);
2364
2365     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2366          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2367        ) {
2368         float_raise( float_flag_invalid STATUS_VAR);
2369         return 0;
2370     }
2371     aSign = extractFloat32Sign( a );
2372     bSign = extractFloat32Sign( b );
2373     av = float32_val(a);
2374     bv = float32_val(b);
2375     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2376     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2377
2378 }
2379
2380 /*----------------------------------------------------------------------------
2381 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2382 | be compared, and 0 otherwise.  The invalid exception is raised if either
2383 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2384 | Standard for Binary Floating-Point Arithmetic.
2385 *----------------------------------------------------------------------------*/
2386
2387 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2388 {
2389     a = float32_squash_input_denormal(a STATUS_VAR);
2390     b = float32_squash_input_denormal(b STATUS_VAR);
2391
2392     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2393          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2394        ) {
2395         float_raise( float_flag_invalid STATUS_VAR);
2396         return 1;
2397     }
2398     return 0;
2399 }
2400
2401 /*----------------------------------------------------------------------------
2402 | Returns 1 if the single-precision floating-point value `a' is equal to
2403 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2404 | exception.  The comparison is performed according to the IEC/IEEE Standard
2405 | for Binary Floating-Point Arithmetic.
2406 *----------------------------------------------------------------------------*/
2407
2408 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2409 {
2410     a = float32_squash_input_denormal(a STATUS_VAR);
2411     b = float32_squash_input_denormal(b STATUS_VAR);
2412
2413     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2414          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2415        ) {
2416         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2417             float_raise( float_flag_invalid STATUS_VAR);
2418         }
2419         return 0;
2420     }
2421     return ( float32_val(a) == float32_val(b) ) ||
2422             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2423 }
2424
2425 /*----------------------------------------------------------------------------
2426 | Returns 1 if the single-precision floating-point value `a' is less than or
2427 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2428 | cause an exception.  Otherwise, the comparison is performed according to the
2429 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2430 *----------------------------------------------------------------------------*/
2431
2432 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2433 {
2434     flag aSign, bSign;
2435     uint32_t av, bv;
2436     a = float32_squash_input_denormal(a STATUS_VAR);
2437     b = float32_squash_input_denormal(b STATUS_VAR);
2438
2439     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2440          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2441        ) {
2442         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2443             float_raise( float_flag_invalid STATUS_VAR);
2444         }
2445         return 0;
2446     }
2447     aSign = extractFloat32Sign( a );
2448     bSign = extractFloat32Sign( b );
2449     av = float32_val(a);
2450     bv = float32_val(b);
2451     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2452     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2453
2454 }
2455
2456 /*----------------------------------------------------------------------------
2457 | Returns 1 if the single-precision floating-point value `a' is less than
2458 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2459 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2460 | Standard for Binary Floating-Point Arithmetic.
2461 *----------------------------------------------------------------------------*/
2462
2463 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2464 {
2465     flag aSign, bSign;
2466     uint32_t av, bv;
2467     a = float32_squash_input_denormal(a STATUS_VAR);
2468     b = float32_squash_input_denormal(b STATUS_VAR);
2469
2470     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2471          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2472        ) {
2473         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2474             float_raise( float_flag_invalid STATUS_VAR);
2475         }
2476         return 0;
2477     }
2478     aSign = extractFloat32Sign( a );
2479     bSign = extractFloat32Sign( b );
2480     av = float32_val(a);
2481     bv = float32_val(b);
2482     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2483     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2484
2485 }
2486
2487 /*----------------------------------------------------------------------------
2488 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2489 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2490 | comparison is performed according to the IEC/IEEE Standard for Binary
2491 | Floating-Point Arithmetic.
2492 *----------------------------------------------------------------------------*/
2493
2494 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2495 {
2496     a = float32_squash_input_denormal(a STATUS_VAR);
2497     b = float32_squash_input_denormal(b STATUS_VAR);
2498
2499     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2500          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2501        ) {
2502         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2503             float_raise( float_flag_invalid STATUS_VAR);
2504         }
2505         return 1;
2506     }
2507     return 0;
2508 }
2509
2510 /*----------------------------------------------------------------------------
2511 | Returns the result of converting the double-precision floating-point value
2512 | `a' to the 32-bit two's complement integer format.  The conversion is
2513 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2514 | Arithmetic---which means in particular that the conversion is rounded
2515 | according to the current rounding mode.  If `a' is a NaN, the largest
2516 | positive integer is returned.  Otherwise, if the conversion overflows, the
2517 | largest integer with the same sign as `a' is returned.
2518 *----------------------------------------------------------------------------*/
2519
2520 int32 float64_to_int32( float64 a STATUS_PARAM )
2521 {
2522     flag aSign;
2523     int16 aExp, shiftCount;
2524     uint64_t aSig;
2525     a = float64_squash_input_denormal(a STATUS_VAR);
2526
2527     aSig = extractFloat64Frac( a );
2528     aExp = extractFloat64Exp( a );
2529     aSign = extractFloat64Sign( a );
2530     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2531     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2532     shiftCount = 0x42C - aExp;
2533     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2534     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2535
2536 }
2537
2538 /*----------------------------------------------------------------------------
2539 | Returns the result of converting the double-precision floating-point value
2540 | `a' to the 32-bit two's complement integer format.  The conversion is
2541 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2542 | Arithmetic, except that the conversion is always rounded toward zero.
2543 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2544 | the conversion overflows, the largest integer with the same sign as `a' is
2545 | returned.
2546 *----------------------------------------------------------------------------*/
2547
2548 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2549 {
2550     flag aSign;
2551     int16 aExp, shiftCount;
2552     uint64_t aSig, savedASig;
2553     int32 z;
2554     a = float64_squash_input_denormal(a STATUS_VAR);
2555
2556     aSig = extractFloat64Frac( a );
2557     aExp = extractFloat64Exp( a );
2558     aSign = extractFloat64Sign( a );
2559     if ( 0x41E < aExp ) {
2560         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2561         goto invalid;
2562     }
2563     else if ( aExp < 0x3FF ) {
2564         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2565         return 0;
2566     }
2567     aSig |= LIT64( 0x0010000000000000 );
2568     shiftCount = 0x433 - aExp;
2569     savedASig = aSig;
2570     aSig >>= shiftCount;
2571     z = aSig;
2572     if ( aSign ) z = - z;
2573     if ( ( z < 0 ) ^ aSign ) {
2574  invalid:
2575         float_raise( float_flag_invalid STATUS_VAR);
2576         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2577     }
2578     if ( ( aSig<<shiftCount ) != savedASig ) {
2579         STATUS(float_exception_flags) |= float_flag_inexact;
2580     }
2581     return z;
2582
2583 }
2584
2585 /*----------------------------------------------------------------------------
2586 | Returns the result of converting the double-precision floating-point value
2587 | `a' to the 16-bit two's complement integer format.  The conversion is
2588 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2589 | Arithmetic, except that the conversion is always rounded toward zero.
2590 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2591 | the conversion overflows, the largest integer with the same sign as `a' is
2592 | returned.
2593 *----------------------------------------------------------------------------*/
2594
2595 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2596 {
2597     flag aSign;
2598     int16 aExp, shiftCount;
2599     uint64_t aSig, savedASig;
2600     int32 z;
2601
2602     aSig = extractFloat64Frac( a );
2603     aExp = extractFloat64Exp( a );
2604     aSign = extractFloat64Sign( a );
2605     if ( 0x40E < aExp ) {
2606         if ( ( aExp == 0x7FF ) && aSig ) {
2607             aSign = 0;
2608         }
2609         goto invalid;
2610     }
2611     else if ( aExp < 0x3FF ) {
2612         if ( aExp || aSig ) {
2613             STATUS(float_exception_flags) |= float_flag_inexact;
2614         }
2615         return 0;
2616     }
2617     aSig |= LIT64( 0x0010000000000000 );
2618     shiftCount = 0x433 - aExp;
2619     savedASig = aSig;
2620     aSig >>= shiftCount;
2621     z = aSig;
2622     if ( aSign ) {
2623         z = - z;
2624     }
2625     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2626  invalid:
2627         float_raise( float_flag_invalid STATUS_VAR);
2628         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2629     }
2630     if ( ( aSig<<shiftCount ) != savedASig ) {
2631         STATUS(float_exception_flags) |= float_flag_inexact;
2632     }
2633     return z;
2634 }
2635
2636 /*----------------------------------------------------------------------------
2637 | Returns the result of converting the double-precision floating-point value
2638 | `a' to the 64-bit two's complement integer format.  The conversion is
2639 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2640 | Arithmetic---which means in particular that the conversion is rounded
2641 | according to the current rounding mode.  If `a' is a NaN, the largest
2642 | positive integer is returned.  Otherwise, if the conversion overflows, the
2643 | largest integer with the same sign as `a' is returned.
2644 *----------------------------------------------------------------------------*/
2645
2646 int64 float64_to_int64( float64 a STATUS_PARAM )
2647 {
2648     flag aSign;
2649     int16 aExp, shiftCount;
2650     uint64_t aSig, aSigExtra;
2651     a = float64_squash_input_denormal(a STATUS_VAR);
2652
2653     aSig = extractFloat64Frac( a );
2654     aExp = extractFloat64Exp( a );
2655     aSign = extractFloat64Sign( a );
2656     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2657     shiftCount = 0x433 - aExp;
2658     if ( shiftCount <= 0 ) {
2659         if ( 0x43E < aExp ) {
2660             float_raise( float_flag_invalid STATUS_VAR);
2661             if (    ! aSign
2662                  || (    ( aExp == 0x7FF )
2663                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2664                ) {
2665                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2666             }
2667             return (int64_t) LIT64( 0x8000000000000000 );
2668         }
2669         aSigExtra = 0;
2670         aSig <<= - shiftCount;
2671     }
2672     else {
2673         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2674     }
2675     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2676
2677 }
2678
2679 /*----------------------------------------------------------------------------
2680 | Returns the result of converting the double-precision floating-point value
2681 | `a' to the 64-bit two's complement integer format.  The conversion is
2682 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2683 | Arithmetic, except that the conversion is always rounded toward zero.
2684 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2685 | the conversion overflows, the largest integer with the same sign as `a' is
2686 | returned.
2687 *----------------------------------------------------------------------------*/
2688
2689 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2690 {
2691     flag aSign;
2692     int16 aExp, shiftCount;
2693     uint64_t aSig;
2694     int64 z;
2695     a = float64_squash_input_denormal(a STATUS_VAR);
2696
2697     aSig = extractFloat64Frac( a );
2698     aExp = extractFloat64Exp( a );
2699     aSign = extractFloat64Sign( a );
2700     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2701     shiftCount = aExp - 0x433;
2702     if ( 0 <= shiftCount ) {
2703         if ( 0x43E <= aExp ) {
2704             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2705                 float_raise( float_flag_invalid STATUS_VAR);
2706                 if (    ! aSign
2707                      || (    ( aExp == 0x7FF )
2708                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2709                    ) {
2710                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2711                 }
2712             }
2713             return (int64_t) LIT64( 0x8000000000000000 );
2714         }
2715         z = aSig<<shiftCount;
2716     }
2717     else {
2718         if ( aExp < 0x3FE ) {
2719             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2720             return 0;
2721         }
2722         z = aSig>>( - shiftCount );
2723         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2724             STATUS(float_exception_flags) |= float_flag_inexact;
2725         }
2726     }
2727     if ( aSign ) z = - z;
2728     return z;
2729
2730 }
2731
2732 /*----------------------------------------------------------------------------
2733 | Returns the result of converting the double-precision floating-point value
2734 | `a' to the single-precision floating-point format.  The conversion is
2735 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2736 | Arithmetic.
2737 *----------------------------------------------------------------------------*/
2738
2739 float32 float64_to_float32( float64 a STATUS_PARAM )
2740 {
2741     flag aSign;
2742     int16 aExp;
2743     uint64_t aSig;
2744     uint32_t zSig;
2745     a = float64_squash_input_denormal(a STATUS_VAR);
2746
2747     aSig = extractFloat64Frac( a );
2748     aExp = extractFloat64Exp( a );
2749     aSign = extractFloat64Sign( a );
2750     if ( aExp == 0x7FF ) {
2751         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2752         return packFloat32( aSign, 0xFF, 0 );
2753     }
2754     shift64RightJamming( aSig, 22, &aSig );
2755     zSig = aSig;
2756     if ( aExp || zSig ) {
2757         zSig |= 0x40000000;
2758         aExp -= 0x381;
2759     }
2760     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2761
2762 }
2763
2764
2765 /*----------------------------------------------------------------------------
2766 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2767 | half-precision floating-point value, returning the result.  After being
2768 | shifted into the proper positions, the three fields are simply added
2769 | together to form the result.  This means that any integer portion of `zSig'
2770 | will be added into the exponent.  Since a properly normalized significand
2771 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2772 | than the desired result exponent whenever `zSig' is a complete, normalized
2773 | significand.
2774 *----------------------------------------------------------------------------*/
2775 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2776 {
2777     return make_float16(
2778         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2779 }
2780
2781 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2782    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2783
2784 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2785 {
2786     flag aSign;
2787     int16 aExp;
2788     uint32_t aSig;
2789
2790     aSign = extractFloat16Sign(a);
2791     aExp = extractFloat16Exp(a);
2792     aSig = extractFloat16Frac(a);
2793
2794     if (aExp == 0x1f && ieee) {
2795         if (aSig) {
2796             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2797         }
2798         return packFloat32(aSign, 0xff, aSig << 13);
2799     }
2800     if (aExp == 0) {
2801         int8 shiftCount;
2802
2803         if (aSig == 0) {
2804             return packFloat32(aSign, 0, 0);
2805         }
2806
2807         shiftCount = countLeadingZeros32( aSig ) - 21;
2808         aSig = aSig << shiftCount;
2809         aExp = -shiftCount;
2810     }
2811     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2812 }
2813
2814 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2815 {
2816     flag aSign;
2817     int16 aExp;
2818     uint32_t aSig;
2819     uint32_t mask;
2820     uint32_t increment;
2821     int8 roundingMode;
2822     a = float32_squash_input_denormal(a STATUS_VAR);
2823
2824     aSig = extractFloat32Frac( a );
2825     aExp = extractFloat32Exp( a );
2826     aSign = extractFloat32Sign( a );
2827     if ( aExp == 0xFF ) {
2828         if (aSig) {
2829             /* Input is a NaN */
2830             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2831             if (!ieee) {
2832                 return packFloat16(aSign, 0, 0);
2833             }
2834             return r;
2835         }
2836         /* Infinity */
2837         if (!ieee) {
2838             float_raise(float_flag_invalid STATUS_VAR);
2839             return packFloat16(aSign, 0x1f, 0x3ff);
2840         }
2841         return packFloat16(aSign, 0x1f, 0);
2842     }
2843     if (aExp == 0 && aSig == 0) {
2844         return packFloat16(aSign, 0, 0);
2845     }
2846     /* Decimal point between bits 22 and 23.  */
2847     aSig |= 0x00800000;
2848     aExp -= 0x7f;
2849     if (aExp < -14) {
2850         mask = 0x00ffffff;
2851         if (aExp >= -24) {
2852             mask >>= 25 + aExp;
2853         }
2854     } else {
2855         mask = 0x00001fff;
2856     }
2857     if (aSig & mask) {
2858         float_raise( float_flag_underflow STATUS_VAR );
2859         roundingMode = STATUS(float_rounding_mode);
2860         switch (roundingMode) {
2861         case float_round_nearest_even:
2862             increment = (mask + 1) >> 1;
2863             if ((aSig & mask) == increment) {
2864                 increment = aSig & (increment << 1);
2865             }
2866             break;
2867         case float_round_up:
2868             increment = aSign ? 0 : mask;
2869             break;
2870         case float_round_down:
2871             increment = aSign ? mask : 0;
2872             break;
2873         default: /* round_to_zero */
2874             increment = 0;
2875             break;
2876         }
2877         aSig += increment;
2878         if (aSig >= 0x01000000) {
2879             aSig >>= 1;
2880             aExp++;
2881         }
2882     } else if (aExp < -14
2883           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2884         float_raise( float_flag_underflow STATUS_VAR);
2885     }
2886
2887     if (ieee) {
2888         if (aExp > 15) {
2889             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2890             return packFloat16(aSign, 0x1f, 0);
2891         }
2892     } else {
2893         if (aExp > 16) {
2894             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2895             return packFloat16(aSign, 0x1f, 0x3ff);
2896         }
2897     }
2898     if (aExp < -24) {
2899         return packFloat16(aSign, 0, 0);
2900     }
2901     if (aExp < -14) {
2902         aSig >>= -14 - aExp;
2903         aExp = -14;
2904     }
2905     return packFloat16(aSign, aExp + 14, aSig >> 13);
2906 }
2907
2908 /*----------------------------------------------------------------------------
2909 | Returns the result of converting the double-precision floating-point value
2910 | `a' to the extended double-precision floating-point format.  The conversion
2911 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2912 | Arithmetic.
2913 *----------------------------------------------------------------------------*/
2914
2915 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2916 {
2917     flag aSign;
2918     int16 aExp;
2919     uint64_t aSig;
2920
2921     a = float64_squash_input_denormal(a STATUS_VAR);
2922     aSig = extractFloat64Frac( a );
2923     aExp = extractFloat64Exp( a );
2924     aSign = extractFloat64Sign( a );
2925     if ( aExp == 0x7FF ) {
2926         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2927         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2928     }
2929     if ( aExp == 0 ) {
2930         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2931         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2932     }
2933     return
2934         packFloatx80(
2935             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2936
2937 }
2938
2939 /*----------------------------------------------------------------------------
2940 | Returns the result of converting the double-precision floating-point value
2941 | `a' to the quadruple-precision floating-point format.  The conversion is
2942 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2943 | Arithmetic.
2944 *----------------------------------------------------------------------------*/
2945
2946 float128 float64_to_float128( float64 a STATUS_PARAM )
2947 {
2948     flag aSign;
2949     int16 aExp;
2950     uint64_t aSig, zSig0, zSig1;
2951
2952     a = float64_squash_input_denormal(a STATUS_VAR);
2953     aSig = extractFloat64Frac( a );
2954     aExp = extractFloat64Exp( a );
2955     aSign = extractFloat64Sign( a );
2956     if ( aExp == 0x7FF ) {
2957         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2958         return packFloat128( aSign, 0x7FFF, 0, 0 );
2959     }
2960     if ( aExp == 0 ) {
2961         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2962         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2963         --aExp;
2964     }
2965     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2966     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2967
2968 }
2969
2970 /*----------------------------------------------------------------------------
2971 | Rounds the double-precision floating-point value `a' to an integer, and
2972 | returns the result as a double-precision floating-point value.  The
2973 | operation is performed according to the IEC/IEEE Standard for Binary
2974 | Floating-Point Arithmetic.
2975 *----------------------------------------------------------------------------*/
2976
2977 float64 float64_round_to_int( float64 a STATUS_PARAM )
2978 {
2979     flag aSign;
2980     int16 aExp;
2981     uint64_t lastBitMask, roundBitsMask;
2982     int8 roundingMode;
2983     uint64_t z;
2984     a = float64_squash_input_denormal(a STATUS_VAR);
2985
2986     aExp = extractFloat64Exp( a );
2987     if ( 0x433 <= aExp ) {
2988         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2989             return propagateFloat64NaN( a, a STATUS_VAR );
2990         }
2991         return a;
2992     }
2993     if ( aExp < 0x3FF ) {
2994         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
2995         STATUS(float_exception_flags) |= float_flag_inexact;
2996         aSign = extractFloat64Sign( a );
2997         switch ( STATUS(float_rounding_mode) ) {
2998          case float_round_nearest_even:
2999             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3000                 return packFloat64( aSign, 0x3FF, 0 );
3001             }
3002             break;
3003          case float_round_down:
3004             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3005          case float_round_up:
3006             return make_float64(
3007             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3008         }
3009         return packFloat64( aSign, 0, 0 );
3010     }
3011     lastBitMask = 1;
3012     lastBitMask <<= 0x433 - aExp;
3013     roundBitsMask = lastBitMask - 1;
3014     z = float64_val(a);
3015     roundingMode = STATUS(float_rounding_mode);
3016     if ( roundingMode == float_round_nearest_even ) {
3017         z += lastBitMask>>1;
3018         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3019     }
3020     else if ( roundingMode != float_round_to_zero ) {
3021         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3022             z += roundBitsMask;
3023         }
3024     }
3025     z &= ~ roundBitsMask;
3026     if ( z != float64_val(a) )
3027         STATUS(float_exception_flags) |= float_flag_inexact;
3028     return make_float64(z);
3029
3030 }
3031
3032 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3033 {
3034     int oldmode;
3035     float64 res;
3036     oldmode = STATUS(float_rounding_mode);
3037     STATUS(float_rounding_mode) = float_round_to_zero;
3038     res = float64_round_to_int(a STATUS_VAR);
3039     STATUS(float_rounding_mode) = oldmode;
3040     return res;
3041 }
3042
3043 /*----------------------------------------------------------------------------
3044 | Returns the result of adding the absolute values of the double-precision
3045 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3046 | before being returned.  `zSign' is ignored if the result is a NaN.
3047 | The addition is performed according to the IEC/IEEE Standard for Binary
3048 | Floating-Point Arithmetic.
3049 *----------------------------------------------------------------------------*/
3050
3051 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3052 {
3053     int16 aExp, bExp, zExp;
3054     uint64_t aSig, bSig, zSig;
3055     int16 expDiff;
3056
3057     aSig = extractFloat64Frac( a );
3058     aExp = extractFloat64Exp( a );
3059     bSig = extractFloat64Frac( b );
3060     bExp = extractFloat64Exp( b );
3061     expDiff = aExp - bExp;
3062     aSig <<= 9;
3063     bSig <<= 9;
3064     if ( 0 < expDiff ) {
3065         if ( aExp == 0x7FF ) {
3066             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3067             return a;
3068         }
3069         if ( bExp == 0 ) {
3070             --expDiff;
3071         }
3072         else {
3073             bSig |= LIT64( 0x2000000000000000 );
3074         }
3075         shift64RightJamming( bSig, expDiff, &bSig );
3076         zExp = aExp;
3077     }
3078     else if ( expDiff < 0 ) {
3079         if ( bExp == 0x7FF ) {
3080             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3081             return packFloat64( zSign, 0x7FF, 0 );
3082         }
3083         if ( aExp == 0 ) {
3084             ++expDiff;
3085         }
3086         else {
3087             aSig |= LIT64( 0x2000000000000000 );
3088         }
3089         shift64RightJamming( aSig, - expDiff, &aSig );
3090         zExp = bExp;
3091     }
3092     else {
3093         if ( aExp == 0x7FF ) {
3094             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3095             return a;
3096         }
3097         if ( aExp == 0 ) {
3098             if (STATUS(flush_to_zero)) {
3099                 if (aSig | bSig) {
3100                     float_raise(float_flag_output_denormal STATUS_VAR);
3101                 }
3102                 return packFloat64(zSign, 0, 0);
3103             }
3104             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3105         }
3106         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3107         zExp = aExp;
3108         goto roundAndPack;
3109     }
3110     aSig |= LIT64( 0x2000000000000000 );
3111     zSig = ( aSig + bSig )<<1;
3112     --zExp;
3113     if ( (int64_t) zSig < 0 ) {
3114         zSig = aSig + bSig;
3115         ++zExp;
3116     }
3117  roundAndPack:
3118     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3119
3120 }
3121
3122 /*----------------------------------------------------------------------------
3123 | Returns the result of subtracting the absolute values of the double-
3124 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3125 | difference is negated before being returned.  `zSign' is ignored if the
3126 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3127 | Standard for Binary Floating-Point Arithmetic.
3128 *----------------------------------------------------------------------------*/
3129
3130 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3131 {
3132     int16 aExp, bExp, zExp;
3133     uint64_t aSig, bSig, zSig;
3134     int16 expDiff;
3135
3136     aSig = extractFloat64Frac( a );
3137     aExp = extractFloat64Exp( a );
3138     bSig = extractFloat64Frac( b );
3139     bExp = extractFloat64Exp( b );
3140     expDiff = aExp - bExp;
3141     aSig <<= 10;
3142     bSig <<= 10;
3143     if ( 0 < expDiff ) goto aExpBigger;
3144     if ( expDiff < 0 ) goto bExpBigger;
3145     if ( aExp == 0x7FF ) {
3146         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3147         float_raise( float_flag_invalid STATUS_VAR);
3148         return float64_default_nan;
3149     }
3150     if ( aExp == 0 ) {
3151         aExp = 1;
3152         bExp = 1;
3153     }
3154     if ( bSig < aSig ) goto aBigger;
3155     if ( aSig < bSig ) goto bBigger;
3156     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3157  bExpBigger:
3158     if ( bExp == 0x7FF ) {
3159         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3160         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3161     }
3162     if ( aExp == 0 ) {
3163         ++expDiff;
3164     }
3165     else {
3166         aSig |= LIT64( 0x4000000000000000 );
3167     }
3168     shift64RightJamming( aSig, - expDiff, &aSig );
3169     bSig |= LIT64( 0x4000000000000000 );
3170  bBigger:
3171     zSig = bSig - aSig;
3172     zExp = bExp;
3173     zSign ^= 1;
3174     goto normalizeRoundAndPack;
3175  aExpBigger:
3176     if ( aExp == 0x7FF ) {
3177         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3178         return a;
3179     }
3180     if ( bExp == 0 ) {
3181         --expDiff;
3182     }
3183     else {
3184         bSig |= LIT64( 0x4000000000000000 );
3185     }
3186     shift64RightJamming( bSig, expDiff, &bSig );
3187     aSig |= LIT64( 0x4000000000000000 );
3188  aBigger:
3189     zSig = aSig - bSig;
3190     zExp = aExp;
3191  normalizeRoundAndPack:
3192     --zExp;
3193     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3194
3195 }
3196
3197 /*----------------------------------------------------------------------------
3198 | Returns the result of adding the double-precision floating-point values `a'
3199 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3200 | Binary Floating-Point Arithmetic.
3201 *----------------------------------------------------------------------------*/
3202
3203 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3204 {
3205     flag aSign, bSign;
3206     a = float64_squash_input_denormal(a STATUS_VAR);
3207     b = float64_squash_input_denormal(b STATUS_VAR);
3208
3209     aSign = extractFloat64Sign( a );
3210     bSign = extractFloat64Sign( b );
3211     if ( aSign == bSign ) {
3212         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3213     }
3214     else {
3215         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3216     }
3217
3218 }
3219
3220 /*----------------------------------------------------------------------------
3221 | Returns the result of subtracting the double-precision floating-point values
3222 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3223 | for Binary Floating-Point Arithmetic.
3224 *----------------------------------------------------------------------------*/
3225
3226 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3227 {
3228     flag aSign, bSign;
3229     a = float64_squash_input_denormal(a STATUS_VAR);
3230     b = float64_squash_input_denormal(b STATUS_VAR);
3231
3232     aSign = extractFloat64Sign( a );
3233     bSign = extractFloat64Sign( b );
3234     if ( aSign == bSign ) {
3235         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3236     }
3237     else {
3238         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3239     }
3240
3241 }
3242
3243 /*----------------------------------------------------------------------------
3244 | Returns the result of multiplying the double-precision floating-point values
3245 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3246 | for Binary Floating-Point Arithmetic.
3247 *----------------------------------------------------------------------------*/
3248
3249 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3250 {
3251     flag aSign, bSign, zSign;
3252     int16 aExp, bExp, zExp;
3253     uint64_t aSig, bSig, zSig0, zSig1;
3254
3255     a = float64_squash_input_denormal(a STATUS_VAR);
3256     b = float64_squash_input_denormal(b STATUS_VAR);
3257
3258     aSig = extractFloat64Frac( a );
3259     aExp = extractFloat64Exp( a );
3260     aSign = extractFloat64Sign( a );
3261     bSig = extractFloat64Frac( b );
3262     bExp = extractFloat64Exp( b );
3263     bSign = extractFloat64Sign( b );
3264     zSign = aSign ^ bSign;
3265     if ( aExp == 0x7FF ) {
3266         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3267             return propagateFloat64NaN( a, b STATUS_VAR );
3268         }
3269         if ( ( bExp | bSig ) == 0 ) {
3270             float_raise( float_flag_invalid STATUS_VAR);
3271             return float64_default_nan;
3272         }
3273         return packFloat64( zSign, 0x7FF, 0 );
3274     }
3275     if ( bExp == 0x7FF ) {
3276         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3277         if ( ( aExp | aSig ) == 0 ) {
3278             float_raise( float_flag_invalid STATUS_VAR);
3279             return float64_default_nan;
3280         }
3281         return packFloat64( zSign, 0x7FF, 0 );
3282     }
3283     if ( aExp == 0 ) {
3284         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3285         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3286     }
3287     if ( bExp == 0 ) {
3288         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3289         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3290     }
3291     zExp = aExp + bExp - 0x3FF;
3292     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3293     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3294     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3295     zSig0 |= ( zSig1 != 0 );
3296     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3297         zSig0 <<= 1;
3298         --zExp;
3299     }
3300     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3301
3302 }
3303
3304 /*----------------------------------------------------------------------------
3305 | Returns the result of dividing the double-precision floating-point value `a'
3306 | by the corresponding value `b'.  The operation is performed according to
3307 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3308 *----------------------------------------------------------------------------*/
3309
3310 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3311 {
3312     flag aSign, bSign, zSign;
3313     int16 aExp, bExp, zExp;
3314     uint64_t aSig, bSig, zSig;
3315     uint64_t rem0, rem1;
3316     uint64_t term0, term1;
3317     a = float64_squash_input_denormal(a STATUS_VAR);
3318     b = float64_squash_input_denormal(b STATUS_VAR);
3319
3320     aSig = extractFloat64Frac( a );
3321     aExp = extractFloat64Exp( a );
3322     aSign = extractFloat64Sign( a );
3323     bSig = extractFloat64Frac( b );
3324     bExp = extractFloat64Exp( b );
3325     bSign = extractFloat64Sign( b );
3326     zSign = aSign ^ bSign;
3327     if ( aExp == 0x7FF ) {
3328         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3329         if ( bExp == 0x7FF ) {
3330             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3331             float_raise( float_flag_invalid STATUS_VAR);
3332             return float64_default_nan;
3333         }
3334         return packFloat64( zSign, 0x7FF, 0 );
3335     }
3336     if ( bExp == 0x7FF ) {
3337         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3338         return packFloat64( zSign, 0, 0 );
3339     }
3340     if ( bExp == 0 ) {
3341         if ( bSig == 0 ) {
3342             if ( ( aExp | aSig ) == 0 ) {
3343                 float_raise( float_flag_invalid STATUS_VAR);
3344                 return float64_default_nan;
3345             }
3346             float_raise( float_flag_divbyzero STATUS_VAR);
3347             return packFloat64( zSign, 0x7FF, 0 );
3348         }
3349         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3350     }
3351     if ( aExp == 0 ) {
3352         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3353         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3354     }
3355     zExp = aExp - bExp + 0x3FD;
3356     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3357     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3358     if ( bSig <= ( aSig + aSig ) ) {
3359         aSig >>= 1;
3360         ++zExp;
3361     }
3362     zSig = estimateDiv128To64( aSig, 0, bSig );
3363     if ( ( zSig & 0x1FF ) <= 2 ) {
3364         mul64To128( bSig, zSig, &term0, &term1 );
3365         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3366         while ( (int64_t) rem0 < 0 ) {
3367             --zSig;
3368             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3369         }
3370         zSig |= ( rem1 != 0 );
3371     }
3372     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3373
3374 }
3375
3376 /*----------------------------------------------------------------------------
3377 | Returns the remainder of the double-precision floating-point value `a'
3378 | with respect to the corresponding value `b'.  The operation is performed
3379 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3380 *----------------------------------------------------------------------------*/
3381
3382 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3383 {
3384     flag aSign, zSign;
3385     int16 aExp, bExp, expDiff;
3386     uint64_t aSig, bSig;
3387     uint64_t q, alternateASig;
3388     int64_t sigMean;
3389
3390     a = float64_squash_input_denormal(a STATUS_VAR);
3391     b = float64_squash_input_denormal(b STATUS_VAR);
3392     aSig = extractFloat64Frac( a );
3393     aExp = extractFloat64Exp( a );
3394     aSign = extractFloat64Sign( a );
3395     bSig = extractFloat64Frac( b );
3396     bExp = extractFloat64Exp( b );
3397     if ( aExp == 0x7FF ) {
3398         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3399             return propagateFloat64NaN( a, b STATUS_VAR );
3400         }
3401         float_raise( float_flag_invalid STATUS_VAR);
3402         return float64_default_nan;
3403     }
3404     if ( bExp == 0x7FF ) {
3405         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3406         return a;
3407     }
3408     if ( bExp == 0 ) {
3409         if ( bSig == 0 ) {
3410             float_raise( float_flag_invalid STATUS_VAR);
3411             return float64_default_nan;
3412         }
3413         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3414     }
3415     if ( aExp == 0 ) {
3416         if ( aSig == 0 ) return a;
3417         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3418     }
3419     expDiff = aExp - bExp;
3420     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3421     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3422     if ( expDiff < 0 ) {
3423         if ( expDiff < -1 ) return a;
3424         aSig >>= 1;
3425     }
3426     q = ( bSig <= aSig );
3427     if ( q ) aSig -= bSig;
3428     expDiff -= 64;
3429     while ( 0 < expDiff ) {
3430         q = estimateDiv128To64( aSig, 0, bSig );
3431         q = ( 2 < q ) ? q - 2 : 0;
3432         aSig = - ( ( bSig>>2 ) * q );
3433         expDiff -= 62;
3434     }
3435     expDiff += 64;
3436     if ( 0 < expDiff ) {
3437         q = estimateDiv128To64( aSig, 0, bSig );
3438         q = ( 2 < q ) ? q - 2 : 0;
3439         q >>= 64 - expDiff;
3440         bSig >>= 2;
3441         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3442     }
3443     else {
3444         aSig >>= 2;
3445         bSig >>= 2;
3446     }
3447     do {
3448         alternateASig = aSig;
3449         ++q;
3450         aSig -= bSig;
3451     } while ( 0 <= (int64_t) aSig );
3452     sigMean = aSig + alternateASig;
3453     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3454         aSig = alternateASig;
3455     }
3456     zSign = ( (int64_t) aSig < 0 );
3457     if ( zSign ) aSig = - aSig;
3458     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3459
3460 }
3461
3462 /*----------------------------------------------------------------------------
3463 | Returns the square root of the double-precision floating-point value `a'.
3464 | The operation is performed according to the IEC/IEEE Standard for Binary
3465 | Floating-Point Arithmetic.
3466 *----------------------------------------------------------------------------*/
3467
3468 float64 float64_sqrt( float64 a STATUS_PARAM )
3469 {
3470     flag aSign;
3471     int16 aExp, zExp;
3472     uint64_t aSig, zSig, doubleZSig;
3473     uint64_t rem0, rem1, term0, term1;
3474     a = float64_squash_input_denormal(a STATUS_VAR);
3475
3476     aSig = extractFloat64Frac( a );
3477     aExp = extractFloat64Exp( a );
3478     aSign = extractFloat64Sign( a );
3479     if ( aExp == 0x7FF ) {
3480         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3481         if ( ! aSign ) return a;
3482         float_raise( float_flag_invalid STATUS_VAR);
3483         return float64_default_nan;
3484     }
3485     if ( aSign ) {
3486         if ( ( aExp | aSig ) == 0 ) return a;
3487         float_raise( float_flag_invalid STATUS_VAR);
3488         return float64_default_nan;
3489     }
3490     if ( aExp == 0 ) {
3491         if ( aSig == 0 ) return float64_zero;
3492         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3493     }
3494     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3495     aSig |= LIT64( 0x0010000000000000 );
3496     zSig = estimateSqrt32( aExp, aSig>>21 );
3497     aSig <<= 9 - ( aExp & 1 );
3498     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3499     if ( ( zSig & 0x1FF ) <= 5 ) {
3500         doubleZSig = zSig<<1;
3501         mul64To128( zSig, zSig, &term0, &term1 );
3502         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3503         while ( (int64_t) rem0 < 0 ) {
3504             --zSig;
3505             doubleZSig -= 2;
3506             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3507         }
3508         zSig |= ( ( rem0 | rem1 ) != 0 );
3509     }
3510     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3511
3512 }
3513
3514 /*----------------------------------------------------------------------------
3515 | Returns the binary log of the double-precision floating-point value `a'.
3516 | The operation is performed according to the IEC/IEEE Standard for Binary
3517 | Floating-Point Arithmetic.
3518 *----------------------------------------------------------------------------*/
3519 float64 float64_log2( float64 a STATUS_PARAM )
3520 {
3521     flag aSign, zSign;
3522     int16 aExp;
3523     uint64_t aSig, aSig0, aSig1, zSig, i;
3524     a = float64_squash_input_denormal(a STATUS_VAR);
3525
3526     aSig = extractFloat64Frac( a );
3527     aExp = extractFloat64Exp( a );
3528     aSign = extractFloat64Sign( a );
3529
3530     if ( aExp == 0 ) {
3531         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3532         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3533     }
3534     if ( aSign ) {
3535         float_raise( float_flag_invalid STATUS_VAR);
3536         return float64_default_nan;
3537     }
3538     if ( aExp == 0x7FF ) {
3539         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3540         return a;
3541     }
3542
3543     aExp -= 0x3FF;
3544     aSig |= LIT64( 0x0010000000000000 );
3545     zSign = aExp < 0;
3546     zSig = (uint64_t)aExp << 52;
3547     for (i = 1LL << 51; i > 0; i >>= 1) {
3548         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3549         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3550         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3551             aSig >>= 1;
3552             zSig |= i;
3553         }
3554     }
3555
3556     if ( zSign )
3557         zSig = -zSig;
3558     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3559 }
3560
3561 /*----------------------------------------------------------------------------
3562 | Returns 1 if the double-precision floating-point value `a' is equal to the
3563 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3564 | if either operand is a NaN.  Otherwise, the comparison is performed
3565 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3566 *----------------------------------------------------------------------------*/
3567
3568 int float64_eq( float64 a, float64 b STATUS_PARAM )
3569 {
3570     uint64_t av, bv;
3571     a = float64_squash_input_denormal(a STATUS_VAR);
3572     b = float64_squash_input_denormal(b STATUS_VAR);
3573
3574     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3575          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3576        ) {
3577         float_raise( float_flag_invalid STATUS_VAR);
3578         return 0;
3579     }
3580     av = float64_val(a);
3581     bv = float64_val(b);
3582     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3583
3584 }
3585
3586 /*----------------------------------------------------------------------------
3587 | Returns 1 if the double-precision floating-point value `a' is less than or
3588 | equal to the corresponding value `b', and 0 otherwise.  The invalid
3589 | exception is raised if either operand is a NaN.  The comparison is performed
3590 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3591 *----------------------------------------------------------------------------*/
3592
3593 int float64_le( float64 a, float64 b STATUS_PARAM )
3594 {
3595     flag aSign, bSign;
3596     uint64_t av, bv;
3597     a = float64_squash_input_denormal(a STATUS_VAR);
3598     b = float64_squash_input_denormal(b STATUS_VAR);
3599
3600     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3601          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3602        ) {
3603         float_raise( float_flag_invalid STATUS_VAR);
3604         return 0;
3605     }
3606     aSign = extractFloat64Sign( a );
3607     bSign = extractFloat64Sign( b );
3608     av = float64_val(a);
3609     bv = float64_val(b);
3610     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3611     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3612
3613 }
3614
3615 /*----------------------------------------------------------------------------
3616 | Returns 1 if the double-precision floating-point value `a' is less than
3617 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3618 | raised if either operand is a NaN.  The comparison is performed according
3619 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3620 *----------------------------------------------------------------------------*/
3621
3622 int float64_lt( float64 a, float64 b STATUS_PARAM )
3623 {
3624     flag aSign, bSign;
3625     uint64_t av, bv;
3626
3627     a = float64_squash_input_denormal(a STATUS_VAR);
3628     b = float64_squash_input_denormal(b STATUS_VAR);
3629     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3630          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3631        ) {
3632         float_raise( float_flag_invalid STATUS_VAR);
3633         return 0;
3634     }
3635     aSign = extractFloat64Sign( a );
3636     bSign = extractFloat64Sign( b );
3637     av = float64_val(a);
3638     bv = float64_val(b);
3639     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3640     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3641
3642 }
3643
3644 /*----------------------------------------------------------------------------
3645 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3646 | be compared, and 0 otherwise.  The invalid exception is raised if either
3647 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3648 | Standard for Binary Floating-Point Arithmetic.
3649 *----------------------------------------------------------------------------*/
3650
3651 int float64_unordered( float64 a, float64 b STATUS_PARAM )
3652 {
3653     a = float64_squash_input_denormal(a STATUS_VAR);
3654     b = float64_squash_input_denormal(b STATUS_VAR);
3655
3656     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3657          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3658        ) {
3659         float_raise( float_flag_invalid STATUS_VAR);
3660         return 1;
3661     }
3662     return 0;
3663 }
3664
3665 /*----------------------------------------------------------------------------
3666 | Returns 1 if the double-precision floating-point value `a' is equal to the
3667 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3668 | exception.The comparison is performed according to the IEC/IEEE Standard
3669 | for Binary Floating-Point Arithmetic.
3670 *----------------------------------------------------------------------------*/
3671
3672 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
3673 {
3674     uint64_t av, bv;
3675     a = float64_squash_input_denormal(a STATUS_VAR);
3676     b = float64_squash_input_denormal(b STATUS_VAR);
3677
3678     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3679          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3680        ) {
3681         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3682             float_raise( float_flag_invalid STATUS_VAR);
3683         }
3684         return 0;
3685     }
3686     av = float64_val(a);
3687     bv = float64_val(b);
3688     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3689
3690 }
3691
3692 /*----------------------------------------------------------------------------
3693 | Returns 1 if the double-precision floating-point value `a' is less than or
3694 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3695 | cause an exception.  Otherwise, the comparison is performed according to the
3696 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3697 *----------------------------------------------------------------------------*/
3698
3699 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3700 {
3701     flag aSign, bSign;
3702     uint64_t av, bv;
3703     a = float64_squash_input_denormal(a STATUS_VAR);
3704     b = float64_squash_input_denormal(b STATUS_VAR);
3705
3706     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3707          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3708        ) {
3709         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3710             float_raise( float_flag_invalid STATUS_VAR);
3711         }
3712         return 0;
3713     }
3714     aSign = extractFloat64Sign( a );
3715     bSign = extractFloat64Sign( b );
3716     av = float64_val(a);
3717     bv = float64_val(b);
3718     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3719     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3720
3721 }
3722
3723 /*----------------------------------------------------------------------------
3724 | Returns 1 if the double-precision floating-point value `a' is less than
3725 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3726 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3727 | Standard for Binary Floating-Point Arithmetic.
3728 *----------------------------------------------------------------------------*/
3729
3730 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3731 {
3732     flag aSign, bSign;
3733     uint64_t av, bv;
3734     a = float64_squash_input_denormal(a STATUS_VAR);
3735     b = float64_squash_input_denormal(b STATUS_VAR);
3736
3737     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3738          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3739        ) {
3740         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3741             float_raise( float_flag_invalid STATUS_VAR);
3742         }
3743         return 0;
3744     }
3745     aSign = extractFloat64Sign( a );
3746     bSign = extractFloat64Sign( b );
3747     av = float64_val(a);
3748     bv = float64_val(b);
3749     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3750     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3751
3752 }
3753
3754 /*----------------------------------------------------------------------------
3755 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3756 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3757 | comparison is performed according to the IEC/IEEE Standard for Binary
3758 | Floating-Point Arithmetic.
3759 *----------------------------------------------------------------------------*/
3760
3761 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3762 {
3763     a = float64_squash_input_denormal(a STATUS_VAR);
3764     b = float64_squash_input_denormal(b STATUS_VAR);
3765
3766     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3767          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3768        ) {
3769         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3770             float_raise( float_flag_invalid STATUS_VAR);
3771         }
3772         return 1;
3773     }
3774     return 0;
3775 }
3776
3777 /*----------------------------------------------------------------------------
3778 | Returns the result of converting the extended double-precision floating-
3779 | point value `a' to the 32-bit two's complement integer format.  The
3780 | conversion is performed according to the IEC/IEEE Standard for Binary
3781 | Floating-Point Arithmetic---which means in particular that the conversion
3782 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3783 | largest positive integer is returned.  Otherwise, if the conversion
3784 | overflows, the largest integer with the same sign as `a' is returned.
3785 *----------------------------------------------------------------------------*/
3786
3787 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3788 {
3789     flag aSign;
3790     int32 aExp, shiftCount;
3791     uint64_t aSig;
3792
3793     aSig = extractFloatx80Frac( a );
3794     aExp = extractFloatx80Exp( a );
3795     aSign = extractFloatx80Sign( a );
3796     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3797     shiftCount = 0x4037 - aExp;
3798     if ( shiftCount <= 0 ) shiftCount = 1;
3799     shift64RightJamming( aSig, shiftCount, &aSig );
3800     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3801
3802 }
3803
3804 /*----------------------------------------------------------------------------
3805 | Returns the result of converting the extended double-precision floating-
3806 | point value `a' to the 32-bit two's complement integer format.  The
3807 | conversion is performed according to the IEC/IEEE Standard for Binary
3808 | Floating-Point Arithmetic, except that the conversion is always rounded
3809 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3810 | Otherwise, if the conversion overflows, the largest integer with the same
3811 | sign as `a' is returned.
3812 *----------------------------------------------------------------------------*/
3813
3814 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3815 {
3816     flag aSign;
3817     int32 aExp, shiftCount;
3818     uint64_t aSig, savedASig;
3819     int32 z;
3820
3821     aSig = extractFloatx80Frac( a );
3822     aExp = extractFloatx80Exp( a );
3823     aSign = extractFloatx80Sign( a );
3824     if ( 0x401E < aExp ) {
3825         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3826         goto invalid;
3827     }
3828     else if ( aExp < 0x3FFF ) {
3829         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3830         return 0;
3831     }
3832     shiftCount = 0x403E - aExp;
3833     savedASig = aSig;
3834     aSig >>= shiftCount;
3835     z = aSig;
3836     if ( aSign ) z = - z;
3837     if ( ( z < 0 ) ^ aSign ) {
3838  invalid:
3839         float_raise( float_flag_invalid STATUS_VAR);
3840         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3841     }
3842     if ( ( aSig<<shiftCount ) != savedASig ) {
3843         STATUS(float_exception_flags) |= float_flag_inexact;
3844     }
3845     return z;
3846
3847 }
3848
3849 /*----------------------------------------------------------------------------
3850 | Returns the result of converting the extended double-precision floating-
3851 | point value `a' to the 64-bit two's complement integer format.  The
3852 | conversion is performed according to the IEC/IEEE Standard for Binary
3853 | Floating-Point Arithmetic---which means in particular that the conversion
3854 | is rounded according to the current rounding mode.  If `a' is a NaN,
3855 | the largest positive integer is returned.  Otherwise, if the conversion
3856 | overflows, the largest integer with the same sign as `a' is returned.
3857 *----------------------------------------------------------------------------*/
3858
3859 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3860 {
3861     flag aSign;
3862     int32 aExp, shiftCount;
3863     uint64_t aSig, aSigExtra;
3864
3865     aSig = extractFloatx80Frac( a );
3866     aExp = extractFloatx80Exp( a );
3867     aSign = extractFloatx80Sign( a );
3868     shiftCount = 0x403E - aExp;
3869     if ( shiftCount <= 0 ) {
3870         if ( shiftCount ) {
3871             float_raise( float_flag_invalid STATUS_VAR);
3872             if (    ! aSign
3873                  || (    ( aExp == 0x7FFF )
3874                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3875                ) {
3876                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3877             }
3878             return (int64_t) LIT64( 0x8000000000000000 );
3879         }
3880         aSigExtra = 0;
3881     }
3882     else {
3883         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3884     }
3885     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3886
3887 }
3888
3889 /*----------------------------------------------------------------------------
3890 | Returns the result of converting the extended double-precision floating-
3891 | point value `a' to the 64-bit two's complement integer format.  The
3892 | conversion is performed according to the IEC/IEEE Standard for Binary
3893 | Floating-Point Arithmetic, except that the conversion is always rounded
3894 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3895 | Otherwise, if the conversion overflows, the largest integer with the same
3896 | sign as `a' is returned.
3897 *----------------------------------------------------------------------------*/
3898
3899 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3900 {
3901     flag aSign;
3902     int32 aExp, shiftCount;
3903     uint64_t aSig;
3904     int64 z;
3905
3906     aSig = extractFloatx80Frac( a );
3907     aExp = extractFloatx80Exp( a );
3908     aSign = extractFloatx80Sign( a );
3909     shiftCount = aExp - 0x403E;
3910     if ( 0 <= shiftCount ) {
3911         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3912         if ( ( a.high != 0xC03E ) || aSig ) {
3913             float_raise( float_flag_invalid STATUS_VAR);
3914             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3915                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3916             }
3917         }
3918         return (int64_t) LIT64( 0x8000000000000000 );
3919     }
3920     else if ( aExp < 0x3FFF ) {
3921         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3922         return 0;
3923     }
3924     z = aSig>>( - shiftCount );
3925     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3926         STATUS(float_exception_flags) |= float_flag_inexact;
3927     }
3928     if ( aSign ) z = - z;
3929     return z;
3930
3931 }
3932
3933 /*----------------------------------------------------------------------------
3934 | Returns the result of converting the extended double-precision floating-
3935 | point value `a' to the single-precision floating-point format.  The
3936 | conversion is performed according to the IEC/IEEE Standard for Binary
3937 | Floating-Point Arithmetic.
3938 *----------------------------------------------------------------------------*/
3939
3940 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3941 {
3942     flag aSign;
3943     int32 aExp;
3944     uint64_t aSig;
3945
3946     aSig = extractFloatx80Frac( a );
3947     aExp = extractFloatx80Exp( a );
3948     aSign = extractFloatx80Sign( a );
3949     if ( aExp == 0x7FFF ) {
3950         if ( (uint64_t) ( aSig<<1 ) ) {
3951             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3952         }
3953         return packFloat32( aSign, 0xFF, 0 );
3954     }
3955     shift64RightJamming( aSig, 33, &aSig );
3956     if ( aExp || aSig ) aExp -= 0x3F81;
3957     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3958
3959 }
3960
3961 /*----------------------------------------------------------------------------
3962 | Returns the result of converting the extended double-precision floating-
3963 | point value `a' to the double-precision floating-point format.  The
3964 | conversion is performed according to the IEC/IEEE Standard for Binary
3965 | Floating-Point Arithmetic.
3966 *----------------------------------------------------------------------------*/
3967
3968 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3969 {
3970     flag aSign;
3971     int32 aExp;
3972     uint64_t aSig, zSig;
3973
3974     aSig = extractFloatx80Frac( a );
3975     aExp = extractFloatx80Exp( a );
3976     aSign = extractFloatx80Sign( a );
3977     if ( aExp == 0x7FFF ) {
3978         if ( (uint64_t) ( aSig<<1 ) ) {
3979             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3980         }
3981         return packFloat64( aSign, 0x7FF, 0 );
3982     }
3983     shift64RightJamming( aSig, 1, &zSig );
3984     if ( aExp || aSig ) aExp -= 0x3C01;
3985     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3986
3987 }
3988
3989 /*----------------------------------------------------------------------------
3990 | Returns the result of converting the extended double-precision floating-
3991 | point value `a' to the quadruple-precision floating-point format.  The
3992 | conversion is performed according to the IEC/IEEE Standard for Binary
3993 | Floating-Point Arithmetic.
3994 *----------------------------------------------------------------------------*/
3995
3996 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3997 {
3998     flag aSign;
3999     int16 aExp;
4000     uint64_t aSig, zSig0, zSig1;
4001
4002     aSig = extractFloatx80Frac( a );
4003     aExp = extractFloatx80Exp( a );
4004     aSign = extractFloatx80Sign( a );
4005     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4006         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4007     }
4008     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4009     return packFloat128( aSign, aExp, zSig0, zSig1 );
4010
4011 }
4012
4013 /*----------------------------------------------------------------------------
4014 | Rounds the extended double-precision floating-point value `a' to an integer,
4015 | and returns the result as an extended quadruple-precision floating-point
4016 | value.  The operation is performed according to the IEC/IEEE Standard for
4017 | Binary Floating-Point Arithmetic.
4018 *----------------------------------------------------------------------------*/
4019
4020 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4021 {
4022     flag aSign;
4023     int32 aExp;
4024     uint64_t lastBitMask, roundBitsMask;
4025     int8 roundingMode;
4026     floatx80 z;
4027
4028     aExp = extractFloatx80Exp( a );
4029     if ( 0x403E <= aExp ) {
4030         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4031             return propagateFloatx80NaN( a, a STATUS_VAR );
4032         }
4033         return a;
4034     }
4035     if ( aExp < 0x3FFF ) {
4036         if (    ( aExp == 0 )
4037              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4038             return a;
4039         }
4040         STATUS(float_exception_flags) |= float_flag_inexact;
4041         aSign = extractFloatx80Sign( a );
4042         switch ( STATUS(float_rounding_mode) ) {
4043          case float_round_nearest_even:
4044             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4045                ) {
4046                 return
4047                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4048             }
4049             break;
4050          case float_round_down:
4051             return
4052                   aSign ?
4053                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4054                 : packFloatx80( 0, 0, 0 );
4055          case float_round_up:
4056             return
4057                   aSign ? packFloatx80( 1, 0, 0 )
4058                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4059         }
4060         return packFloatx80( aSign, 0, 0 );
4061     }
4062     lastBitMask = 1;
4063     lastBitMask <<= 0x403E - aExp;
4064     roundBitsMask = lastBitMask - 1;
4065     z = a;
4066     roundingMode = STATUS(float_rounding_mode);
4067     if ( roundingMode == float_round_nearest_even ) {
4068         z.low += lastBitMask>>1;
4069         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4070     }
4071     else if ( roundingMode != float_round_to_zero ) {
4072         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4073             z.low += roundBitsMask;
4074         }
4075     }
4076     z.low &= ~ roundBitsMask;
4077     if ( z.low == 0 ) {
4078         ++z.high;
4079         z.low = LIT64( 0x8000000000000000 );
4080     }
4081     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4082     return z;
4083
4084 }
4085
4086 /*----------------------------------------------------------------------------
4087 | Returns the result of adding the absolute values of the extended double-
4088 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4089 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4090 | The addition is performed according to the IEC/IEEE Standard for Binary
4091 | Floating-Point Arithmetic.
4092 *----------------------------------------------------------------------------*/
4093
4094 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4095 {
4096     int32 aExp, bExp, zExp;
4097     uint64_t aSig, bSig, zSig0, zSig1;
4098     int32 expDiff;
4099
4100     aSig = extractFloatx80Frac( a );
4101     aExp = extractFloatx80Exp( a );
4102     bSig = extractFloatx80Frac( b );
4103     bExp = extractFloatx80Exp( b );
4104     expDiff = aExp - bExp;
4105     if ( 0 < expDiff ) {
4106         if ( aExp == 0x7FFF ) {
4107             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4108             return a;
4109         }
4110         if ( bExp == 0 ) --expDiff;
4111         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4112         zExp = aExp;
4113     }
4114     else if ( expDiff < 0 ) {
4115         if ( bExp == 0x7FFF ) {
4116             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4117             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4118         }
4119         if ( aExp == 0 ) ++expDiff;
4120         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4121         zExp = bExp;
4122     }
4123     else {
4124         if ( aExp == 0x7FFF ) {
4125             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4126                 return propagateFloatx80NaN( a, b STATUS_VAR );
4127             }
4128             return a;
4129         }
4130         zSig1 = 0;
4131         zSig0 = aSig + bSig;
4132         if ( aExp == 0 ) {
4133             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4134             goto roundAndPack;
4135         }
4136         zExp = aExp;
4137         goto shiftRight1;
4138     }
4139     zSig0 = aSig + bSig;
4140     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4141  shiftRight1:
4142     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4143     zSig0 |= LIT64( 0x8000000000000000 );
4144     ++zExp;
4145  roundAndPack:
4146     return
4147         roundAndPackFloatx80(
4148             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4149
4150 }
4151
4152 /*----------------------------------------------------------------------------
4153 | Returns the result of subtracting the absolute values of the extended
4154 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4155 | difference is negated before being returned.  `zSign' is ignored if the
4156 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4157 | Standard for Binary Floating-Point Arithmetic.
4158 *----------------------------------------------------------------------------*/
4159
4160 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4161 {
4162     int32 aExp, bExp, zExp;
4163     uint64_t aSig, bSig, zSig0, zSig1;
4164     int32 expDiff;
4165     floatx80 z;
4166
4167     aSig = extractFloatx80Frac( a );
4168     aExp = extractFloatx80Exp( a );
4169     bSig = extractFloatx80Frac( b );
4170     bExp = extractFloatx80Exp( b );
4171     expDiff = aExp - bExp;
4172     if ( 0 < expDiff ) goto aExpBigger;
4173     if ( expDiff < 0 ) goto bExpBigger;
4174     if ( aExp == 0x7FFF ) {
4175         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4176             return propagateFloatx80NaN( a, b STATUS_VAR );
4177         }
4178         float_raise( float_flag_invalid STATUS_VAR);
4179         z.low = floatx80_default_nan_low;
4180         z.high = floatx80_default_nan_high;
4181         return z;
4182     }
4183     if ( aExp == 0 ) {
4184         aExp = 1;
4185         bExp = 1;
4186     }
4187     zSig1 = 0;
4188     if ( bSig < aSig ) goto aBigger;
4189     if ( aSig < bSig ) goto bBigger;
4190     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4191  bExpBigger:
4192     if ( bExp == 0x7FFF ) {
4193         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4194         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4195     }
4196     if ( aExp == 0 ) ++expDiff;
4197     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4198  bBigger:
4199     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4200     zExp = bExp;
4201     zSign ^= 1;
4202     goto normalizeRoundAndPack;
4203  aExpBigger:
4204     if ( aExp == 0x7FFF ) {
4205         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4206         return a;
4207     }
4208     if ( bExp == 0 ) --expDiff;
4209     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4210  aBigger:
4211     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4212     zExp = aExp;
4213  normalizeRoundAndPack:
4214     return
4215         normalizeRoundAndPackFloatx80(
4216             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4217
4218 }
4219
4220 /*----------------------------------------------------------------------------
4221 | Returns the result of adding the extended double-precision floating-point
4222 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4223 | Standard for Binary Floating-Point Arithmetic.
4224 *----------------------------------------------------------------------------*/
4225
4226 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4227 {
4228     flag aSign, bSign;
4229
4230     aSign = extractFloatx80Sign( a );
4231     bSign = extractFloatx80Sign( b );
4232     if ( aSign == bSign ) {
4233         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4234     }
4235     else {
4236         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4237     }
4238
4239 }
4240
4241 /*----------------------------------------------------------------------------
4242 | Returns the result of subtracting the extended double-precision floating-
4243 | point values `a' and `b'.  The operation is performed according to the
4244 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4245 *----------------------------------------------------------------------------*/
4246
4247 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4248 {
4249     flag aSign, bSign;
4250
4251     aSign = extractFloatx80Sign( a );
4252     bSign = extractFloatx80Sign( b );
4253     if ( aSign == bSign ) {
4254         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4255     }
4256     else {
4257         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4258     }
4259
4260 }
4261
4262 /*----------------------------------------------------------------------------
4263 | Returns the result of multiplying the extended double-precision floating-
4264 | point values `a' and `b'.  The operation is performed according to the
4265 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4266 *----------------------------------------------------------------------------*/
4267
4268 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4269 {
4270     flag aSign, bSign, zSign;
4271     int32 aExp, bExp, zExp;
4272     uint64_t aSig, bSig, zSig0, zSig1;
4273     floatx80 z;
4274
4275     aSig = extractFloatx80Frac( a );
4276     aExp = extractFloatx80Exp( a );
4277     aSign = extractFloatx80Sign( a );
4278     bSig = extractFloatx80Frac( b );
4279     bExp = extractFloatx80Exp( b );
4280     bSign = extractFloatx80Sign( b );
4281     zSign = aSign ^ bSign;
4282     if ( aExp == 0x7FFF ) {
4283         if (    (uint64_t) ( aSig<<1 )
4284              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4285             return propagateFloatx80NaN( a, b STATUS_VAR );
4286         }
4287         if ( ( bExp | bSig ) == 0 ) goto invalid;
4288         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4289     }
4290     if ( bExp == 0x7FFF ) {
4291         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4292         if ( ( aExp | aSig ) == 0 ) {
4293  invalid:
4294             float_raise( float_flag_invalid STATUS_VAR);
4295             z.low = floatx80_default_nan_low;
4296             z.high = floatx80_default_nan_high;
4297             return z;
4298         }
4299         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4300     }
4301     if ( aExp == 0 ) {
4302         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4303         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4304     }
4305     if ( bExp == 0 ) {
4306         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4307         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4308     }
4309     zExp = aExp + bExp - 0x3FFE;
4310     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4311     if ( 0 < (int64_t) zSig0 ) {
4312         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4313         --zExp;
4314     }
4315     return
4316         roundAndPackFloatx80(
4317             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4318
4319 }
4320
4321 /*----------------------------------------------------------------------------
4322 | Returns the result of dividing the extended double-precision floating-point
4323 | value `a' by the corresponding value `b'.  The operation is performed
4324 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4325 *----------------------------------------------------------------------------*/
4326
4327 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4328 {
4329     flag aSign, bSign, zSign;
4330     int32 aExp, bExp, zExp;
4331     uint64_t aSig, bSig, zSig0, zSig1;
4332     uint64_t rem0, rem1, rem2, term0, term1, term2;
4333     floatx80 z;
4334
4335     aSig = extractFloatx80Frac( a );
4336     aExp = extractFloatx80Exp( a );
4337     aSign = extractFloatx80Sign( a );
4338     bSig = extractFloatx80Frac( b );
4339     bExp = extractFloatx80Exp( b );
4340     bSign = extractFloatx80Sign( b );
4341     zSign = aSign ^ bSign;
4342     if ( aExp == 0x7FFF ) {
4343         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4344         if ( bExp == 0x7FFF ) {
4345             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4346             goto invalid;
4347         }
4348         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4349     }
4350     if ( bExp == 0x7FFF ) {
4351         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4352         return packFloatx80( zSign, 0, 0 );
4353     }
4354     if ( bExp == 0 ) {
4355         if ( bSig == 0 ) {
4356             if ( ( aExp | aSig ) == 0 ) {
4357  invalid:
4358                 float_raise( float_flag_invalid STATUS_VAR);
4359                 z.low = floatx80_default_nan_low;
4360                 z.high = floatx80_default_nan_high;
4361                 return z;
4362             }
4363             float_raise( float_flag_divbyzero STATUS_VAR);
4364             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4365         }
4366         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4367     }
4368     if ( aExp == 0 ) {
4369         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4370         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4371     }
4372     zExp = aExp - bExp + 0x3FFE;
4373     rem1 = 0;
4374     if ( bSig <= aSig ) {
4375         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4376         ++zExp;
4377     }
4378     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4379     mul64To128( bSig, zSig0, &term0, &term1 );
4380     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4381     while ( (int64_t) rem0 < 0 ) {
4382         --zSig0;
4383         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4384     }
4385     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4386     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4387         mul64To128( bSig, zSig1, &term1, &term2 );
4388         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4389         while ( (int64_t) rem1 < 0 ) {
4390             --zSig1;
4391             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4392         }
4393         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4394     }
4395     return
4396         roundAndPackFloatx80(
4397             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4398
4399 }
4400
4401 /*----------------------------------------------------------------------------
4402 | Returns the remainder of the extended double-precision floating-point value
4403 | `a' with respect to the corresponding value `b'.  The operation is performed
4404 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4405 *----------------------------------------------------------------------------*/
4406
4407 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4408 {
4409     flag aSign, zSign;
4410     int32 aExp, bExp, expDiff;
4411     uint64_t aSig0, aSig1, bSig;
4412     uint64_t q, term0, term1, alternateASig0, alternateASig1;
4413     floatx80 z;
4414
4415     aSig0 = extractFloatx80Frac( a );
4416     aExp = extractFloatx80Exp( a );
4417     aSign = extractFloatx80Sign( a );
4418     bSig = extractFloatx80Frac( b );
4419     bExp = extractFloatx80Exp( b );
4420     if ( aExp == 0x7FFF ) {
4421         if (    (uint64_t) ( aSig0<<1 )
4422              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4423             return propagateFloatx80NaN( a, b STATUS_VAR );
4424         }
4425         goto invalid;
4426     }
4427     if ( bExp == 0x7FFF ) {
4428         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4429         return a;
4430     }
4431     if ( bExp == 0 ) {
4432         if ( bSig == 0 ) {
4433  invalid:
4434             float_raise( float_flag_invalid STATUS_VAR);
4435             z.low = floatx80_default_nan_low;
4436             z.high = floatx80_default_nan_high;
4437             return z;
4438         }
4439         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4440     }
4441     if ( aExp == 0 ) {
4442         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4443         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4444     }
4445     bSig |= LIT64( 0x8000000000000000 );
4446     zSign = aSign;
4447     expDiff = aExp - bExp;
4448     aSig1 = 0;
4449     if ( expDiff < 0 ) {
4450         if ( expDiff < -1 ) return a;
4451         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4452         expDiff = 0;
4453     }
4454     q = ( bSig <= aSig0 );
4455     if ( q ) aSig0 -= bSig;
4456     expDiff -= 64;
4457     while ( 0 < expDiff ) {
4458         q = estimateDiv128To64( aSig0, aSig1, bSig );
4459         q = ( 2 < q ) ? q - 2 : 0;
4460         mul64To128( bSig, q, &term0, &term1 );
4461         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4462         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4463         expDiff -= 62;
4464     }
4465     expDiff += 64;
4466     if ( 0 < expDiff ) {
4467         q = estimateDiv128To64( aSig0, aSig1, bSig );
4468         q = ( 2 < q ) ? q - 2 : 0;
4469         q >>= 64 - expDiff;
4470         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4471         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4472         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4473         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4474             ++q;
4475             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4476         }
4477     }
4478     else {
4479         term1 = 0;
4480         term0 = bSig;
4481     }
4482     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4483     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4484          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4485               && ( q & 1 ) )
4486        ) {
4487         aSig0 = alternateASig0;
4488         aSig1 = alternateASig1;
4489         zSign = ! zSign;
4490     }
4491     return
4492         normalizeRoundAndPackFloatx80(
4493             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4494
4495 }
4496
4497 /*----------------------------------------------------------------------------
4498 | Returns the square root of the extended double-precision floating-point
4499 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4500 | for Binary Floating-Point Arithmetic.
4501 *----------------------------------------------------------------------------*/
4502
4503 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4504 {
4505     flag aSign;
4506     int32 aExp, zExp;
4507     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4508     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4509     floatx80 z;
4510
4511     aSig0 = extractFloatx80Frac( a );
4512     aExp = extractFloatx80Exp( a );
4513     aSign = extractFloatx80Sign( a );
4514     if ( aExp == 0x7FFF ) {
4515         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4516         if ( ! aSign ) return a;
4517         goto invalid;
4518     }
4519     if ( aSign ) {
4520         if ( ( aExp | aSig0 ) == 0 ) return a;
4521  invalid:
4522         float_raise( float_flag_invalid STATUS_VAR);
4523         z.low = floatx80_default_nan_low;
4524         z.high = floatx80_default_nan_high;
4525         return z;
4526     }
4527     if ( aExp == 0 ) {
4528         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4529         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4530     }
4531     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4532     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4533     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4534     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4535     doubleZSig0 = zSig0<<1;
4536     mul64To128( zSig0, zSig0, &term0, &term1 );
4537     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4538     while ( (int64_t) rem0 < 0 ) {
4539         --zSig0;
4540         doubleZSig0 -= 2;
4541         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4542     }
4543     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4544     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4545         if ( zSig1 == 0 ) zSig1 = 1;
4546         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4547         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4548         mul64To128( zSig1, zSig1, &term2, &term3 );
4549         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4550         while ( (int64_t) rem1 < 0 ) {
4551             --zSig1;
4552             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4553             term3 |= 1;
4554             term2 |= doubleZSig0;
4555             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4556         }
4557         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4558     }
4559     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4560     zSig0 |= doubleZSig0;
4561     return
4562         roundAndPackFloatx80(
4563             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4564
4565 }
4566
4567 /*----------------------------------------------------------------------------
4568 | Returns 1 if the extended double-precision floating-point value `a' is equal
4569 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4570 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4571 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4572 *----------------------------------------------------------------------------*/
4573
4574 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4575 {
4576
4577     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4578               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4579          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4580               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4581        ) {
4582         float_raise( float_flag_invalid STATUS_VAR);
4583         return 0;
4584     }
4585     return
4586            ( a.low == b.low )
4587         && (    ( a.high == b.high )
4588              || (    ( a.low == 0 )
4589                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4590            );
4591
4592 }
4593
4594 /*----------------------------------------------------------------------------
4595 | Returns 1 if the extended double-precision floating-point value `a' is
4596 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4597 | invalid exception is raised if either operand is a NaN.  The comparison is
4598 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4599 | Arithmetic.
4600 *----------------------------------------------------------------------------*/
4601
4602 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4603 {
4604     flag aSign, bSign;
4605
4606     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4607               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4608          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4609               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4610        ) {
4611         float_raise( float_flag_invalid STATUS_VAR);
4612         return 0;
4613     }
4614     aSign = extractFloatx80Sign( a );
4615     bSign = extractFloatx80Sign( b );
4616     if ( aSign != bSign ) {
4617         return
4618                aSign
4619             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4620                  == 0 );
4621     }
4622     return
4623           aSign ? le128( b.high, b.low, a.high, a.low )
4624         : le128( a.high, a.low, b.high, b.low );
4625
4626 }
4627
4628 /*----------------------------------------------------------------------------
4629 | Returns 1 if the extended double-precision floating-point value `a' is
4630 | less than the corresponding value `b', and 0 otherwise.  The invalid
4631 | exception is raised if either operand is a NaN.  The comparison is performed
4632 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4633 *----------------------------------------------------------------------------*/
4634
4635 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4636 {
4637     flag aSign, bSign;
4638
4639     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4640               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4641          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4642               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4643        ) {
4644         float_raise( float_flag_invalid STATUS_VAR);
4645         return 0;
4646     }
4647     aSign = extractFloatx80Sign( a );
4648     bSign = extractFloatx80Sign( b );
4649     if ( aSign != bSign ) {
4650         return
4651                aSign
4652             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4653                  != 0 );
4654     }
4655     return
4656           aSign ? lt128( b.high, b.low, a.high, a.low )
4657         : lt128( a.high, a.low, b.high, b.low );
4658
4659 }
4660
4661 /*----------------------------------------------------------------------------
4662 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4663 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
4664 | either operand is a NaN.   The comparison is performed according to the
4665 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4666 *----------------------------------------------------------------------------*/
4667 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4668 {
4669     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4670               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4671          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4672               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4673        ) {
4674         float_raise( float_flag_invalid STATUS_VAR);
4675         return 1;
4676     }
4677     return 0;
4678 }
4679
4680 /*----------------------------------------------------------------------------
4681 | Returns 1 if the extended double-precision floating-point value `a' is
4682 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4683 | cause an exception.  The comparison is performed according to the IEC/IEEE
4684 | Standard for Binary Floating-Point Arithmetic.
4685 *----------------------------------------------------------------------------*/
4686
4687 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4688 {
4689
4690     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4691               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4692          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4693               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4694        ) {
4695         if (    floatx80_is_signaling_nan( a )
4696              || floatx80_is_signaling_nan( b ) ) {
4697             float_raise( float_flag_invalid STATUS_VAR);
4698         }
4699         return 0;
4700     }
4701     return
4702            ( a.low == b.low )
4703         && (    ( a.high == b.high )
4704              || (    ( a.low == 0 )
4705                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4706            );
4707
4708 }
4709
4710 /*----------------------------------------------------------------------------
4711 | Returns 1 if the extended double-precision floating-point value `a' is less
4712 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4713 | do not cause an exception.  Otherwise, the comparison is performed according
4714 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4715 *----------------------------------------------------------------------------*/
4716
4717 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4718 {
4719     flag aSign, bSign;
4720
4721     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4722               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4723          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4724               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4725        ) {
4726         if (    floatx80_is_signaling_nan( a )
4727              || floatx80_is_signaling_nan( b ) ) {
4728             float_raise( float_flag_invalid STATUS_VAR);
4729         }
4730         return 0;
4731     }
4732     aSign = extractFloatx80Sign( a );
4733     bSign = extractFloatx80Sign( b );
4734     if ( aSign != bSign ) {
4735         return
4736                aSign
4737             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4738                  == 0 );
4739     }
4740     return
4741           aSign ? le128( b.high, b.low, a.high, a.low )
4742         : le128( a.high, a.low, b.high, b.low );
4743
4744 }
4745
4746 /*----------------------------------------------------------------------------
4747 | Returns 1 if the extended double-precision floating-point value `a' is less
4748 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4749 | an exception.  Otherwise, the comparison is performed according to the
4750 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4751 *----------------------------------------------------------------------------*/
4752
4753 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4754 {
4755     flag aSign, bSign;
4756
4757     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4758               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4759          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4760               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4761        ) {
4762         if (    floatx80_is_signaling_nan( a )
4763              || floatx80_is_signaling_nan( b ) ) {
4764             float_raise( float_flag_invalid STATUS_VAR);
4765         }
4766         return 0;
4767     }
4768     aSign = extractFloatx80Sign( a );
4769     bSign = extractFloatx80Sign( b );
4770     if ( aSign != bSign ) {
4771         return
4772                aSign
4773             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4774                  != 0 );
4775     }
4776     return
4777           aSign ? lt128( b.high, b.low, a.high, a.low )
4778         : lt128( a.high, a.low, b.high, b.low );
4779
4780 }
4781
4782 /*----------------------------------------------------------------------------
4783 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4784 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
4785 | The comparison is performed according to the IEC/IEEE Standard for Binary
4786 | Floating-Point Arithmetic.
4787 *----------------------------------------------------------------------------*/
4788 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4789 {
4790     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4791               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4792          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4793               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4794        ) {
4795         if (    floatx80_is_signaling_nan( a )
4796              || floatx80_is_signaling_nan( b ) ) {
4797             float_raise( float_flag_invalid STATUS_VAR);
4798         }
4799         return 1;
4800     }
4801     return 0;
4802 }
4803
4804 /*----------------------------------------------------------------------------
4805 | Returns the result of converting the quadruple-precision floating-point
4806 | value `a' to the 32-bit two's complement integer format.  The conversion
4807 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4808 | Arithmetic---which means in particular that the conversion is rounded
4809 | according to the current rounding mode.  If `a' is a NaN, the largest
4810 | positive integer is returned.  Otherwise, if the conversion overflows, the
4811 | largest integer with the same sign as `a' is returned.
4812 *----------------------------------------------------------------------------*/
4813
4814 int32 float128_to_int32( float128 a STATUS_PARAM )
4815 {
4816     flag aSign;
4817     int32 aExp, shiftCount;
4818     uint64_t aSig0, aSig1;
4819
4820     aSig1 = extractFloat128Frac1( a );
4821     aSig0 = extractFloat128Frac0( a );
4822     aExp = extractFloat128Exp( a );
4823     aSign = extractFloat128Sign( a );
4824     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4825     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4826     aSig0 |= ( aSig1 != 0 );
4827     shiftCount = 0x4028 - aExp;
4828     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4829     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4830
4831 }
4832
4833 /*----------------------------------------------------------------------------
4834 | Returns the result of converting the quadruple-precision floating-point
4835 | value `a' to the 32-bit two's complement integer format.  The conversion
4836 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4837 | Arithmetic, except that the conversion is always rounded toward zero.  If
4838 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4839 | conversion overflows, the largest integer with the same sign as `a' is
4840 | returned.
4841 *----------------------------------------------------------------------------*/
4842
4843 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4844 {
4845     flag aSign;
4846     int32 aExp, shiftCount;
4847     uint64_t aSig0, aSig1, savedASig;
4848     int32 z;
4849
4850     aSig1 = extractFloat128Frac1( a );
4851     aSig0 = extractFloat128Frac0( a );
4852     aExp = extractFloat128Exp( a );
4853     aSign = extractFloat128Sign( a );
4854     aSig0 |= ( aSig1 != 0 );
4855     if ( 0x401E < aExp ) {
4856         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4857         goto invalid;
4858     }
4859     else if ( aExp < 0x3FFF ) {
4860         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4861         return 0;
4862     }
4863     aSig0 |= LIT64( 0x0001000000000000 );
4864     shiftCount = 0x402F - aExp;
4865     savedASig = aSig0;
4866     aSig0 >>= shiftCount;
4867     z = aSig0;
4868     if ( aSign ) z = - z;
4869     if ( ( z < 0 ) ^ aSign ) {
4870  invalid:
4871         float_raise( float_flag_invalid STATUS_VAR);
4872         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4873     }
4874     if ( ( aSig0<<shiftCount ) != savedASig ) {
4875         STATUS(float_exception_flags) |= float_flag_inexact;
4876     }
4877     return z;
4878
4879 }
4880
4881 /*----------------------------------------------------------------------------
4882 | Returns the result of converting the quadruple-precision floating-point
4883 | value `a' to the 64-bit two's complement integer format.  The conversion
4884 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4885 | Arithmetic---which means in particular that the conversion is rounded
4886 | according to the current rounding mode.  If `a' is a NaN, the largest
4887 | positive integer is returned.  Otherwise, if the conversion overflows, the
4888 | largest integer with the same sign as `a' is returned.
4889 *----------------------------------------------------------------------------*/
4890
4891 int64 float128_to_int64( float128 a STATUS_PARAM )
4892 {
4893     flag aSign;
4894     int32 aExp, shiftCount;
4895     uint64_t aSig0, aSig1;
4896
4897     aSig1 = extractFloat128Frac1( a );
4898     aSig0 = extractFloat128Frac0( a );
4899     aExp = extractFloat128Exp( a );
4900     aSign = extractFloat128Sign( a );
4901     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4902     shiftCount = 0x402F - aExp;
4903     if ( shiftCount <= 0 ) {
4904         if ( 0x403E < aExp ) {
4905             float_raise( float_flag_invalid STATUS_VAR);
4906             if (    ! aSign
4907                  || (    ( aExp == 0x7FFF )
4908                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4909                     )
4910                ) {
4911                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4912             }
4913             return (int64_t) LIT64( 0x8000000000000000 );
4914         }
4915         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4916     }
4917     else {
4918         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4919     }
4920     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4921
4922 }
4923
4924 /*----------------------------------------------------------------------------
4925 | Returns the result of converting the quadruple-precision floating-point
4926 | value `a' to the 64-bit two's complement integer format.  The conversion
4927 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4928 | Arithmetic, except that the conversion is always rounded toward zero.
4929 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4930 | the conversion overflows, the largest integer with the same sign as `a' is
4931 | returned.
4932 *----------------------------------------------------------------------------*/
4933
4934 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4935 {
4936     flag aSign;
4937     int32 aExp, shiftCount;
4938     uint64_t aSig0, aSig1;
4939     int64 z;
4940
4941     aSig1 = extractFloat128Frac1( a );
4942     aSig0 = extractFloat128Frac0( a );
4943     aExp = extractFloat128Exp( a );
4944     aSign = extractFloat128Sign( a );
4945     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4946     shiftCount = aExp - 0x402F;
4947     if ( 0 < shiftCount ) {
4948         if ( 0x403E <= aExp ) {
4949             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4950             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4951                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4952                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4953             }
4954             else {
4955                 float_raise( float_flag_invalid STATUS_VAR);
4956                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4957                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4958                 }
4959             }
4960             return (int64_t) LIT64( 0x8000000000000000 );
4961         }
4962         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4963         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4964             STATUS(float_exception_flags) |= float_flag_inexact;
4965         }
4966     }
4967     else {
4968         if ( aExp < 0x3FFF ) {
4969             if ( aExp | aSig0 | aSig1 ) {
4970                 STATUS(float_exception_flags) |= float_flag_inexact;
4971             }
4972             return 0;
4973         }
4974         z = aSig0>>( - shiftCount );
4975         if (    aSig1
4976              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4977             STATUS(float_exception_flags) |= float_flag_inexact;
4978         }
4979     }
4980     if ( aSign ) z = - z;
4981     return z;
4982
4983 }
4984
4985 /*----------------------------------------------------------------------------
4986 | Returns the result of converting the quadruple-precision floating-point
4987 | value `a' to the single-precision floating-point format.  The conversion
4988 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4989 | Arithmetic.
4990 *----------------------------------------------------------------------------*/
4991
4992 float32 float128_to_float32( float128 a STATUS_PARAM )
4993 {
4994     flag aSign;
4995     int32 aExp;
4996     uint64_t aSig0, aSig1;
4997     uint32_t zSig;
4998
4999     aSig1 = extractFloat128Frac1( a );
5000     aSig0 = extractFloat128Frac0( a );
5001     aExp = extractFloat128Exp( a );
5002     aSign = extractFloat128Sign( a );
5003     if ( aExp == 0x7FFF ) {
5004         if ( aSig0 | aSig1 ) {
5005             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5006         }
5007         return packFloat32( aSign, 0xFF, 0 );
5008     }
5009     aSig0 |= ( aSig1 != 0 );
5010     shift64RightJamming( aSig0, 18, &aSig0 );
5011     zSig = aSig0;
5012     if ( aExp || zSig ) {
5013         zSig |= 0x40000000;
5014         aExp -= 0x3F81;
5015     }
5016     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5017
5018 }
5019
5020 /*----------------------------------------------------------------------------
5021 | Returns the result of converting the quadruple-precision floating-point
5022 | value `a' to the double-precision floating-point format.  The conversion
5023 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5024 | Arithmetic.
5025 *----------------------------------------------------------------------------*/
5026
5027 float64 float128_to_float64( float128 a STATUS_PARAM )
5028 {
5029     flag aSign;
5030     int32 aExp;
5031     uint64_t aSig0, aSig1;
5032
5033     aSig1 = extractFloat128Frac1( a );
5034     aSig0 = extractFloat128Frac0( a );
5035     aExp = extractFloat128Exp( a );
5036     aSign = extractFloat128Sign( a );
5037     if ( aExp == 0x7FFF ) {
5038         if ( aSig0 | aSig1 ) {
5039             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5040         }
5041         return packFloat64( aSign, 0x7FF, 0 );
5042     }
5043     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5044     aSig0 |= ( aSig1 != 0 );
5045     if ( aExp || aSig0 ) {
5046         aSig0 |= LIT64( 0x4000000000000000 );
5047         aExp -= 0x3C01;
5048     }
5049     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5050
5051 }
5052
5053 /*----------------------------------------------------------------------------
5054 | Returns the result of converting the quadruple-precision floating-point
5055 | value `a' to the extended double-precision floating-point format.  The
5056 | conversion is performed according to the IEC/IEEE Standard for Binary
5057 | Floating-Point Arithmetic.
5058 *----------------------------------------------------------------------------*/
5059
5060 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5061 {
5062     flag aSign;
5063     int32 aExp;
5064     uint64_t aSig0, aSig1;
5065
5066     aSig1 = extractFloat128Frac1( a );
5067     aSig0 = extractFloat128Frac0( a );
5068     aExp = extractFloat128Exp( a );
5069     aSign = extractFloat128Sign( a );
5070     if ( aExp == 0x7FFF ) {
5071         if ( aSig0 | aSig1 ) {
5072             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5073         }
5074         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5075     }
5076     if ( aExp == 0 ) {
5077         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5078         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5079     }
5080     else {
5081         aSig0 |= LIT64( 0x0001000000000000 );
5082     }
5083     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5084     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5085
5086 }
5087
5088 /*----------------------------------------------------------------------------
5089 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5090 | returns the result as a quadruple-precision floating-point value.  The
5091 | operation is performed according to the IEC/IEEE Standard for Binary
5092 | Floating-Point Arithmetic.
5093 *----------------------------------------------------------------------------*/
5094
5095 float128 float128_round_to_int( float128 a STATUS_PARAM )
5096 {
5097     flag aSign;
5098     int32 aExp;
5099     uint64_t lastBitMask, roundBitsMask;
5100     int8 roundingMode;
5101     float128 z;
5102
5103     aExp = extractFloat128Exp( a );
5104     if ( 0x402F <= aExp ) {
5105         if ( 0x406F <= aExp ) {
5106             if (    ( aExp == 0x7FFF )
5107                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5108                ) {
5109                 return propagateFloat128NaN( a, a STATUS_VAR );
5110             }
5111             return a;
5112         }
5113         lastBitMask = 1;
5114         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5115         roundBitsMask = lastBitMask - 1;
5116         z = a;
5117         roundingMode = STATUS(float_rounding_mode);
5118         if ( roundingMode == float_round_nearest_even ) {
5119             if ( lastBitMask ) {
5120                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5121                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5122             }
5123             else {
5124                 if ( (int64_t) z.low < 0 ) {
5125                     ++z.high;
5126                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5127                 }
5128             }
5129         }
5130         else if ( roundingMode != float_round_to_zero ) {
5131             if (   extractFloat128Sign( z )
5132                  ^ ( roundingMode == float_round_up ) ) {
5133                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5134             }
5135         }
5136         z.low &= ~ roundBitsMask;
5137     }
5138     else {
5139         if ( aExp < 0x3FFF ) {
5140             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5141             STATUS(float_exception_flags) |= float_flag_inexact;
5142             aSign = extractFloat128Sign( a );
5143             switch ( STATUS(float_rounding_mode) ) {
5144              case float_round_nearest_even:
5145                 if (    ( aExp == 0x3FFE )
5146                      && (   extractFloat128Frac0( a )
5147                           | extractFloat128Frac1( a ) )
5148                    ) {
5149                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5150                 }
5151                 break;
5152              case float_round_down:
5153                 return
5154                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5155                     : packFloat128( 0, 0, 0, 0 );
5156              case float_round_up:
5157                 return
5158                       aSign ? packFloat128( 1, 0, 0, 0 )
5159                     : packFloat128( 0, 0x3FFF, 0, 0 );
5160             }
5161             return packFloat128( aSign, 0, 0, 0 );
5162         }
5163         lastBitMask = 1;
5164         lastBitMask <<= 0x402F - aExp;
5165         roundBitsMask = lastBitMask - 1;
5166         z.low = 0;
5167         z.high = a.high;
5168         roundingMode = STATUS(float_rounding_mode);
5169         if ( roundingMode == float_round_nearest_even ) {
5170             z.high += lastBitMask>>1;
5171             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5172                 z.high &= ~ lastBitMask;
5173             }
5174         }
5175         else if ( roundingMode != float_round_to_zero ) {
5176             if (   extractFloat128Sign( z )
5177                  ^ ( roundingMode == float_round_up ) ) {
5178                 z.high |= ( a.low != 0 );
5179                 z.high += roundBitsMask;
5180             }
5181         }
5182         z.high &= ~ roundBitsMask;
5183     }
5184     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5185         STATUS(float_exception_flags) |= float_flag_inexact;
5186     }
5187     return z;
5188
5189 }
5190
5191 /*----------------------------------------------------------------------------
5192 | Returns the result of adding the absolute values of the quadruple-precision
5193 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5194 | before being returned.  `zSign' is ignored if the result is a NaN.
5195 | The addition is performed according to the IEC/IEEE Standard for Binary
5196 | Floating-Point Arithmetic.
5197 *----------------------------------------------------------------------------*/
5198
5199 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5200 {
5201     int32 aExp, bExp, zExp;
5202     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5203     int32 expDiff;
5204
5205     aSig1 = extractFloat128Frac1( a );
5206     aSig0 = extractFloat128Frac0( a );
5207     aExp = extractFloat128Exp( a );
5208     bSig1 = extractFloat128Frac1( b );
5209     bSig0 = extractFloat128Frac0( b );
5210     bExp = extractFloat128Exp( b );
5211     expDiff = aExp - bExp;
5212     if ( 0 < expDiff ) {
5213         if ( aExp == 0x7FFF ) {
5214             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5215             return a;
5216         }
5217         if ( bExp == 0 ) {
5218             --expDiff;
5219         }
5220         else {
5221             bSig0 |= LIT64( 0x0001000000000000 );
5222         }
5223         shift128ExtraRightJamming(
5224             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5225         zExp = aExp;
5226     }
5227     else if ( expDiff < 0 ) {
5228         if ( bExp == 0x7FFF ) {
5229             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5230             return packFloat128( zSign, 0x7FFF, 0, 0 );
5231         }
5232         if ( aExp == 0 ) {
5233             ++expDiff;
5234         }
5235         else {
5236             aSig0 |= LIT64( 0x0001000000000000 );
5237         }
5238         shift128ExtraRightJamming(
5239             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5240         zExp = bExp;
5241     }
5242     else {
5243         if ( aExp == 0x7FFF ) {
5244             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5245                 return propagateFloat128NaN( a, b STATUS_VAR );
5246             }
5247             return a;
5248         }
5249         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5250         if ( aExp == 0 ) {
5251             if (STATUS(flush_to_zero)) {
5252                 if (zSig0 | zSig1) {
5253                     float_raise(float_flag_output_denormal STATUS_VAR);
5254                 }
5255                 return packFloat128(zSign, 0, 0, 0);
5256             }
5257             return packFloat128( zSign, 0, zSig0, zSig1 );
5258         }
5259         zSig2 = 0;
5260         zSig0 |= LIT64( 0x0002000000000000 );
5261         zExp = aExp;
5262         goto shiftRight1;
5263     }
5264     aSig0 |= LIT64( 0x0001000000000000 );
5265     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5266     --zExp;
5267     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5268     ++zExp;
5269  shiftRight1:
5270     shift128ExtraRightJamming(
5271         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5272  roundAndPack:
5273     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5274
5275 }
5276
5277 /*----------------------------------------------------------------------------
5278 | Returns the result of subtracting the absolute values of the quadruple-
5279 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5280 | difference is negated before being returned.  `zSign' is ignored if the
5281 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5282 | Standard for Binary Floating-Point Arithmetic.
5283 *----------------------------------------------------------------------------*/
5284
5285 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5286 {
5287     int32 aExp, bExp, zExp;
5288     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5289     int32 expDiff;
5290     float128 z;
5291
5292     aSig1 = extractFloat128Frac1( a );
5293     aSig0 = extractFloat128Frac0( a );
5294     aExp = extractFloat128Exp( a );
5295     bSig1 = extractFloat128Frac1( b );
5296     bSig0 = extractFloat128Frac0( b );
5297     bExp = extractFloat128Exp( b );
5298     expDiff = aExp - bExp;
5299     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5300     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5301     if ( 0 < expDiff ) goto aExpBigger;
5302     if ( expDiff < 0 ) goto bExpBigger;
5303     if ( aExp == 0x7FFF ) {
5304         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5305             return propagateFloat128NaN( a, b STATUS_VAR );
5306         }
5307         float_raise( float_flag_invalid STATUS_VAR);
5308         z.low = float128_default_nan_low;
5309         z.high = float128_default_nan_high;
5310         return z;
5311     }
5312     if ( aExp == 0 ) {
5313         aExp = 1;
5314         bExp = 1;
5315     }
5316     if ( bSig0 < aSig0 ) goto aBigger;
5317     if ( aSig0 < bSig0 ) goto bBigger;
5318     if ( bSig1 < aSig1 ) goto aBigger;
5319     if ( aSig1 < bSig1 ) goto bBigger;
5320     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5321  bExpBigger:
5322     if ( bExp == 0x7FFF ) {
5323         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5324         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5325     }
5326     if ( aExp == 0 ) {
5327         ++expDiff;
5328     }
5329     else {
5330         aSig0 |= LIT64( 0x4000000000000000 );
5331     }
5332     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5333     bSig0 |= LIT64( 0x4000000000000000 );
5334  bBigger:
5335     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5336     zExp = bExp;
5337     zSign ^= 1;
5338     goto normalizeRoundAndPack;
5339  aExpBigger:
5340     if ( aExp == 0x7FFF ) {
5341         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5342         return a;
5343     }
5344     if ( bExp == 0 ) {
5345         --expDiff;
5346     }
5347     else {
5348         bSig0 |= LIT64( 0x4000000000000000 );
5349     }
5350     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5351     aSig0 |= LIT64( 0x4000000000000000 );
5352  aBigger:
5353     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5354     zExp = aExp;
5355  normalizeRoundAndPack:
5356     --zExp;
5357     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5358
5359 }
5360
5361 /*----------------------------------------------------------------------------
5362 | Returns the result of adding the quadruple-precision floating-point values
5363 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5364 | for Binary Floating-Point Arithmetic.
5365 *----------------------------------------------------------------------------*/
5366
5367 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5368 {
5369     flag aSign, bSign;
5370
5371     aSign = extractFloat128Sign( a );
5372     bSign = extractFloat128Sign( b );
5373     if ( aSign == bSign ) {
5374         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5375     }
5376     else {
5377         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5378     }
5379
5380 }
5381
5382 /*----------------------------------------------------------------------------
5383 | Returns the result of subtracting the quadruple-precision floating-point
5384 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5385 | Standard for Binary Floating-Point Arithmetic.
5386 *----------------------------------------------------------------------------*/
5387
5388 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5389 {
5390     flag aSign, bSign;
5391
5392     aSign = extractFloat128Sign( a );
5393     bSign = extractFloat128Sign( b );
5394     if ( aSign == bSign ) {
5395         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5396     }
5397     else {
5398         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5399     }
5400
5401 }
5402
5403 /*----------------------------------------------------------------------------
5404 | Returns the result of multiplying the quadruple-precision floating-point
5405 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5406 | Standard for Binary Floating-Point Arithmetic.
5407 *----------------------------------------------------------------------------*/
5408
5409 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5410 {
5411     flag aSign, bSign, zSign;
5412     int32 aExp, bExp, zExp;
5413     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5414     float128 z;
5415
5416     aSig1 = extractFloat128Frac1( a );
5417     aSig0 = extractFloat128Frac0( a );
5418     aExp = extractFloat128Exp( a );
5419     aSign = extractFloat128Sign( a );
5420     bSig1 = extractFloat128Frac1( b );
5421     bSig0 = extractFloat128Frac0( b );
5422     bExp = extractFloat128Exp( b );
5423     bSign = extractFloat128Sign( b );
5424     zSign = aSign ^ bSign;
5425     if ( aExp == 0x7FFF ) {
5426         if (    ( aSig0 | aSig1 )
5427              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5428             return propagateFloat128NaN( a, b STATUS_VAR );
5429         }
5430         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5431         return packFloat128( zSign, 0x7FFF, 0, 0 );
5432     }
5433     if ( bExp == 0x7FFF ) {
5434         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5435         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5436  invalid:
5437             float_raise( float_flag_invalid STATUS_VAR);
5438             z.low = float128_default_nan_low;
5439             z.high = float128_default_nan_high;
5440             return z;
5441         }
5442         return packFloat128( zSign, 0x7FFF, 0, 0 );
5443     }
5444     if ( aExp == 0 ) {
5445         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5446         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5447     }
5448     if ( bExp == 0 ) {
5449         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5450         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5451     }
5452     zExp = aExp + bExp - 0x4000;
5453     aSig0 |= LIT64( 0x0001000000000000 );
5454     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5455     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5456     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5457     zSig2 |= ( zSig3 != 0 );
5458     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5459         shift128ExtraRightJamming(
5460             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5461         ++zExp;
5462     }
5463     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5464
5465 }
5466
5467 /*----------------------------------------------------------------------------
5468 | Returns the result of dividing the quadruple-precision floating-point value
5469 | `a' by the corresponding value `b'.  The operation is performed according to
5470 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5471 *----------------------------------------------------------------------------*/
5472
5473 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5474 {
5475     flag aSign, bSign, zSign;
5476     int32 aExp, bExp, zExp;
5477     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5478     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5479     float128 z;
5480
5481     aSig1 = extractFloat128Frac1( a );
5482     aSig0 = extractFloat128Frac0( a );
5483     aExp = extractFloat128Exp( a );
5484     aSign = extractFloat128Sign( a );
5485     bSig1 = extractFloat128Frac1( b );
5486     bSig0 = extractFloat128Frac0( b );
5487     bExp = extractFloat128Exp( b );
5488     bSign = extractFloat128Sign( b );
5489     zSign = aSign ^ bSign;
5490     if ( aExp == 0x7FFF ) {
5491         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5492         if ( bExp == 0x7FFF ) {
5493             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5494             goto invalid;
5495         }
5496         return packFloat128( zSign, 0x7FFF, 0, 0 );
5497     }
5498     if ( bExp == 0x7FFF ) {
5499         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5500         return packFloat128( zSign, 0, 0, 0 );
5501     }
5502     if ( bExp == 0 ) {
5503         if ( ( bSig0 | bSig1 ) == 0 ) {
5504             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5505  invalid:
5506                 float_raise( float_flag_invalid STATUS_VAR);
5507                 z.low = float128_default_nan_low;
5508                 z.high = float128_default_nan_high;
5509                 return z;
5510             }
5511             float_raise( float_flag_divbyzero STATUS_VAR);
5512             return packFloat128( zSign, 0x7FFF, 0, 0 );
5513         }
5514         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5515     }
5516     if ( aExp == 0 ) {
5517         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5518         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5519     }
5520     zExp = aExp - bExp + 0x3FFD;
5521     shortShift128Left(
5522         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5523     shortShift128Left(
5524         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5525     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5526         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5527         ++zExp;
5528     }
5529     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5530     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5531     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5532     while ( (int64_t) rem0 < 0 ) {
5533         --zSig0;
5534         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5535     }
5536     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5537     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5538         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5539         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5540         while ( (int64_t) rem1 < 0 ) {
5541             --zSig1;
5542             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5543         }
5544         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5545     }
5546     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5547     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5548
5549 }
5550
5551 /*----------------------------------------------------------------------------
5552 | Returns the remainder of the quadruple-precision floating-point value `a'
5553 | with respect to the corresponding value `b'.  The operation is performed
5554 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5555 *----------------------------------------------------------------------------*/
5556
5557 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5558 {
5559     flag aSign, zSign;
5560     int32 aExp, bExp, expDiff;
5561     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5562     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5563     int64_t sigMean0;
5564     float128 z;
5565
5566     aSig1 = extractFloat128Frac1( a );
5567     aSig0 = extractFloat128Frac0( a );
5568     aExp = extractFloat128Exp( a );
5569     aSign = extractFloat128Sign( a );
5570     bSig1 = extractFloat128Frac1( b );
5571     bSig0 = extractFloat128Frac0( b );
5572     bExp = extractFloat128Exp( b );
5573     if ( aExp == 0x7FFF ) {
5574         if (    ( aSig0 | aSig1 )
5575              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5576             return propagateFloat128NaN( a, b STATUS_VAR );
5577         }
5578         goto invalid;
5579     }
5580     if ( bExp == 0x7FFF ) {
5581         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5582         return a;
5583     }
5584     if ( bExp == 0 ) {
5585         if ( ( bSig0 | bSig1 ) == 0 ) {
5586  invalid:
5587             float_raise( float_flag_invalid STATUS_VAR);
5588             z.low = float128_default_nan_low;
5589             z.high = float128_default_nan_high;
5590             return z;
5591         }
5592         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5593     }
5594     if ( aExp == 0 ) {
5595         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5596         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5597     }
5598     expDiff = aExp - bExp;
5599     if ( expDiff < -1 ) return a;
5600     shortShift128Left(
5601         aSig0 | LIT64( 0x0001000000000000 ),
5602         aSig1,
5603         15 - ( expDiff < 0 ),
5604         &aSig0,
5605         &aSig1
5606     );
5607     shortShift128Left(
5608         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5609     q = le128( bSig0, bSig1, aSig0, aSig1 );
5610     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5611     expDiff -= 64;
5612     while ( 0 < expDiff ) {
5613         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5614         q = ( 4 < q ) ? q - 4 : 0;
5615         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5616         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5617         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5618         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5619         expDiff -= 61;
5620     }
5621     if ( -64 < expDiff ) {
5622         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5623         q = ( 4 < q ) ? q - 4 : 0;
5624         q >>= - expDiff;
5625         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5626         expDiff += 52;
5627         if ( expDiff < 0 ) {
5628             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5629         }
5630         else {
5631             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5632         }
5633         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5634         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5635     }
5636     else {
5637         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5638         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5639     }
5640     do {
5641         alternateASig0 = aSig0;
5642         alternateASig1 = aSig1;
5643         ++q;
5644         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5645     } while ( 0 <= (int64_t) aSig0 );
5646     add128(
5647         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5648     if (    ( sigMean0 < 0 )
5649          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5650         aSig0 = alternateASig0;
5651         aSig1 = alternateASig1;
5652     }
5653     zSign = ( (int64_t) aSig0 < 0 );
5654     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5655     return
5656         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5657
5658 }
5659
5660 /*----------------------------------------------------------------------------
5661 | Returns the square root of the quadruple-precision floating-point value `a'.
5662 | The operation is performed according to the IEC/IEEE Standard for Binary
5663 | Floating-Point Arithmetic.
5664 *----------------------------------------------------------------------------*/
5665
5666 float128 float128_sqrt( float128 a STATUS_PARAM )
5667 {
5668     flag aSign;
5669     int32 aExp, zExp;
5670     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5671     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5672     float128 z;
5673
5674     aSig1 = extractFloat128Frac1( a );
5675     aSig0 = extractFloat128Frac0( a );
5676     aExp = extractFloat128Exp( a );
5677     aSign = extractFloat128Sign( a );
5678     if ( aExp == 0x7FFF ) {
5679         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5680         if ( ! aSign ) return a;
5681         goto invalid;
5682     }
5683     if ( aSign ) {
5684         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5685  invalid:
5686         float_raise( float_flag_invalid STATUS_VAR);
5687         z.low = float128_default_nan_low;
5688         z.high = float128_default_nan_high;
5689         return z;
5690     }
5691     if ( aExp == 0 ) {
5692         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5693         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5694     }
5695     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5696     aSig0 |= LIT64( 0x0001000000000000 );
5697     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5698     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5699     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5700     doubleZSig0 = zSig0<<1;
5701     mul64To128( zSig0, zSig0, &term0, &term1 );
5702     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5703     while ( (int64_t) rem0 < 0 ) {
5704         --zSig0;
5705         doubleZSig0 -= 2;
5706         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5707     }
5708     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5709     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5710         if ( zSig1 == 0 ) zSig1 = 1;
5711         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5712         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5713         mul64To128( zSig1, zSig1, &term2, &term3 );
5714         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5715         while ( (int64_t) rem1 < 0 ) {
5716             --zSig1;
5717             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5718             term3 |= 1;
5719             term2 |= doubleZSig0;
5720             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5721         }
5722         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5723     }
5724     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5725     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5726
5727 }
5728
5729 /*----------------------------------------------------------------------------
5730 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5731 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5732 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5733 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5734 *----------------------------------------------------------------------------*/
5735
5736 int float128_eq( float128 a, float128 b STATUS_PARAM )
5737 {
5738
5739     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5740               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5741          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5742               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5743        ) {
5744         float_raise( float_flag_invalid STATUS_VAR);
5745         return 0;
5746     }
5747     return
5748            ( a.low == b.low )
5749         && (    ( a.high == b.high )
5750              || (    ( a.low == 0 )
5751                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5752            );
5753
5754 }
5755
5756 /*----------------------------------------------------------------------------
5757 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5758 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
5759 | exception is raised if either operand is a NaN.  The comparison is performed
5760 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5761 *----------------------------------------------------------------------------*/
5762
5763 int float128_le( float128 a, float128 b STATUS_PARAM )
5764 {
5765     flag aSign, bSign;
5766
5767     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5768               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5769          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5770               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5771        ) {
5772         float_raise( float_flag_invalid STATUS_VAR);
5773         return 0;
5774     }
5775     aSign = extractFloat128Sign( a );
5776     bSign = extractFloat128Sign( b );
5777     if ( aSign != bSign ) {
5778         return
5779                aSign
5780             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5781                  == 0 );
5782     }
5783     return
5784           aSign ? le128( b.high, b.low, a.high, a.low )
5785         : le128( a.high, a.low, b.high, b.low );
5786
5787 }
5788
5789 /*----------------------------------------------------------------------------
5790 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5791 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5792 | raised if either operand is a NaN.  The comparison is performed according
5793 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5794 *----------------------------------------------------------------------------*/
5795
5796 int float128_lt( float128 a, float128 b STATUS_PARAM )
5797 {
5798     flag aSign, bSign;
5799
5800     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5801               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5802          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5803               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5804        ) {
5805         float_raise( float_flag_invalid STATUS_VAR);
5806         return 0;
5807     }
5808     aSign = extractFloat128Sign( a );
5809     bSign = extractFloat128Sign( b );
5810     if ( aSign != bSign ) {
5811         return
5812                aSign
5813             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5814                  != 0 );
5815     }
5816     return
5817           aSign ? lt128( b.high, b.low, a.high, a.low )
5818         : lt128( a.high, a.low, b.high, b.low );
5819
5820 }
5821
5822 /*----------------------------------------------------------------------------
5823 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5824 | be compared, and 0 otherwise.  The invalid exception is raised if either
5825 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5826 | Standard for Binary Floating-Point Arithmetic.
5827 *----------------------------------------------------------------------------*/
5828
5829 int float128_unordered( float128 a, float128 b STATUS_PARAM )
5830 {
5831     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5832               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5833          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5834               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5835        ) {
5836         float_raise( float_flag_invalid STATUS_VAR);
5837         return 1;
5838     }
5839     return 0;
5840 }
5841
5842 /*----------------------------------------------------------------------------
5843 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5844 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5845 | exception.  The comparison is performed according to the IEC/IEEE Standard
5846 | for Binary Floating-Point Arithmetic.
5847 *----------------------------------------------------------------------------*/
5848
5849 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
5850 {
5851
5852     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5853               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5854          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5855               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5856        ) {
5857         if (    float128_is_signaling_nan( a )
5858              || float128_is_signaling_nan( b ) ) {
5859             float_raise( float_flag_invalid STATUS_VAR);
5860         }
5861         return 0;
5862     }
5863     return
5864            ( a.low == b.low )
5865         && (    ( a.high == b.high )
5866              || (    ( a.low == 0 )
5867                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5868            );
5869
5870 }
5871
5872 /*----------------------------------------------------------------------------
5873 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5874 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5875 | cause an exception.  Otherwise, the comparison is performed according to the
5876 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5877 *----------------------------------------------------------------------------*/
5878
5879 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5880 {
5881     flag aSign, bSign;
5882
5883     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5884               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5885          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5886               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5887        ) {
5888         if (    float128_is_signaling_nan( a )
5889              || float128_is_signaling_nan( b ) ) {
5890             float_raise( float_flag_invalid STATUS_VAR);
5891         }
5892         return 0;
5893     }
5894     aSign = extractFloat128Sign( a );
5895     bSign = extractFloat128Sign( b );
5896     if ( aSign != bSign ) {
5897         return
5898                aSign
5899             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5900                  == 0 );
5901     }
5902     return
5903           aSign ? le128( b.high, b.low, a.high, a.low )
5904         : le128( a.high, a.low, b.high, b.low );
5905
5906 }
5907
5908 /*----------------------------------------------------------------------------
5909 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5910 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5911 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5912 | Standard for Binary Floating-Point Arithmetic.
5913 *----------------------------------------------------------------------------*/
5914
5915 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5916 {
5917     flag aSign, bSign;
5918
5919     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5920               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5921          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5922               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5923        ) {
5924         if (    float128_is_signaling_nan( a )
5925              || float128_is_signaling_nan( b ) ) {
5926             float_raise( float_flag_invalid STATUS_VAR);
5927         }
5928         return 0;
5929     }
5930     aSign = extractFloat128Sign( a );
5931     bSign = extractFloat128Sign( b );
5932     if ( aSign != bSign ) {
5933         return
5934                aSign
5935             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5936                  != 0 );
5937     }
5938     return
5939           aSign ? lt128( b.high, b.low, a.high, a.low )
5940         : lt128( a.high, a.low, b.high, b.low );
5941
5942 }
5943
5944 /*----------------------------------------------------------------------------
5945 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5946 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5947 | comparison is performed according to the IEC/IEEE Standard for Binary
5948 | Floating-Point Arithmetic.
5949 *----------------------------------------------------------------------------*/
5950
5951 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
5952 {
5953     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5954               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5955          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5956               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5957        ) {
5958         if (    float128_is_signaling_nan( a )
5959              || float128_is_signaling_nan( b ) ) {
5960             float_raise( float_flag_invalid STATUS_VAR);
5961         }
5962         return 1;
5963     }
5964     return 0;
5965 }
5966
5967 /* misc functions */
5968 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5969 {
5970     return int64_to_float32(a STATUS_VAR);
5971 }
5972
5973 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5974 {
5975     return int64_to_float64(a STATUS_VAR);
5976 }
5977
5978 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5979 {
5980     int64_t v;
5981     unsigned int res;
5982
5983     v = float32_to_int64(a STATUS_VAR);
5984     if (v < 0) {
5985         res = 0;
5986         float_raise( float_flag_invalid STATUS_VAR);
5987     } else if (v > 0xffffffff) {
5988         res = 0xffffffff;
5989         float_raise( float_flag_invalid STATUS_VAR);
5990     } else {
5991         res = v;
5992     }
5993     return res;
5994 }
5995
5996 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5997 {
5998     int64_t v;
5999     unsigned int res;
6000
6001     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6002     if (v < 0) {
6003         res = 0;
6004         float_raise( float_flag_invalid STATUS_VAR);
6005     } else if (v > 0xffffffff) {
6006         res = 0xffffffff;
6007         float_raise( float_flag_invalid STATUS_VAR);
6008     } else {
6009         res = v;
6010     }
6011     return res;
6012 }
6013
6014 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6015 {
6016     int64_t v;
6017     unsigned int res;
6018
6019     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6020     if (v < 0) {
6021         res = 0;
6022         float_raise( float_flag_invalid STATUS_VAR);
6023     } else if (v > 0xffff) {
6024         res = 0xffff;
6025         float_raise( float_flag_invalid STATUS_VAR);
6026     } else {
6027         res = v;
6028     }
6029     return res;
6030 }
6031
6032 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
6033 {
6034     int64_t v;
6035     unsigned int res;
6036
6037     v = float64_to_int64(a STATUS_VAR);
6038     if (v < 0) {
6039         res = 0;
6040         float_raise( float_flag_invalid STATUS_VAR);
6041     } else if (v > 0xffffffff) {
6042         res = 0xffffffff;
6043         float_raise( float_flag_invalid STATUS_VAR);
6044     } else {
6045         res = v;
6046     }
6047     return res;
6048 }
6049
6050 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6051 {
6052     int64_t v;
6053     unsigned int res;
6054
6055     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6056     if (v < 0) {
6057         res = 0;
6058         float_raise( float_flag_invalid STATUS_VAR);
6059     } else if (v > 0xffffffff) {
6060         res = 0xffffffff;
6061         float_raise( float_flag_invalid STATUS_VAR);
6062     } else {
6063         res = v;
6064     }
6065     return res;
6066 }
6067
6068 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6069 {
6070     int64_t v;
6071     unsigned int res;
6072
6073     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6074     if (v < 0) {
6075         res = 0;
6076         float_raise( float_flag_invalid STATUS_VAR);
6077     } else if (v > 0xffff) {
6078         res = 0xffff;
6079         float_raise( float_flag_invalid STATUS_VAR);
6080     } else {
6081         res = v;
6082     }
6083     return res;
6084 }
6085
6086 /* FIXME: This looks broken.  */
6087 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6088 {
6089     int64_t v;
6090
6091     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6092     v += float64_val(a);
6093     v = float64_to_int64(make_float64(v) STATUS_VAR);
6094
6095     return v - INT64_MIN;
6096 }
6097
6098 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6099 {
6100     int64_t v;
6101
6102     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6103     v += float64_val(a);
6104     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6105
6106     return v - INT64_MIN;
6107 }
6108
6109 #define COMPARE(s, nan_exp)                                                  \
6110 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6111                                       int is_quiet STATUS_PARAM )            \
6112 {                                                                            \
6113     flag aSign, bSign;                                                       \
6114     uint ## s ## _t av, bv;                                                  \
6115     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6116     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6117                                                                              \
6118     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6119          extractFloat ## s ## Frac( a ) ) ||                                 \
6120         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6121           extractFloat ## s ## Frac( b ) )) {                                \
6122         if (!is_quiet ||                                                     \
6123             float ## s ## _is_signaling_nan( a ) ||                          \
6124             float ## s ## _is_signaling_nan( b ) ) {                         \
6125             float_raise( float_flag_invalid STATUS_VAR);                     \
6126         }                                                                    \
6127         return float_relation_unordered;                                     \
6128     }                                                                        \
6129     aSign = extractFloat ## s ## Sign( a );                                  \
6130     bSign = extractFloat ## s ## Sign( b );                                  \
6131     av = float ## s ## _val(a);                                              \
6132     bv = float ## s ## _val(b);                                              \
6133     if ( aSign != bSign ) {                                                  \
6134         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6135             /* zero case */                                                  \
6136             return float_relation_equal;                                     \
6137         } else {                                                             \
6138             return 1 - (2 * aSign);                                          \
6139         }                                                                    \
6140     } else {                                                                 \
6141         if (av == bv) {                                                      \
6142             return float_relation_equal;                                     \
6143         } else {                                                             \
6144             return 1 - 2 * (aSign ^ ( av < bv ));                            \
6145         }                                                                    \
6146     }                                                                        \
6147 }                                                                            \
6148                                                                              \
6149 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6150 {                                                                            \
6151     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6152 }                                                                            \
6153                                                                              \
6154 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6155 {                                                                            \
6156     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6157 }
6158
6159 COMPARE(32, 0xff)
6160 COMPARE(64, 0x7ff)
6161
6162 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6163                                       int is_quiet STATUS_PARAM )
6164 {
6165     flag aSign, bSign;
6166
6167     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6168           ( extractFloatx80Frac( a )<<1 ) ) ||
6169         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6170           ( extractFloatx80Frac( b )<<1 ) )) {
6171         if (!is_quiet ||
6172             floatx80_is_signaling_nan( a ) ||
6173             floatx80_is_signaling_nan( b ) ) {
6174             float_raise( float_flag_invalid STATUS_VAR);
6175         }
6176         return float_relation_unordered;
6177     }
6178     aSign = extractFloatx80Sign( a );
6179     bSign = extractFloatx80Sign( b );
6180     if ( aSign != bSign ) {
6181
6182         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6183              ( ( a.low | b.low ) == 0 ) ) {
6184             /* zero case */
6185             return float_relation_equal;
6186         } else {
6187             return 1 - (2 * aSign);
6188         }
6189     } else {
6190         if (a.low == b.low && a.high == b.high) {
6191             return float_relation_equal;
6192         } else {
6193             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6194         }
6195     }
6196 }
6197
6198 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6199 {
6200     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6201 }
6202
6203 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6204 {
6205     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6206 }
6207
6208 INLINE int float128_compare_internal( float128 a, float128 b,
6209                                       int is_quiet STATUS_PARAM )
6210 {
6211     flag aSign, bSign;
6212
6213     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6214           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6215         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6216           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6217         if (!is_quiet ||
6218             float128_is_signaling_nan( a ) ||
6219             float128_is_signaling_nan( b ) ) {
6220             float_raise( float_flag_invalid STATUS_VAR);
6221         }
6222         return float_relation_unordered;
6223     }
6224     aSign = extractFloat128Sign( a );
6225     bSign = extractFloat128Sign( b );
6226     if ( aSign != bSign ) {
6227         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6228             /* zero case */
6229             return float_relation_equal;
6230         } else {
6231             return 1 - (2 * aSign);
6232         }
6233     } else {
6234         if (a.low == b.low && a.high == b.high) {
6235             return float_relation_equal;
6236         } else {
6237             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6238         }
6239     }
6240 }
6241
6242 int float128_compare( float128 a, float128 b STATUS_PARAM )
6243 {
6244     return float128_compare_internal(a, b, 0 STATUS_VAR);
6245 }
6246
6247 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6248 {
6249     return float128_compare_internal(a, b, 1 STATUS_VAR);
6250 }
6251
6252 /* min() and max() functions. These can't be implemented as
6253  * 'compare and pick one input' because that would mishandle
6254  * NaNs and +0 vs -0.
6255  */
6256 #define MINMAX(s, nan_exp)                                              \
6257 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6258                                         int ismin STATUS_PARAM )        \
6259 {                                                                       \
6260     flag aSign, bSign;                                                  \
6261     uint ## s ## _t av, bv;                                             \
6262     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6263     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6264     if (float ## s ## _is_any_nan(a) ||                                 \
6265         float ## s ## _is_any_nan(b)) {                                 \
6266         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6267     }                                                                   \
6268     aSign = extractFloat ## s ## Sign(a);                               \
6269     bSign = extractFloat ## s ## Sign(b);                               \
6270     av = float ## s ## _val(a);                                         \
6271     bv = float ## s ## _val(b);                                         \
6272     if (aSign != bSign) {                                               \
6273         if (ismin) {                                                    \
6274             return aSign ? a : b;                                       \
6275         } else {                                                        \
6276             return aSign ? b : a;                                       \
6277         }                                                               \
6278     } else {                                                            \
6279         if (ismin) {                                                    \
6280             return (aSign ^ (av < bv)) ? a : b;                         \
6281         } else {                                                        \
6282             return (aSign ^ (av < bv)) ? b : a;                         \
6283         }                                                               \
6284     }                                                                   \
6285 }                                                                       \
6286                                                                         \
6287 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6288 {                                                                       \
6289     return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6290 }                                                                       \
6291                                                                         \
6292 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6293 {                                                                       \
6294     return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6295 }
6296
6297 MINMAX(32, 0xff)
6298 MINMAX(64, 0x7ff)
6299
6300
6301 /* Multiply A by 2 raised to the power N.  */
6302 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6303 {
6304     flag aSign;
6305     int16_t aExp;
6306     uint32_t aSig;
6307
6308     a = float32_squash_input_denormal(a STATUS_VAR);
6309     aSig = extractFloat32Frac( a );
6310     aExp = extractFloat32Exp( a );
6311     aSign = extractFloat32Sign( a );
6312
6313     if ( aExp == 0xFF ) {
6314         if ( aSig ) {
6315             return propagateFloat32NaN( a, a STATUS_VAR );
6316         }
6317         return a;
6318     }
6319     if ( aExp != 0 )
6320         aSig |= 0x00800000;
6321     else if ( aSig == 0 )
6322         return a;
6323
6324     if (n > 0x200) {
6325         n = 0x200;
6326     } else if (n < -0x200) {
6327         n = -0x200;
6328     }
6329
6330     aExp += n - 1;
6331     aSig <<= 7;
6332     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6333 }
6334
6335 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6336 {
6337     flag aSign;
6338     int16_t aExp;
6339     uint64_t aSig;
6340
6341     a = float64_squash_input_denormal(a STATUS_VAR);
6342     aSig = extractFloat64Frac( a );
6343     aExp = extractFloat64Exp( a );
6344     aSign = extractFloat64Sign( a );
6345
6346     if ( aExp == 0x7FF ) {
6347         if ( aSig ) {
6348             return propagateFloat64NaN( a, a STATUS_VAR );
6349         }
6350         return a;
6351     }
6352     if ( aExp != 0 )
6353         aSig |= LIT64( 0x0010000000000000 );
6354     else if ( aSig == 0 )
6355         return a;
6356
6357     if (n > 0x1000) {
6358         n = 0x1000;
6359     } else if (n < -0x1000) {
6360         n = -0x1000;
6361     }
6362
6363     aExp += n - 1;
6364     aSig <<= 10;
6365     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6366 }
6367
6368 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6369 {
6370     flag aSign;
6371     int32_t aExp;
6372     uint64_t aSig;
6373
6374     aSig = extractFloatx80Frac( a );
6375     aExp = extractFloatx80Exp( a );
6376     aSign = extractFloatx80Sign( a );
6377
6378     if ( aExp == 0x7FFF ) {
6379         if ( aSig<<1 ) {
6380             return propagateFloatx80NaN( a, a STATUS_VAR );
6381         }
6382         return a;
6383     }
6384
6385     if (aExp == 0 && aSig == 0)
6386         return a;
6387
6388     if (n > 0x10000) {
6389         n = 0x10000;
6390     } else if (n < -0x10000) {
6391         n = -0x10000;
6392     }
6393
6394     aExp += n;
6395     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6396                                           aSign, aExp, aSig, 0 STATUS_VAR );
6397 }
6398
6399 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6400 {
6401     flag aSign;
6402     int32_t aExp;
6403     uint64_t aSig0, aSig1;
6404
6405     aSig1 = extractFloat128Frac1( a );
6406     aSig0 = extractFloat128Frac0( a );
6407     aExp = extractFloat128Exp( a );
6408     aSign = extractFloat128Sign( a );
6409     if ( aExp == 0x7FFF ) {
6410         if ( aSig0 | aSig1 ) {
6411             return propagateFloat128NaN( a, a STATUS_VAR );
6412         }
6413         return a;
6414     }
6415     if ( aExp != 0 )
6416         aSig0 |= LIT64( 0x0001000000000000 );
6417     else if ( aSig0 == 0 && aSig1 == 0 )
6418         return a;
6419
6420     if (n > 0x10000) {
6421         n = 0x10000;
6422     } else if (n < -0x10000) {
6423         n = -0x10000;
6424     }
6425
6426     aExp += n - 1;
6427     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6428                                           STATUS_VAR );
6429
6430 }