fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * Derived from SoftFloat.
   5  */
   6
   7 /*============================================================================
   8
   9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  10 Package, Release 2b.
  11
  12 Written by John R. Hauser.  This work was made possible in part by the
  13 International Computer Science Institute, located at Suite 600, 1947 Center
  14 Street, Berkeley, California 94704.  Funding was partially provided by the
  15 National Science Foundation under grant MIP-9311980.  The original version
  16 of this code was written as part of a project to build a fixed-point vector
  17 processor in collaboration with the University of California at Berkeley,
  18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  20 arithmetic/SoftFloat.html'.
  21
  22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  30
  31 Derivative works are acceptable, even for commercial purposes, so long as
  32 (1) the source code for the derivative work includes prominent notice that
  33 the work is derivative, and (2) the source code includes prominent notice with
  34 these four paragraphs for those parts of this code that are retained.
  35
  36 =============================================================================*/
  37
  38 /* softfloat (and in particular the code in softfloat-specialize.h) is
  39  * target-dependent and needs the TARGET_* macros.
  40  */
  41 #include "config.h"
  42
  43 #include "softfloat.h"
  44
  45 /*----------------------------------------------------------------------------
  46 | Primitive arithmetic functions, including multi-word arithmetic, and
  47 | division and square root approximations.  (Can be specialized to target if
  48 | desired.)
  49 *----------------------------------------------------------------------------*/
  50 #include "softfloat-macros.h"
  51
  52 /*----------------------------------------------------------------------------
  53 | Functions and definitions to determine:  (1) whether tininess for underflow
  54 | is detected before or after rounding by default, (2) what (if anything)
  55 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  56 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  57 | are propagated from function inputs to output.  These details are target-
  58 | specific.
  59 *----------------------------------------------------------------------------*/
  60 #include "softfloat-specialize.h"
  61
  62 void set_float_rounding_mode(int val STATUS_PARAM)
  63 {
  64     STATUS(float_rounding_mode) = val;
  65 }
  66
  67 void set_float_exception_flags(int val STATUS_PARAM)
  68 {
  69     STATUS(float_exception_flags) = val;
  70 }
  71
  72 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  73 {
  74     STATUS(floatx80_rounding_precision) = val;
  75 }
  76
  77 /*----------------------------------------------------------------------------
  78 | Returns the fraction bits of the half-precision floating-point value `a'.
  79 *----------------------------------------------------------------------------*/
  80
  81 INLINE uint32_t extractFloat16Frac(float16 a)
  82 {
  83     return float16_val(a) & 0x3ff;
  84 }
  85
  86 /*----------------------------------------------------------------------------
  87 | Returns the exponent bits of the half-precision floating-point value `a'.
  88 *----------------------------------------------------------------------------*/
  89
  90 INLINE int16 extractFloat16Exp(float16 a)
  91 {
  92     return (float16_val(a) >> 10) & 0x1f;
  93 }
  94
  95 /*----------------------------------------------------------------------------
  96 | Returns the sign bit of the single-precision floating-point value `a'.
  97 *----------------------------------------------------------------------------*/
  98
  99 INLINE flag extractFloat16Sign(float16 a)
 100 {
 101     return float16_val(a)>>15;
 102 }
 103
 104 /*----------------------------------------------------------------------------
 105 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 106 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 107 | input.  If `zSign' is 1, the input is negated before being converted to an
 108 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 109 | is simply rounded to an integer, with the inexact exception raised if the
 110 | input cannot be represented exactly as an integer.  However, if the fixed-
 111 | point input is too large, the invalid exception is raised and the largest
 112 | positive or negative integer is returned.
 113 *----------------------------------------------------------------------------*/
 114
 115 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
 116 {
 117     int8 roundingMode;
 118     flag roundNearestEven;
 119     int8 roundIncrement, roundBits;
 120     int32 z;
 121
 122     roundingMode = STATUS(float_rounding_mode);
 123     roundNearestEven = ( roundingMode == float_round_nearest_even );
 124     roundIncrement = 0x40;
 125     if ( ! roundNearestEven ) {
 126         if ( roundingMode == float_round_to_zero ) {
 127             roundIncrement = 0;
 128         }
 129         else {
 130             roundIncrement = 0x7F;
 131             if ( zSign ) {
 132                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 133             }
 134             else {
 135                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 136             }
 137         }
 138     }
 139     roundBits = absZ & 0x7F;
 140     absZ = ( absZ + roundIncrement )>>7;
 141     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 142     z = absZ;
 143     if ( zSign ) z = - z;
 144     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 145         float_raise( float_flag_invalid STATUS_VAR);
 146         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 147     }
 148     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 149     return z;
 150
 151 }
 152
 153 /*----------------------------------------------------------------------------
 154 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 155 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 156 | and returns the properly rounded 64-bit integer corresponding to the input.
 157 | If `zSign' is 1, the input is negated before being converted to an integer.
 158 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 159 | the inexact exception raised if the input cannot be represented exactly as
 160 | an integer.  However, if the fixed-point input is too large, the invalid
 161 | exception is raised and the largest positive or negative integer is
 162 | returned.
 163 *----------------------------------------------------------------------------*/
 164
 165 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
 166 {
 167     int8 roundingMode;
 168     flag roundNearestEven, increment;
 169     int64 z;
 170
 171     roundingMode = STATUS(float_rounding_mode);
 172     roundNearestEven = ( roundingMode == float_round_nearest_even );
 173     increment = ( (int64_t) absZ1 < 0 );
 174     if ( ! roundNearestEven ) {
 175         if ( roundingMode == float_round_to_zero ) {
 176             increment = 0;
 177         }
 178         else {
 179             if ( zSign ) {
 180                 increment = ( roundingMode == float_round_down ) && absZ1;
 181             }
 182             else {
 183                 increment = ( roundingMode == float_round_up ) && absZ1;
 184             }
 185         }
 186     }
 187     if ( increment ) {
 188         ++absZ0;
 189         if ( absZ0 == 0 ) goto overflow;
 190         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 191     }
 192     z = absZ0;
 193     if ( zSign ) z = - z;
 194     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 195  overflow:
 196         float_raise( float_flag_invalid STATUS_VAR);
 197         return
 198               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 199             : LIT64( 0x7FFFFFFFFFFFFFFF );
 200     }
 201     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 202     return z;
 203
 204 }
 205
 206 /*----------------------------------------------------------------------------
 207 | Returns the fraction bits of the single-precision floating-point value `a'.
 208 *----------------------------------------------------------------------------*/
 209
 210 INLINE uint32_t extractFloat32Frac( float32 a )
 211 {
 212
 213     return float32_val(a) & 0x007FFFFF;
 214
 215 }
 216
 217 /*----------------------------------------------------------------------------
 218 | Returns the exponent bits of the single-precision floating-point value `a'.
 219 *----------------------------------------------------------------------------*/
 220
 221 INLINE int16 extractFloat32Exp( float32 a )
 222 {
 223
 224     return ( float32_val(a)>>23 ) & 0xFF;
 225
 226 }
 227
 228 /*----------------------------------------------------------------------------
 229 | Returns the sign bit of the single-precision floating-point value `a'.
 230 *----------------------------------------------------------------------------*/
 231
 232 INLINE flag extractFloat32Sign( float32 a )
 233 {
 234
 235     return float32_val(a)>>31;
 236
 237 }
 238
 239 /*----------------------------------------------------------------------------
 240 | If `a' is denormal and we are in flush-to-zero mode then set the
 241 | input-denormal exception and return zero. Otherwise just return the value.
 242 *----------------------------------------------------------------------------*/
 243 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 244 {
 245     if (STATUS(flush_inputs_to_zero)) {
 246         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 247             float_raise(float_flag_input_denormal STATUS_VAR);
 248             return make_float32(float32_val(a) & 0x80000000);
 249         }
 250     }
 251     return a;
 252 }
 253
 254 /*----------------------------------------------------------------------------
 255 | Normalizes the subnormal single-precision floating-point value represented
 256 | by the denormalized significand `aSig'.  The normalized exponent and
 257 | significand are stored at the locations pointed to by `zExpPtr' and
 258 | `zSigPtr', respectively.
 259 *----------------------------------------------------------------------------*/
 260
 261 static void
 262  normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
 263 {
 264     int8 shiftCount;
 265
 266     shiftCount = countLeadingZeros32( aSig ) - 8;
 267     *zSigPtr = aSig<<shiftCount;
 268     *zExpPtr = 1 - shiftCount;
 269
 270 }
 271
 272 /*----------------------------------------------------------------------------
 273 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 274 | single-precision floating-point value, returning the result.  After being
 275 | shifted into the proper positions, the three fields are simply added
 276 | together to form the result.  This means that any integer portion of `zSig'
 277 | will be added into the exponent.  Since a properly normalized significand
 278 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 279 | than the desired result exponent whenever `zSig' is a complete, normalized
 280 | significand.
 281 *----------------------------------------------------------------------------*/
 282
 283 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
 284 {
 285
 286     return make_float32(
 287           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 288
 289 }
 290
 291 /*----------------------------------------------------------------------------
 292 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 293 | and significand `zSig', and returns the proper single-precision floating-
 294 | point value corresponding to the abstract input.  Ordinarily, the abstract
 295 | value is simply rounded and packed into the single-precision format, with
 296 | the inexact exception raised if the abstract input cannot be represented
 297 | exactly.  However, if the abstract value is too large, the overflow and
 298 | inexact exceptions are raised and an infinity or maximal finite value is
 299 | returned.  If the abstract value is too small, the input value is rounded to
 300 | a subnormal number, and the underflow and inexact exceptions are raised if
 301 | the abstract input cannot be represented exactly as a subnormal single-
 302 | precision floating-point number.
 303 |     The input significand `zSig' has its binary point between bits 30
 304 | and 29, which is 7 bits to the left of the usual location.  This shifted
 305 | significand must be normalized or smaller.  If `zSig' is not normalized,
 306 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 307 | and it must not require rounding.  In the usual case that `zSig' is
 308 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 309 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 310 | Binary Floating-Point Arithmetic.
 311 *----------------------------------------------------------------------------*/
 312
 313 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
 314 {
 315     int8 roundingMode;
 316     flag roundNearestEven;
 317     int8 roundIncrement, roundBits;
 318     flag isTiny;
 319
 320     roundingMode = STATUS(float_rounding_mode);
 321     roundNearestEven = ( roundingMode == float_round_nearest_even );
 322     roundIncrement = 0x40;
 323     if ( ! roundNearestEven ) {
 324         if ( roundingMode == float_round_to_zero ) {
 325             roundIncrement = 0;
 326         }
 327         else {
 328             roundIncrement = 0x7F;
 329             if ( zSign ) {
 330                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 331             }
 332             else {
 333                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 334             }
 335         }
 336     }
 337     roundBits = zSig & 0x7F;
 338     if ( 0xFD <= (uint16_t) zExp ) {
 339         if (    ( 0xFD < zExp )
 340              || (    ( zExp == 0xFD )
 341                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 342            ) {
 343             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 344             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 345         }
 346         if ( zExp < 0 ) {
 347             if (STATUS(flush_to_zero)) {
 348                 float_raise(float_flag_output_denormal STATUS_VAR);
 349                 return packFloat32(zSign, 0, 0);
 350             }
 351             isTiny =
 352                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 353                 || ( zExp < -1 )
 354                 || ( zSig + roundIncrement < 0x80000000 );
 355             shift32RightJamming( zSig, - zExp, &zSig );
 356             zExp = 0;
 357             roundBits = zSig & 0x7F;
 358             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 359         }
 360     }
 361     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 362     zSig = ( zSig + roundIncrement )>>7;
 363     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 364     if ( zSig == 0 ) zExp = 0;
 365     return packFloat32( zSign, zExp, zSig );
 366
 367 }
 368
 369 /*----------------------------------------------------------------------------
 370 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 371 | and significand `zSig', and returns the proper single-precision floating-
 372 | point value corresponding to the abstract input.  This routine is just like
 373 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 374 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 375 | floating-point exponent.
 376 *----------------------------------------------------------------------------*/
 377
 378 static float32
 379  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
 380 {
 381     int8 shiftCount;
 382
 383     shiftCount = countLeadingZeros32( zSig ) - 1;
 384     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 385
 386 }
 387
 388 /*----------------------------------------------------------------------------
 389 | Returns the fraction bits of the double-precision floating-point value `a'.
 390 *----------------------------------------------------------------------------*/
 391
 392 INLINE uint64_t extractFloat64Frac( float64 a )
 393 {
 394
 395     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 396
 397 }
 398
 399 /*----------------------------------------------------------------------------
 400 | Returns the exponent bits of the double-precision floating-point value `a'.
 401 *----------------------------------------------------------------------------*/
 402
 403 INLINE int16 extractFloat64Exp( float64 a )
 404 {
 405
 406     return ( float64_val(a)>>52 ) & 0x7FF;
 407
 408 }
 409
 410 /*----------------------------------------------------------------------------
 411 | Returns the sign bit of the double-precision floating-point value `a'.
 412 *----------------------------------------------------------------------------*/
 413
 414 INLINE flag extractFloat64Sign( float64 a )
 415 {
 416
 417     return float64_val(a)>>63;
 418
 419 }
 420
 421 /*----------------------------------------------------------------------------
 422 | If `a' is denormal and we are in flush-to-zero mode then set the
 423 | input-denormal exception and return zero. Otherwise just return the value.
 424 *----------------------------------------------------------------------------*/
 425 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 426 {
 427     if (STATUS(flush_inputs_to_zero)) {
 428         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 429             float_raise(float_flag_input_denormal STATUS_VAR);
 430             return make_float64(float64_val(a) & (1ULL << 63));
 431         }
 432     }
 433     return a;
 434 }
 435
 436 /*----------------------------------------------------------------------------
 437 | Normalizes the subnormal double-precision floating-point value represented
 438 | by the denormalized significand `aSig'.  The normalized exponent and
 439 | significand are stored at the locations pointed to by `zExpPtr' and
 440 | `zSigPtr', respectively.
 441 *----------------------------------------------------------------------------*/
 442
 443 static void
 444  normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
 445 {
 446     int8 shiftCount;
 447
 448     shiftCount = countLeadingZeros64( aSig ) - 11;
 449     *zSigPtr = aSig<<shiftCount;
 450     *zExpPtr = 1 - shiftCount;
 451
 452 }
 453
 454 /*----------------------------------------------------------------------------
 455 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 456 | double-precision floating-point value, returning the result.  After being
 457 | shifted into the proper positions, the three fields are simply added
 458 | together to form the result.  This means that any integer portion of `zSig'
 459 | will be added into the exponent.  Since a properly normalized significand
 460 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 461 | than the desired result exponent whenever `zSig' is a complete, normalized
 462 | significand.
 463 *----------------------------------------------------------------------------*/
 464
 465 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
 466 {
 467
 468     return make_float64(
 469         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 470
 471 }
 472
 473 /*----------------------------------------------------------------------------
 474 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 475 | and significand `zSig', and returns the proper double-precision floating-
 476 | point value corresponding to the abstract input.  Ordinarily, the abstract
 477 | value is simply rounded and packed into the double-precision format, with
 478 | the inexact exception raised if the abstract input cannot be represented
 479 | exactly.  However, if the abstract value is too large, the overflow and
 480 | inexact exceptions are raised and an infinity or maximal finite value is
 481 | returned.  If the abstract value is too small, the input value is rounded
 482 | to a subnormal number, and the underflow and inexact exceptions are raised
 483 | if the abstract input cannot be represented exactly as a subnormal double-
 484 | precision floating-point number.
 485 |     The input significand `zSig' has its binary point between bits 62
 486 | and 61, which is 10 bits to the left of the usual location.  This shifted
 487 | significand must be normalized or smaller.  If `zSig' is not normalized,
 488 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 489 | and it must not require rounding.  In the usual case that `zSig' is
 490 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 491 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 492 | Binary Floating-Point Arithmetic.
 493 *----------------------------------------------------------------------------*/
 494
 495 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
 496 {
 497     int8 roundingMode;
 498     flag roundNearestEven;
 499     int16 roundIncrement, roundBits;
 500     flag isTiny;
 501
 502     roundingMode = STATUS(float_rounding_mode);
 503     roundNearestEven = ( roundingMode == float_round_nearest_even );
 504     roundIncrement = 0x200;
 505     if ( ! roundNearestEven ) {
 506         if ( roundingMode == float_round_to_zero ) {
 507             roundIncrement = 0;
 508         }
 509         else {
 510             roundIncrement = 0x3FF;
 511             if ( zSign ) {
 512                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 513             }
 514             else {
 515                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 516             }
 517         }
 518     }
 519     roundBits = zSig & 0x3FF;
 520     if ( 0x7FD <= (uint16_t) zExp ) {
 521         if (    ( 0x7FD < zExp )
 522              || (    ( zExp == 0x7FD )
 523                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 524            ) {
 525             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 526             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 527         }
 528         if ( zExp < 0 ) {
 529             if (STATUS(flush_to_zero)) {
 530                 float_raise(float_flag_output_denormal STATUS_VAR);
 531                 return packFloat64(zSign, 0, 0);
 532             }
 533             isTiny =
 534                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 535                 || ( zExp < -1 )
 536                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 537             shift64RightJamming( zSig, - zExp, &zSig );
 538             zExp = 0;
 539             roundBits = zSig & 0x3FF;
 540             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 541         }
 542     }
 543     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 544     zSig = ( zSig + roundIncrement )>>10;
 545     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 546     if ( zSig == 0 ) zExp = 0;
 547     return packFloat64( zSign, zExp, zSig );
 548
 549 }
 550
 551 /*----------------------------------------------------------------------------
 552 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 553 | and significand `zSig', and returns the proper double-precision floating-
 554 | point value corresponding to the abstract input.  This routine is just like
 555 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 556 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 557 | floating-point exponent.
 558 *----------------------------------------------------------------------------*/
 559
 560 static float64
 561  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
 562 {
 563     int8 shiftCount;
 564
 565     shiftCount = countLeadingZeros64( zSig ) - 1;
 566     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 567
 568 }
 569
 570 /*----------------------------------------------------------------------------
 571 | Returns the fraction bits of the extended double-precision floating-point
 572 | value `a'.
 573 *----------------------------------------------------------------------------*/
 574
 575 INLINE uint64_t extractFloatx80Frac( floatx80 a )
 576 {
 577
 578     return a.low;
 579
 580 }
 581
 582 /*----------------------------------------------------------------------------
 583 | Returns the exponent bits of the extended double-precision floating-point
 584 | value `a'.
 585 *----------------------------------------------------------------------------*/
 586
 587 INLINE int32 extractFloatx80Exp( floatx80 a )
 588 {
 589
 590     return a.high & 0x7FFF;
 591
 592 }
 593
 594 /*----------------------------------------------------------------------------
 595 | Returns the sign bit of the extended double-precision floating-point value
 596 | `a'.
 597 *----------------------------------------------------------------------------*/
 598
 599 INLINE flag extractFloatx80Sign( floatx80 a )
 600 {
 601
 602     return a.high>>15;
 603
 604 }
 605
 606 /*----------------------------------------------------------------------------
 607 | Normalizes the subnormal extended double-precision floating-point value
 608 | represented by the denormalized significand `aSig'.  The normalized exponent
 609 | and significand are stored at the locations pointed to by `zExpPtr' and
 610 | `zSigPtr', respectively.
 611 *----------------------------------------------------------------------------*/
 612
 613 static void
 614  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 615 {
 616     int8 shiftCount;
 617
 618     shiftCount = countLeadingZeros64( aSig );
 619     *zSigPtr = aSig<<shiftCount;
 620     *zExpPtr = 1 - shiftCount;
 621
 622 }
 623
 624 /*----------------------------------------------------------------------------
 625 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 626 | extended double-precision floating-point value, returning the result.
 627 *----------------------------------------------------------------------------*/
 628
 629 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 630 {
 631     floatx80 z;
 632
 633     z.low = zSig;
 634     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 635     return z;
 636
 637 }
 638
 639 /*----------------------------------------------------------------------------
 640 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 641 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 642 | and returns the proper extended double-precision floating-point value
 643 | corresponding to the abstract input.  Ordinarily, the abstract value is
 644 | rounded and packed into the extended double-precision format, with the
 645 | inexact exception raised if the abstract input cannot be represented
 646 | exactly.  However, if the abstract value is too large, the overflow and
 647 | inexact exceptions are raised and an infinity or maximal finite value is
 648 | returned.  If the abstract value is too small, the input value is rounded to
 649 | a subnormal number, and the underflow and inexact exceptions are raised if
 650 | the abstract input cannot be represented exactly as a subnormal extended
 651 | double-precision floating-point number.
 652 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 653 | number of bits as single or double precision, respectively.  Otherwise, the
 654 | result is rounded to the full precision of the extended double-precision
 655 | format.
 656 |     The input significand must be normalized or smaller.  If the input
 657 | significand is not normalized, `zExp' must be 0; in that case, the result
 658 | returned is a subnormal number, and it must not require rounding.  The
 659 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 660 | Floating-Point Arithmetic.
 661 *----------------------------------------------------------------------------*/
 662
 663 static floatx80
 664  roundAndPackFloatx80(
 665      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 666  STATUS_PARAM)
 667 {
 668     int8 roundingMode;
 669     flag roundNearestEven, increment, isTiny;
 670     int64 roundIncrement, roundMask, roundBits;
 671
 672     roundingMode = STATUS(float_rounding_mode);
 673     roundNearestEven = ( roundingMode == float_round_nearest_even );
 674     if ( roundingPrecision == 80 ) goto precision80;
 675     if ( roundingPrecision == 64 ) {
 676         roundIncrement = LIT64( 0x0000000000000400 );
 677         roundMask = LIT64( 0x00000000000007FF );
 678     }
 679     else if ( roundingPrecision == 32 ) {
 680         roundIncrement = LIT64( 0x0000008000000000 );
 681         roundMask = LIT64( 0x000000FFFFFFFFFF );
 682     }
 683     else {
 684         goto precision80;
 685     }
 686     zSig0 |= ( zSig1 != 0 );
 687     if ( ! roundNearestEven ) {
 688         if ( roundingMode == float_round_to_zero ) {
 689             roundIncrement = 0;
 690         }
 691         else {
 692             roundIncrement = roundMask;
 693             if ( zSign ) {
 694                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 695             }
 696             else {
 697                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 698             }
 699         }
 700     }
 701     roundBits = zSig0 & roundMask;
 702     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 703         if (    ( 0x7FFE < zExp )
 704              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 705            ) {
 706             goto overflow;
 707         }
 708         if ( zExp <= 0 ) {
 709             if (STATUS(flush_to_zero)) {
 710                 float_raise(float_flag_output_denormal STATUS_VAR);
 711                 return packFloatx80(zSign, 0, 0);
 712             }
 713             isTiny =
 714                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 715                 || ( zExp < 0 )
 716                 || ( zSig0 <= zSig0 + roundIncrement );
 717             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 718             zExp = 0;
 719             roundBits = zSig0 & roundMask;
 720             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 721             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 722             zSig0 += roundIncrement;
 723             if ( (int64_t) zSig0 < 0 ) zExp = 1;
 724             roundIncrement = roundMask + 1;
 725             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 726                 roundMask |= roundIncrement;
 727             }
 728             zSig0 &= ~ roundMask;
 729             return packFloatx80( zSign, zExp, zSig0 );
 730         }
 731     }
 732     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 733     zSig0 += roundIncrement;
 734     if ( zSig0 < roundIncrement ) {
 735         ++zExp;
 736         zSig0 = LIT64( 0x8000000000000000 );
 737     }
 738     roundIncrement = roundMask + 1;
 739     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 740         roundMask |= roundIncrement;
 741     }
 742     zSig0 &= ~ roundMask;
 743     if ( zSig0 == 0 ) zExp = 0;
 744     return packFloatx80( zSign, zExp, zSig0 );
 745  precision80:
 746     increment = ( (int64_t) zSig1 < 0 );
 747     if ( ! roundNearestEven ) {
 748         if ( roundingMode == float_round_to_zero ) {
 749             increment = 0;
 750         }
 751         else {
 752             if ( zSign ) {
 753                 increment = ( roundingMode == float_round_down ) && zSig1;
 754             }
 755             else {
 756                 increment = ( roundingMode == float_round_up ) && zSig1;
 757             }
 758         }
 759     }
 760     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 761         if (    ( 0x7FFE < zExp )
 762              || (    ( zExp == 0x7FFE )
 763                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 764                   && increment
 765                 )
 766            ) {
 767             roundMask = 0;
 768  overflow:
 769             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 770             if (    ( roundingMode == float_round_to_zero )
 771                  || ( zSign && ( roundingMode == float_round_up ) )
 772                  || ( ! zSign && ( roundingMode == float_round_down ) )
 773                ) {
 774                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 775             }
 776             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 777         }
 778         if ( zExp <= 0 ) {
 779             isTiny =
 780                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 781                 || ( zExp < 0 )
 782                 || ! increment
 783                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 784             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 785             zExp = 0;
 786             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 787             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 788             if ( roundNearestEven ) {
 789                 increment = ( (int64_t) zSig1 < 0 );
 790             }
 791             else {
 792                 if ( zSign ) {
 793                     increment = ( roundingMode == float_round_down ) && zSig1;
 794                 }
 795                 else {
 796                     increment = ( roundingMode == float_round_up ) && zSig1;
 797                 }
 798             }
 799             if ( increment ) {
 800                 ++zSig0;
 801                 zSig0 &=
 802                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 803                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
 804             }
 805             return packFloatx80( zSign, zExp, zSig0 );
 806         }
 807     }
 808     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 809     if ( increment ) {
 810         ++zSig0;
 811         if ( zSig0 == 0 ) {
 812             ++zExp;
 813             zSig0 = LIT64( 0x8000000000000000 );
 814         }
 815         else {
 816             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 817         }
 818     }
 819     else {
 820         if ( zSig0 == 0 ) zExp = 0;
 821     }
 822     return packFloatx80( zSign, zExp, zSig0 );
 823
 824 }
 825
 826 /*----------------------------------------------------------------------------
 827 | Takes an abstract floating-point value having sign `zSign', exponent
 828 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 829 | and returns the proper extended double-precision floating-point value
 830 | corresponding to the abstract input.  This routine is just like
 831 | `roundAndPackFloatx80' except that the input significand does not have to be
 832 | normalized.
 833 *----------------------------------------------------------------------------*/
 834
 835 static floatx80
 836  normalizeRoundAndPackFloatx80(
 837      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 838  STATUS_PARAM)
 839 {
 840     int8 shiftCount;
 841
 842     if ( zSig0 == 0 ) {
 843         zSig0 = zSig1;
 844         zSig1 = 0;
 845         zExp -= 64;
 846     }
 847     shiftCount = countLeadingZeros64( zSig0 );
 848     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 849     zExp -= shiftCount;
 850     return
 851         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 852
 853 }
 854
 855 /*----------------------------------------------------------------------------
 856 | Returns the least-significant 64 fraction bits of the quadruple-precision
 857 | floating-point value `a'.
 858 *----------------------------------------------------------------------------*/
 859
 860 INLINE uint64_t extractFloat128Frac1( float128 a )
 861 {
 862
 863     return a.low;
 864
 865 }
 866
 867 /*----------------------------------------------------------------------------
 868 | Returns the most-significant 48 fraction bits of the quadruple-precision
 869 | floating-point value `a'.
 870 *----------------------------------------------------------------------------*/
 871
 872 INLINE uint64_t extractFloat128Frac0( float128 a )
 873 {
 874
 875     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 876
 877 }
 878
 879 /*----------------------------------------------------------------------------
 880 | Returns the exponent bits of the quadruple-precision floating-point value
 881 | `a'.
 882 *----------------------------------------------------------------------------*/
 883
 884 INLINE int32 extractFloat128Exp( float128 a )
 885 {
 886
 887     return ( a.high>>48 ) & 0x7FFF;
 888
 889 }
 890
 891 /*----------------------------------------------------------------------------
 892 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 893 *----------------------------------------------------------------------------*/
 894
 895 INLINE flag extractFloat128Sign( float128 a )
 896 {
 897
 898     return a.high>>63;
 899
 900 }
 901
 902 /*----------------------------------------------------------------------------
 903 | Normalizes the subnormal quadruple-precision floating-point value
 904 | represented by the denormalized significand formed by the concatenation of
 905 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 906 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 907 | significand are stored at the location pointed to by `zSig0Ptr', and the
 908 | least significant 64 bits of the normalized significand are stored at the
 909 | location pointed to by `zSig1Ptr'.
 910 *----------------------------------------------------------------------------*/
 911
 912 static void
 913  normalizeFloat128Subnormal(
 914      uint64_t aSig0,
 915      uint64_t aSig1,
 916      int32 *zExpPtr,
 917      uint64_t *zSig0Ptr,
 918      uint64_t *zSig1Ptr
 919  )
 920 {
 921     int8 shiftCount;
 922
 923     if ( aSig0 == 0 ) {
 924         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 925         if ( shiftCount < 0 ) {
 926             *zSig0Ptr = aSig1>>( - shiftCount );
 927             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 928         }
 929         else {
 930             *zSig0Ptr = aSig1<<shiftCount;
 931             *zSig1Ptr = 0;
 932         }
 933         *zExpPtr = - shiftCount - 63;
 934     }
 935     else {
 936         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 937         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 938         *zExpPtr = 1 - shiftCount;
 939     }
 940
 941 }
 942
 943 /*----------------------------------------------------------------------------
 944 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 945 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 946 | floating-point value, returning the result.  After being shifted into the
 947 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 948 | added together to form the most significant 32 bits of the result.  This
 949 | means that any integer portion of `zSig0' will be added into the exponent.
 950 | Since a properly normalized significand will have an integer portion equal
 951 | to 1, the `zExp' input should be 1 less than the desired result exponent
 952 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 953 | significand.
 954 *----------------------------------------------------------------------------*/
 955
 956 INLINE float128
 957  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
 958 {
 959     float128 z;
 960
 961     z.low = zSig1;
 962     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
 963     return z;
 964
 965 }
 966
 967 /*----------------------------------------------------------------------------
 968 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 969 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 970 | and `zSig2', and returns the proper quadruple-precision floating-point value
 971 | corresponding to the abstract input.  Ordinarily, the abstract value is
 972 | simply rounded and packed into the quadruple-precision format, with the
 973 | inexact exception raised if the abstract input cannot be represented
 974 | exactly.  However, if the abstract value is too large, the overflow and
 975 | inexact exceptions are raised and an infinity or maximal finite value is
 976 | returned.  If the abstract value is too small, the input value is rounded to
 977 | a subnormal number, and the underflow and inexact exceptions are raised if
 978 | the abstract input cannot be represented exactly as a subnormal quadruple-
 979 | precision floating-point number.
 980 |     The input significand must be normalized or smaller.  If the input
 981 | significand is not normalized, `zExp' must be 0; in that case, the result
 982 | returned is a subnormal number, and it must not require rounding.  In the
 983 | usual case that the input significand is normalized, `zExp' must be 1 less
 984 | than the ``true'' floating-point exponent.  The handling of underflow and
 985 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 986 *----------------------------------------------------------------------------*/
 987
 988 static float128
 989  roundAndPackFloat128(
 990      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
 991 {
 992     int8 roundingMode;
 993     flag roundNearestEven, increment, isTiny;
 994
 995     roundingMode = STATUS(float_rounding_mode);
 996     roundNearestEven = ( roundingMode == float_round_nearest_even );
 997     increment = ( (int64_t) zSig2 < 0 );
 998     if ( ! roundNearestEven ) {
 999         if ( roundingMode == float_round_to_zero ) {
1000             increment = 0;
1001         }
1002         else {
1003             if ( zSign ) {
1004                 increment = ( roundingMode == float_round_down ) && zSig2;
1005             }
1006             else {
1007                 increment = ( roundingMode == float_round_up ) && zSig2;
1008             }
1009         }
1010     }
1011     if ( 0x7FFD <= (uint32_t) zExp ) {
1012         if (    ( 0x7FFD < zExp )
1013              || (    ( zExp == 0x7FFD )
1014                   && eq128(
1015                          LIT64( 0x0001FFFFFFFFFFFF ),
1016                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1017                          zSig0,
1018                          zSig1
1019                      )
1020                   && increment
1021                 )
1022            ) {
1023             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1024             if (    ( roundingMode == float_round_to_zero )
1025                  || ( zSign && ( roundingMode == float_round_up ) )
1026                  || ( ! zSign && ( roundingMode == float_round_down ) )
1027                ) {
1028                 return
1029                     packFloat128(
1030                         zSign,
1031                         0x7FFE,
1032                         LIT64( 0x0000FFFFFFFFFFFF ),
1033                         LIT64( 0xFFFFFFFFFFFFFFFF )
1034                     );
1035             }
1036             return packFloat128( zSign, 0x7FFF, 0, 0 );
1037         }
1038         if ( zExp < 0 ) {
1039             if (STATUS(flush_to_zero)) {
1040                 float_raise(float_flag_output_denormal STATUS_VAR);
1041                 return packFloat128(zSign, 0, 0, 0);
1042             }
1043             isTiny =
1044                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1045                 || ( zExp < -1 )
1046                 || ! increment
1047                 || lt128(
1048                        zSig0,
1049                        zSig1,
1050                        LIT64( 0x0001FFFFFFFFFFFF ),
1051                        LIT64( 0xFFFFFFFFFFFFFFFF )
1052                    );
1053             shift128ExtraRightJamming(
1054                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1055             zExp = 0;
1056             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1057             if ( roundNearestEven ) {
1058                 increment = ( (int64_t) zSig2 < 0 );
1059             }
1060             else {
1061                 if ( zSign ) {
1062                     increment = ( roundingMode == float_round_down ) && zSig2;
1063                 }
1064                 else {
1065                     increment = ( roundingMode == float_round_up ) && zSig2;
1066                 }
1067             }
1068         }
1069     }
1070     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1071     if ( increment ) {
1072         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1073         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1074     }
1075     else {
1076         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1077     }
1078     return packFloat128( zSign, zExp, zSig0, zSig1 );
1079
1080 }
1081
1082 /*----------------------------------------------------------------------------
1083 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1085 | returns the proper quadruple-precision floating-point value corresponding
1086 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1087 | except that the input significand has fewer bits and does not have to be
1088 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1089 | point exponent.
1090 *----------------------------------------------------------------------------*/
1091
1092 static float128
1093  normalizeRoundAndPackFloat128(
1094      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1095 {
1096     int8 shiftCount;
1097     uint64_t zSig2;
1098
1099     if ( zSig0 == 0 ) {
1100         zSig0 = zSig1;
1101         zSig1 = 0;
1102         zExp -= 64;
1103     }
1104     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105     if ( 0 <= shiftCount ) {
1106         zSig2 = 0;
1107         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1108     }
1109     else {
1110         shift128ExtraRightJamming(
1111             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1112     }
1113     zExp -= shiftCount;
1114     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1115
1116 }
1117
1118 /*----------------------------------------------------------------------------
1119 | Returns the result of converting the 32-bit two's complement integer `a'
1120 | to the single-precision floating-point format.  The conversion is performed
1121 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122 *----------------------------------------------------------------------------*/
1123
1124 float32 int32_to_float32( int32 a STATUS_PARAM )
1125 {
1126     flag zSign;
1127
1128     if ( a == 0 ) return float32_zero;
1129     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1130     zSign = ( a < 0 );
1131     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1132
1133 }
1134
1135 /*----------------------------------------------------------------------------
1136 | Returns the result of converting the 32-bit two's complement integer `a'
1137 | to the double-precision floating-point format.  The conversion is performed
1138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1140
1141 float64 int32_to_float64( int32 a STATUS_PARAM )
1142 {
1143     flag zSign;
1144     uint32 absA;
1145     int8 shiftCount;
1146     uint64_t zSig;
1147
1148     if ( a == 0 ) return float64_zero;
1149     zSign = ( a < 0 );
1150     absA = zSign ? - a : a;
1151     shiftCount = countLeadingZeros32( absA ) + 21;
1152     zSig = absA;
1153     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1154
1155 }
1156
1157 /*----------------------------------------------------------------------------
1158 | Returns the result of converting the 32-bit two's complement integer `a'
1159 | to the extended double-precision floating-point format.  The conversion
1160 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1161 | Arithmetic.
1162 *----------------------------------------------------------------------------*/
1163
1164 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1165 {
1166     flag zSign;
1167     uint32 absA;
1168     int8 shiftCount;
1169     uint64_t zSig;
1170
1171     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1172     zSign = ( a < 0 );
1173     absA = zSign ? - a : a;
1174     shiftCount = countLeadingZeros32( absA ) + 32;
1175     zSig = absA;
1176     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1177
1178 }
1179
1180 /*----------------------------------------------------------------------------
1181 | Returns the result of converting the 32-bit two's complement integer `a' to
1182 | the quadruple-precision floating-point format.  The conversion is performed
1183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184 *----------------------------------------------------------------------------*/
1185
1186 float128 int32_to_float128( int32 a STATUS_PARAM )
1187 {
1188     flag zSign;
1189     uint32 absA;
1190     int8 shiftCount;
1191     uint64_t zSig0;
1192
1193     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1194     zSign = ( a < 0 );
1195     absA = zSign ? - a : a;
1196     shiftCount = countLeadingZeros32( absA ) + 17;
1197     zSig0 = absA;
1198     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1199
1200 }
1201
1202 /*----------------------------------------------------------------------------
1203 | Returns the result of converting the 64-bit two's complement integer `a'
1204 | to the single-precision floating-point format.  The conversion is performed
1205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1206 *----------------------------------------------------------------------------*/
1207
1208 float32 int64_to_float32( int64 a STATUS_PARAM )
1209 {
1210     flag zSign;
1211     uint64 absA;
1212     int8 shiftCount;
1213
1214     if ( a == 0 ) return float32_zero;
1215     zSign = ( a < 0 );
1216     absA = zSign ? - a : a;
1217     shiftCount = countLeadingZeros64( absA ) - 40;
1218     if ( 0 <= shiftCount ) {
1219         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1220     }
1221     else {
1222         shiftCount += 7;
1223         if ( shiftCount < 0 ) {
1224             shift64RightJamming( absA, - shiftCount, &absA );
1225         }
1226         else {
1227             absA <<= shiftCount;
1228         }
1229         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1230     }
1231
1232 }
1233
1234 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1235 {
1236     int8 shiftCount;
1237
1238     if ( a == 0 ) return float32_zero;
1239     shiftCount = countLeadingZeros64( a ) - 40;
1240     if ( 0 <= shiftCount ) {
1241         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1242     }
1243     else {
1244         shiftCount += 7;
1245         if ( shiftCount < 0 ) {
1246             shift64RightJamming( a, - shiftCount, &a );
1247         }
1248         else {
1249             a <<= shiftCount;
1250         }
1251         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1252     }
1253 }
1254
1255 /*----------------------------------------------------------------------------
1256 | Returns the result of converting the 64-bit two's complement integer `a'
1257 | to the double-precision floating-point format.  The conversion is performed
1258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259 *----------------------------------------------------------------------------*/
1260
1261 float64 int64_to_float64( int64 a STATUS_PARAM )
1262 {
1263     flag zSign;
1264
1265     if ( a == 0 ) return float64_zero;
1266     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1267         return packFloat64( 1, 0x43E, 0 );
1268     }
1269     zSign = ( a < 0 );
1270     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1271
1272 }
1273
1274 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1275 {
1276     if ( a == 0 ) return float64_zero;
1277     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1278
1279 }
1280
1281 /*----------------------------------------------------------------------------
1282 | Returns the result of converting the 64-bit two's complement integer `a'
1283 | to the extended double-precision floating-point format.  The conversion
1284 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1285 | Arithmetic.
1286 *----------------------------------------------------------------------------*/
1287
1288 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1289 {
1290     flag zSign;
1291     uint64 absA;
1292     int8 shiftCount;
1293
1294     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1295     zSign = ( a < 0 );
1296     absA = zSign ? - a : a;
1297     shiftCount = countLeadingZeros64( absA );
1298     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1299
1300 }
1301
1302 /*----------------------------------------------------------------------------
1303 | Returns the result of converting the 64-bit two's complement integer `a' to
1304 | the quadruple-precision floating-point format.  The conversion is performed
1305 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1306 *----------------------------------------------------------------------------*/
1307
1308 float128 int64_to_float128( int64 a STATUS_PARAM )
1309 {
1310     flag zSign;
1311     uint64 absA;
1312     int8 shiftCount;
1313     int32 zExp;
1314     uint64_t zSig0, zSig1;
1315
1316     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1317     zSign = ( a < 0 );
1318     absA = zSign ? - a : a;
1319     shiftCount = countLeadingZeros64( absA ) + 49;
1320     zExp = 0x406E - shiftCount;
1321     if ( 64 <= shiftCount ) {
1322         zSig1 = 0;
1323         zSig0 = absA;
1324         shiftCount -= 64;
1325     }
1326     else {
1327         zSig1 = absA;
1328         zSig0 = 0;
1329     }
1330     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1331     return packFloat128( zSign, zExp, zSig0, zSig1 );
1332
1333 }
1334
1335 /*----------------------------------------------------------------------------
1336 | Returns the result of converting the single-precision floating-point value
1337 | `a' to the 32-bit two's complement integer format.  The conversion is
1338 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1339 | Arithmetic---which means in particular that the conversion is rounded
1340 | according to the current rounding mode.  If `a' is a NaN, the largest
1341 | positive integer is returned.  Otherwise, if the conversion overflows, the
1342 | largest integer with the same sign as `a' is returned.
1343 *----------------------------------------------------------------------------*/
1344
1345 int32 float32_to_int32( float32 a STATUS_PARAM )
1346 {
1347     flag aSign;
1348     int16 aExp, shiftCount;
1349     uint32_t aSig;
1350     uint64_t aSig64;
1351
1352     a = float32_squash_input_denormal(a STATUS_VAR);
1353     aSig = extractFloat32Frac( a );
1354     aExp = extractFloat32Exp( a );
1355     aSign = extractFloat32Sign( a );
1356     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1357     if ( aExp ) aSig |= 0x00800000;
1358     shiftCount = 0xAF - aExp;
1359     aSig64 = aSig;
1360     aSig64 <<= 32;
1361     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1362     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1363
1364 }
1365
1366 /*----------------------------------------------------------------------------
1367 | Returns the result of converting the single-precision floating-point value
1368 | `a' to the 32-bit two's complement integer format.  The conversion is
1369 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1370 | Arithmetic, except that the conversion is always rounded toward zero.
1371 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1372 | the conversion overflows, the largest integer with the same sign as `a' is
1373 | returned.
1374 *----------------------------------------------------------------------------*/
1375
1376 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1377 {
1378     flag aSign;
1379     int16 aExp, shiftCount;
1380     uint32_t aSig;
1381     int32 z;
1382     a = float32_squash_input_denormal(a STATUS_VAR);
1383
1384     aSig = extractFloat32Frac( a );
1385     aExp = extractFloat32Exp( a );
1386     aSign = extractFloat32Sign( a );
1387     shiftCount = aExp - 0x9E;
1388     if ( 0 <= shiftCount ) {
1389         if ( float32_val(a) != 0xCF000000 ) {
1390             float_raise( float_flag_invalid STATUS_VAR);
1391             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1392         }
1393         return (int32_t) 0x80000000;
1394     }
1395     else if ( aExp <= 0x7E ) {
1396         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1397         return 0;
1398     }
1399     aSig = ( aSig | 0x00800000 )<<8;
1400     z = aSig>>( - shiftCount );
1401     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1402         STATUS(float_exception_flags) |= float_flag_inexact;
1403     }
1404     if ( aSign ) z = - z;
1405     return z;
1406
1407 }
1408
1409 /*----------------------------------------------------------------------------
1410 | Returns the result of converting the single-precision floating-point value
1411 | `a' to the 16-bit two's complement integer format.  The conversion is
1412 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1413 | Arithmetic, except that the conversion is always rounded toward zero.
1414 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1415 | the conversion overflows, the largest integer with the same sign as `a' is
1416 | returned.
1417 *----------------------------------------------------------------------------*/
1418
1419 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1420 {
1421     flag aSign;
1422     int16 aExp, shiftCount;
1423     uint32_t aSig;
1424     int32 z;
1425
1426     aSig = extractFloat32Frac( a );
1427     aExp = extractFloat32Exp( a );
1428     aSign = extractFloat32Sign( a );
1429     shiftCount = aExp - 0x8E;
1430     if ( 0 <= shiftCount ) {
1431         if ( float32_val(a) != 0xC7000000 ) {
1432             float_raise( float_flag_invalid STATUS_VAR);
1433             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1434                 return 0x7FFF;
1435             }
1436         }
1437         return (int32_t) 0xffff8000;
1438     }
1439     else if ( aExp <= 0x7E ) {
1440         if ( aExp | aSig ) {
1441             STATUS(float_exception_flags) |= float_flag_inexact;
1442         }
1443         return 0;
1444     }
1445     shiftCount -= 0x10;
1446     aSig = ( aSig | 0x00800000 )<<8;
1447     z = aSig>>( - shiftCount );
1448     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1449         STATUS(float_exception_flags) |= float_flag_inexact;
1450     }
1451     if ( aSign ) {
1452         z = - z;
1453     }
1454     return z;
1455
1456 }
1457
1458 /*----------------------------------------------------------------------------
1459 | Returns the result of converting the single-precision floating-point value
1460 | `a' to the 64-bit two's complement integer format.  The conversion is
1461 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1462 | Arithmetic---which means in particular that the conversion is rounded
1463 | according to the current rounding mode.  If `a' is a NaN, the largest
1464 | positive integer is returned.  Otherwise, if the conversion overflows, the
1465 | largest integer with the same sign as `a' is returned.
1466 *----------------------------------------------------------------------------*/
1467
1468 int64 float32_to_int64( float32 a STATUS_PARAM )
1469 {
1470     flag aSign;
1471     int16 aExp, shiftCount;
1472     uint32_t aSig;
1473     uint64_t aSig64, aSigExtra;
1474     a = float32_squash_input_denormal(a STATUS_VAR);
1475
1476     aSig = extractFloat32Frac( a );
1477     aExp = extractFloat32Exp( a );
1478     aSign = extractFloat32Sign( a );
1479     shiftCount = 0xBE - aExp;
1480     if ( shiftCount < 0 ) {
1481         float_raise( float_flag_invalid STATUS_VAR);
1482         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1483             return LIT64( 0x7FFFFFFFFFFFFFFF );
1484         }
1485         return (int64_t) LIT64( 0x8000000000000000 );
1486     }
1487     if ( aExp ) aSig |= 0x00800000;
1488     aSig64 = aSig;
1489     aSig64 <<= 40;
1490     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1491     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1492
1493 }
1494
1495 /*----------------------------------------------------------------------------
1496 | Returns the result of converting the single-precision floating-point value
1497 | `a' to the 64-bit two's complement integer format.  The conversion is
1498 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1499 | Arithmetic, except that the conversion is always rounded toward zero.  If
1500 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1501 | conversion overflows, the largest integer with the same sign as `a' is
1502 | returned.
1503 *----------------------------------------------------------------------------*/
1504
1505 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1506 {
1507     flag aSign;
1508     int16 aExp, shiftCount;
1509     uint32_t aSig;
1510     uint64_t aSig64;
1511     int64 z;
1512     a = float32_squash_input_denormal(a STATUS_VAR);
1513
1514     aSig = extractFloat32Frac( a );
1515     aExp = extractFloat32Exp( a );
1516     aSign = extractFloat32Sign( a );
1517     shiftCount = aExp - 0xBE;
1518     if ( 0 <= shiftCount ) {
1519         if ( float32_val(a) != 0xDF000000 ) {
1520             float_raise( float_flag_invalid STATUS_VAR);
1521             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1522                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1523             }
1524         }
1525         return (int64_t) LIT64( 0x8000000000000000 );
1526     }
1527     else if ( aExp <= 0x7E ) {
1528         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1529         return 0;
1530     }
1531     aSig64 = aSig | 0x00800000;
1532     aSig64 <<= 40;
1533     z = aSig64>>( - shiftCount );
1534     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1535         STATUS(float_exception_flags) |= float_flag_inexact;
1536     }
1537     if ( aSign ) z = - z;
1538     return z;
1539
1540 }
1541
1542 /*----------------------------------------------------------------------------
1543 | Returns the result of converting the single-precision floating-point value
1544 | `a' to the double-precision floating-point format.  The conversion is
1545 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1546 | Arithmetic.
1547 *----------------------------------------------------------------------------*/
1548
1549 float64 float32_to_float64( float32 a STATUS_PARAM )
1550 {
1551     flag aSign;
1552     int16 aExp;
1553     uint32_t aSig;
1554     a = float32_squash_input_denormal(a STATUS_VAR);
1555
1556     aSig = extractFloat32Frac( a );
1557     aExp = extractFloat32Exp( a );
1558     aSign = extractFloat32Sign( a );
1559     if ( aExp == 0xFF ) {
1560         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1561         return packFloat64( aSign, 0x7FF, 0 );
1562     }
1563     if ( aExp == 0 ) {
1564         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1565         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1566         --aExp;
1567     }
1568     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1569
1570 }
1571
1572 /*----------------------------------------------------------------------------
1573 | Returns the result of converting the single-precision floating-point value
1574 | `a' to the extended double-precision floating-point format.  The conversion
1575 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1576 | Arithmetic.
1577 *----------------------------------------------------------------------------*/
1578
1579 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1580 {
1581     flag aSign;
1582     int16 aExp;
1583     uint32_t aSig;
1584
1585     a = float32_squash_input_denormal(a STATUS_VAR);
1586     aSig = extractFloat32Frac( a );
1587     aExp = extractFloat32Exp( a );
1588     aSign = extractFloat32Sign( a );
1589     if ( aExp == 0xFF ) {
1590         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1591         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1592     }
1593     if ( aExp == 0 ) {
1594         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1595         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1596     }
1597     aSig |= 0x00800000;
1598     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1599
1600 }
1601
1602 /*----------------------------------------------------------------------------
1603 | Returns the result of converting the single-precision floating-point value
1604 | `a' to the double-precision floating-point format.  The conversion is
1605 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1606 | Arithmetic.
1607 *----------------------------------------------------------------------------*/
1608
1609 float128 float32_to_float128( float32 a STATUS_PARAM )
1610 {
1611     flag aSign;
1612     int16 aExp;
1613     uint32_t aSig;
1614
1615     a = float32_squash_input_denormal(a STATUS_VAR);
1616     aSig = extractFloat32Frac( a );
1617     aExp = extractFloat32Exp( a );
1618     aSign = extractFloat32Sign( a );
1619     if ( aExp == 0xFF ) {
1620         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1621         return packFloat128( aSign, 0x7FFF, 0, 0 );
1622     }
1623     if ( aExp == 0 ) {
1624         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1625         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1626         --aExp;
1627     }
1628     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1629
1630 }
1631
1632 /*----------------------------------------------------------------------------
1633 | Rounds the single-precision floating-point value `a' to an integer, and
1634 | returns the result as a single-precision floating-point value.  The
1635 | operation is performed according to the IEC/IEEE Standard for Binary
1636 | Floating-Point Arithmetic.
1637 *----------------------------------------------------------------------------*/
1638
1639 float32 float32_round_to_int( float32 a STATUS_PARAM)
1640 {
1641     flag aSign;
1642     int16 aExp;
1643     uint32_t lastBitMask, roundBitsMask;
1644     int8 roundingMode;
1645     uint32_t z;
1646     a = float32_squash_input_denormal(a STATUS_VAR);
1647
1648     aExp = extractFloat32Exp( a );
1649     if ( 0x96 <= aExp ) {
1650         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1651             return propagateFloat32NaN( a, a STATUS_VAR );
1652         }
1653         return a;
1654     }
1655     if ( aExp <= 0x7E ) {
1656         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1657         STATUS(float_exception_flags) |= float_flag_inexact;
1658         aSign = extractFloat32Sign( a );
1659         switch ( STATUS(float_rounding_mode) ) {
1660          case float_round_nearest_even:
1661             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1662                 return packFloat32( aSign, 0x7F, 0 );
1663             }
1664             break;
1665          case float_round_down:
1666             return make_float32(aSign ? 0xBF800000 : 0);
1667          case float_round_up:
1668             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1669         }
1670         return packFloat32( aSign, 0, 0 );
1671     }
1672     lastBitMask = 1;
1673     lastBitMask <<= 0x96 - aExp;
1674     roundBitsMask = lastBitMask - 1;
1675     z = float32_val(a);
1676     roundingMode = STATUS(float_rounding_mode);
1677     if ( roundingMode == float_round_nearest_even ) {
1678         z += lastBitMask>>1;
1679         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1680     }
1681     else if ( roundingMode != float_round_to_zero ) {
1682         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1683             z += roundBitsMask;
1684         }
1685     }
1686     z &= ~ roundBitsMask;
1687     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1688     return make_float32(z);
1689
1690 }
1691
1692 /*----------------------------------------------------------------------------
1693 | Returns the result of adding the absolute values of the single-precision
1694 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1695 | before being returned.  `zSign' is ignored if the result is a NaN.
1696 | The addition is performed according to the IEC/IEEE Standard for Binary
1697 | Floating-Point Arithmetic.
1698 *----------------------------------------------------------------------------*/
1699
1700 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1701 {
1702     int16 aExp, bExp, zExp;
1703     uint32_t aSig, bSig, zSig;
1704     int16 expDiff;
1705
1706     aSig = extractFloat32Frac( a );
1707     aExp = extractFloat32Exp( a );
1708     bSig = extractFloat32Frac( b );
1709     bExp = extractFloat32Exp( b );
1710     expDiff = aExp - bExp;
1711     aSig <<= 6;
1712     bSig <<= 6;
1713     if ( 0 < expDiff ) {
1714         if ( aExp == 0xFF ) {
1715             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1716             return a;
1717         }
1718         if ( bExp == 0 ) {
1719             --expDiff;
1720         }
1721         else {
1722             bSig |= 0x20000000;
1723         }
1724         shift32RightJamming( bSig, expDiff, &bSig );
1725         zExp = aExp;
1726     }
1727     else if ( expDiff < 0 ) {
1728         if ( bExp == 0xFF ) {
1729             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1730             return packFloat32( zSign, 0xFF, 0 );
1731         }
1732         if ( aExp == 0 ) {
1733             ++expDiff;
1734         }
1735         else {
1736             aSig |= 0x20000000;
1737         }
1738         shift32RightJamming( aSig, - expDiff, &aSig );
1739         zExp = bExp;
1740     }
1741     else {
1742         if ( aExp == 0xFF ) {
1743             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1744             return a;
1745         }
1746         if ( aExp == 0 ) {
1747             if (STATUS(flush_to_zero)) {
1748                 if (aSig | bSig) {
1749                     float_raise(float_flag_output_denormal STATUS_VAR);
1750                 }
1751                 return packFloat32(zSign, 0, 0);
1752             }
1753             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1754         }
1755         zSig = 0x40000000 + aSig + bSig;
1756         zExp = aExp;
1757         goto roundAndPack;
1758     }
1759     aSig |= 0x20000000;
1760     zSig = ( aSig + bSig )<<1;
1761     --zExp;
1762     if ( (int32_t) zSig < 0 ) {
1763         zSig = aSig + bSig;
1764         ++zExp;
1765     }
1766  roundAndPack:
1767     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1768
1769 }
1770
1771 /*----------------------------------------------------------------------------
1772 | Returns the result of subtracting the absolute values of the single-
1773 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1774 | difference is negated before being returned.  `zSign' is ignored if the
1775 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1776 | Standard for Binary Floating-Point Arithmetic.
1777 *----------------------------------------------------------------------------*/
1778
1779 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1780 {
1781     int16 aExp, bExp, zExp;
1782     uint32_t aSig, bSig, zSig;
1783     int16 expDiff;
1784
1785     aSig = extractFloat32Frac( a );
1786     aExp = extractFloat32Exp( a );
1787     bSig = extractFloat32Frac( b );
1788     bExp = extractFloat32Exp( b );
1789     expDiff = aExp - bExp;
1790     aSig <<= 7;
1791     bSig <<= 7;
1792     if ( 0 < expDiff ) goto aExpBigger;
1793     if ( expDiff < 0 ) goto bExpBigger;
1794     if ( aExp == 0xFF ) {
1795         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1796         float_raise( float_flag_invalid STATUS_VAR);
1797         return float32_default_nan;
1798     }
1799     if ( aExp == 0 ) {
1800         aExp = 1;
1801         bExp = 1;
1802     }
1803     if ( bSig < aSig ) goto aBigger;
1804     if ( aSig < bSig ) goto bBigger;
1805     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1806  bExpBigger:
1807     if ( bExp == 0xFF ) {
1808         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1809         return packFloat32( zSign ^ 1, 0xFF, 0 );
1810     }
1811     if ( aExp == 0 ) {
1812         ++expDiff;
1813     }
1814     else {
1815         aSig |= 0x40000000;
1816     }
1817     shift32RightJamming( aSig, - expDiff, &aSig );
1818     bSig |= 0x40000000;
1819  bBigger:
1820     zSig = bSig - aSig;
1821     zExp = bExp;
1822     zSign ^= 1;
1823     goto normalizeRoundAndPack;
1824  aExpBigger:
1825     if ( aExp == 0xFF ) {
1826         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827         return a;
1828     }
1829     if ( bExp == 0 ) {
1830         --expDiff;
1831     }
1832     else {
1833         bSig |= 0x40000000;
1834     }
1835     shift32RightJamming( bSig, expDiff, &bSig );
1836     aSig |= 0x40000000;
1837  aBigger:
1838     zSig = aSig - bSig;
1839     zExp = aExp;
1840  normalizeRoundAndPack:
1841     --zExp;
1842     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1843
1844 }
1845
1846 /*----------------------------------------------------------------------------
1847 | Returns the result of adding the single-precision floating-point values `a'
1848 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1849 | Binary Floating-Point Arithmetic.
1850 *----------------------------------------------------------------------------*/
1851
1852 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1853 {
1854     flag aSign, bSign;
1855     a = float32_squash_input_denormal(a STATUS_VAR);
1856     b = float32_squash_input_denormal(b STATUS_VAR);
1857
1858     aSign = extractFloat32Sign( a );
1859     bSign = extractFloat32Sign( b );
1860     if ( aSign == bSign ) {
1861         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1862     }
1863     else {
1864         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1865     }
1866
1867 }
1868
1869 /*----------------------------------------------------------------------------
1870 | Returns the result of subtracting the single-precision floating-point values
1871 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1872 | for Binary Floating-Point Arithmetic.
1873 *----------------------------------------------------------------------------*/
1874
1875 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1876 {
1877     flag aSign, bSign;
1878     a = float32_squash_input_denormal(a STATUS_VAR);
1879     b = float32_squash_input_denormal(b STATUS_VAR);
1880
1881     aSign = extractFloat32Sign( a );
1882     bSign = extractFloat32Sign( b );
1883     if ( aSign == bSign ) {
1884         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1885     }
1886     else {
1887         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1888     }
1889
1890 }
1891
1892 /*----------------------------------------------------------------------------
1893 | Returns the result of multiplying the single-precision floating-point values
1894 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1895 | for Binary Floating-Point Arithmetic.
1896 *----------------------------------------------------------------------------*/
1897
1898 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1899 {
1900     flag aSign, bSign, zSign;
1901     int16 aExp, bExp, zExp;
1902     uint32_t aSig, bSig;
1903     uint64_t zSig64;
1904     uint32_t zSig;
1905
1906     a = float32_squash_input_denormal(a STATUS_VAR);
1907     b = float32_squash_input_denormal(b STATUS_VAR);
1908
1909     aSig = extractFloat32Frac( a );
1910     aExp = extractFloat32Exp( a );
1911     aSign = extractFloat32Sign( a );
1912     bSig = extractFloat32Frac( b );
1913     bExp = extractFloat32Exp( b );
1914     bSign = extractFloat32Sign( b );
1915     zSign = aSign ^ bSign;
1916     if ( aExp == 0xFF ) {
1917         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1918             return propagateFloat32NaN( a, b STATUS_VAR );
1919         }
1920         if ( ( bExp | bSig ) == 0 ) {
1921             float_raise( float_flag_invalid STATUS_VAR);
1922             return float32_default_nan;
1923         }
1924         return packFloat32( zSign, 0xFF, 0 );
1925     }
1926     if ( bExp == 0xFF ) {
1927         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1928         if ( ( aExp | aSig ) == 0 ) {
1929             float_raise( float_flag_invalid STATUS_VAR);
1930             return float32_default_nan;
1931         }
1932         return packFloat32( zSign, 0xFF, 0 );
1933     }
1934     if ( aExp == 0 ) {
1935         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1936         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1937     }
1938     if ( bExp == 0 ) {
1939         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1940         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1941     }
1942     zExp = aExp + bExp - 0x7F;
1943     aSig = ( aSig | 0x00800000 )<<7;
1944     bSig = ( bSig | 0x00800000 )<<8;
1945     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1946     zSig = zSig64;
1947     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1948         zSig <<= 1;
1949         --zExp;
1950     }
1951     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1952
1953 }
1954
1955 /*----------------------------------------------------------------------------
1956 | Returns the result of dividing the single-precision floating-point value `a'
1957 | by the corresponding value `b'.  The operation is performed according to the
1958 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1959 *----------------------------------------------------------------------------*/
1960
1961 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1962 {
1963     flag aSign, bSign, zSign;
1964     int16 aExp, bExp, zExp;
1965     uint32_t aSig, bSig, zSig;
1966     a = float32_squash_input_denormal(a STATUS_VAR);
1967     b = float32_squash_input_denormal(b STATUS_VAR);
1968
1969     aSig = extractFloat32Frac( a );
1970     aExp = extractFloat32Exp( a );
1971     aSign = extractFloat32Sign( a );
1972     bSig = extractFloat32Frac( b );
1973     bExp = extractFloat32Exp( b );
1974     bSign = extractFloat32Sign( b );
1975     zSign = aSign ^ bSign;
1976     if ( aExp == 0xFF ) {
1977         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1978         if ( bExp == 0xFF ) {
1979             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1980             float_raise( float_flag_invalid STATUS_VAR);
1981             return float32_default_nan;
1982         }
1983         return packFloat32( zSign, 0xFF, 0 );
1984     }
1985     if ( bExp == 0xFF ) {
1986         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1987         return packFloat32( zSign, 0, 0 );
1988     }
1989     if ( bExp == 0 ) {
1990         if ( bSig == 0 ) {
1991             if ( ( aExp | aSig ) == 0 ) {
1992                 float_raise( float_flag_invalid STATUS_VAR);
1993                 return float32_default_nan;
1994             }
1995             float_raise( float_flag_divbyzero STATUS_VAR);
1996             return packFloat32( zSign, 0xFF, 0 );
1997         }
1998         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1999     }
2000     if ( aExp == 0 ) {
2001         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2002         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2003     }
2004     zExp = aExp - bExp + 0x7D;
2005     aSig = ( aSig | 0x00800000 )<<7;
2006     bSig = ( bSig | 0x00800000 )<<8;
2007     if ( bSig <= ( aSig + aSig ) ) {
2008         aSig >>= 1;
2009         ++zExp;
2010     }
2011     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2012     if ( ( zSig & 0x3F ) == 0 ) {
2013         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2014     }
2015     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2016
2017 }
2018
2019 /*----------------------------------------------------------------------------
2020 | Returns the remainder of the single-precision floating-point value `a'
2021 | with respect to the corresponding value `b'.  The operation is performed
2022 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2023 *----------------------------------------------------------------------------*/
2024
2025 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2026 {
2027     flag aSign, zSign;
2028     int16 aExp, bExp, expDiff;
2029     uint32_t aSig, bSig;
2030     uint32_t q;
2031     uint64_t aSig64, bSig64, q64;
2032     uint32_t alternateASig;
2033     int32_t sigMean;
2034     a = float32_squash_input_denormal(a STATUS_VAR);
2035     b = float32_squash_input_denormal(b STATUS_VAR);
2036
2037     aSig = extractFloat32Frac( a );
2038     aExp = extractFloat32Exp( a );
2039     aSign = extractFloat32Sign( a );
2040     bSig = extractFloat32Frac( b );
2041     bExp = extractFloat32Exp( b );
2042     if ( aExp == 0xFF ) {
2043         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2044             return propagateFloat32NaN( a, b STATUS_VAR );
2045         }
2046         float_raise( float_flag_invalid STATUS_VAR);
2047         return float32_default_nan;
2048     }
2049     if ( bExp == 0xFF ) {
2050         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2051         return a;
2052     }
2053     if ( bExp == 0 ) {
2054         if ( bSig == 0 ) {
2055             float_raise( float_flag_invalid STATUS_VAR);
2056             return float32_default_nan;
2057         }
2058         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2059     }
2060     if ( aExp == 0 ) {
2061         if ( aSig == 0 ) return a;
2062         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2063     }
2064     expDiff = aExp - bExp;
2065     aSig |= 0x00800000;
2066     bSig |= 0x00800000;
2067     if ( expDiff < 32 ) {
2068         aSig <<= 8;
2069         bSig <<= 8;
2070         if ( expDiff < 0 ) {
2071             if ( expDiff < -1 ) return a;
2072             aSig >>= 1;
2073         }
2074         q = ( bSig <= aSig );
2075         if ( q ) aSig -= bSig;
2076         if ( 0 < expDiff ) {
2077             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2078             q >>= 32 - expDiff;
2079             bSig >>= 2;
2080             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2081         }
2082         else {
2083             aSig >>= 2;
2084             bSig >>= 2;
2085         }
2086     }
2087     else {
2088         if ( bSig <= aSig ) aSig -= bSig;
2089         aSig64 = ( (uint64_t) aSig )<<40;
2090         bSig64 = ( (uint64_t) bSig )<<40;
2091         expDiff -= 64;
2092         while ( 0 < expDiff ) {
2093             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2094             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2095             aSig64 = - ( ( bSig * q64 )<<38 );
2096             expDiff -= 62;
2097         }
2098         expDiff += 64;
2099         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2100         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2101         q = q64>>( 64 - expDiff );
2102         bSig <<= 6;
2103         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2104     }
2105     do {
2106         alternateASig = aSig;
2107         ++q;
2108         aSig -= bSig;
2109     } while ( 0 <= (int32_t) aSig );
2110     sigMean = aSig + alternateASig;
2111     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2112         aSig = alternateASig;
2113     }
2114     zSign = ( (int32_t) aSig < 0 );
2115     if ( zSign ) aSig = - aSig;
2116     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2117
2118 }
2119
2120 /*----------------------------------------------------------------------------
2121 | Returns the square root of the single-precision floating-point value `a'.
2122 | The operation is performed according to the IEC/IEEE Standard for Binary
2123 | Floating-Point Arithmetic.
2124 *----------------------------------------------------------------------------*/
2125
2126 float32 float32_sqrt( float32 a STATUS_PARAM )
2127 {
2128     flag aSign;
2129     int16 aExp, zExp;
2130     uint32_t aSig, zSig;
2131     uint64_t rem, term;
2132     a = float32_squash_input_denormal(a STATUS_VAR);
2133
2134     aSig = extractFloat32Frac( a );
2135     aExp = extractFloat32Exp( a );
2136     aSign = extractFloat32Sign( a );
2137     if ( aExp == 0xFF ) {
2138         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2139         if ( ! aSign ) return a;
2140         float_raise( float_flag_invalid STATUS_VAR);
2141         return float32_default_nan;
2142     }
2143     if ( aSign ) {
2144         if ( ( aExp | aSig ) == 0 ) return a;
2145         float_raise( float_flag_invalid STATUS_VAR);
2146         return float32_default_nan;
2147     }
2148     if ( aExp == 0 ) {
2149         if ( aSig == 0 ) return float32_zero;
2150         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2151     }
2152     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2153     aSig = ( aSig | 0x00800000 )<<8;
2154     zSig = estimateSqrt32( aExp, aSig ) + 2;
2155     if ( ( zSig & 0x7F ) <= 5 ) {
2156         if ( zSig < 2 ) {
2157             zSig = 0x7FFFFFFF;
2158             goto roundAndPack;
2159         }
2160         aSig >>= aExp & 1;
2161         term = ( (uint64_t) zSig ) * zSig;
2162         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2163         while ( (int64_t) rem < 0 ) {
2164             --zSig;
2165             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2166         }
2167         zSig |= ( rem != 0 );
2168     }
2169     shift32RightJamming( zSig, 1, &zSig );
2170  roundAndPack:
2171     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2172
2173 }
2174
2175 /*----------------------------------------------------------------------------
2176 | Returns the binary exponential of the single-precision floating-point value
2177 | `a'. The operation is performed according to the IEC/IEEE Standard for
2178 | Binary Floating-Point Arithmetic.
2179 |
2180 | Uses the following identities:
2181 |
2182 | 1. -------------------------------------------------------------------------
2183 |      x    x*ln(2)
2184 |     2  = e
2185 |
2186 | 2. -------------------------------------------------------------------------
2187 |                      2     3     4     5           n
2188 |      x        x     x     x     x     x           x
2189 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2190 |               1!    2!    3!    4!    5!          n!
2191 *----------------------------------------------------------------------------*/
2192
2193 static const float64 float32_exp2_coefficients[15] =
2194 {
2195     const_float64( 0x3ff0000000000000ll ), /*  1 */
2196     const_float64( 0x3fe0000000000000ll ), /*  2 */
2197     const_float64( 0x3fc5555555555555ll ), /*  3 */
2198     const_float64( 0x3fa5555555555555ll ), /*  4 */
2199     const_float64( 0x3f81111111111111ll ), /*  5 */
2200     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2201     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2202     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2203     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2204     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2205     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2206     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2207     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2208     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2209     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2210 };
2211
2212 float32 float32_exp2( float32 a STATUS_PARAM )
2213 {
2214     flag aSign;
2215     int16 aExp;
2216     uint32_t aSig;
2217     float64 r, x, xn;
2218     int i;
2219     a = float32_squash_input_denormal(a STATUS_VAR);
2220
2221     aSig = extractFloat32Frac( a );
2222     aExp = extractFloat32Exp( a );
2223     aSign = extractFloat32Sign( a );
2224
2225     if ( aExp == 0xFF) {
2226         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2227         return (aSign) ? float32_zero : a;
2228     }
2229     if (aExp == 0) {
2230         if (aSig == 0) return float32_one;
2231     }
2232
2233     float_raise( float_flag_inexact STATUS_VAR);
2234
2235     /* ******************************* */
2236     /* using float64 for approximation */
2237     /* ******************************* */
2238     x = float32_to_float64(a STATUS_VAR);
2239     x = float64_mul(x, float64_ln2 STATUS_VAR);
2240
2241     xn = x;
2242     r = float64_one;
2243     for (i = 0 ; i < 15 ; i++) {
2244         float64 f;
2245
2246         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2247         r = float64_add(r, f STATUS_VAR);
2248
2249         xn = float64_mul(xn, x STATUS_VAR);
2250     }
2251
2252     return float64_to_float32(r, status);
2253 }
2254
2255 /*----------------------------------------------------------------------------
2256 | Returns the binary log of the single-precision floating-point value `a'.
2257 | The operation is performed according to the IEC/IEEE Standard for Binary
2258 | Floating-Point Arithmetic.
2259 *----------------------------------------------------------------------------*/
2260 float32 float32_log2( float32 a STATUS_PARAM )
2261 {
2262     flag aSign, zSign;
2263     int16 aExp;
2264     uint32_t aSig, zSig, i;
2265
2266     a = float32_squash_input_denormal(a STATUS_VAR);
2267     aSig = extractFloat32Frac( a );
2268     aExp = extractFloat32Exp( a );
2269     aSign = extractFloat32Sign( a );
2270
2271     if ( aExp == 0 ) {
2272         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2273         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2274     }
2275     if ( aSign ) {
2276         float_raise( float_flag_invalid STATUS_VAR);
2277         return float32_default_nan;
2278     }
2279     if ( aExp == 0xFF ) {
2280         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2281         return a;
2282     }
2283
2284     aExp -= 0x7F;
2285     aSig |= 0x00800000;
2286     zSign = aExp < 0;
2287     zSig = aExp << 23;
2288
2289     for (i = 1 << 22; i > 0; i >>= 1) {
2290         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2291         if ( aSig & 0x01000000 ) {
2292             aSig >>= 1;
2293             zSig |= i;
2294         }
2295     }
2296
2297     if ( zSign )
2298         zSig = -zSig;
2299
2300     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2301 }
2302
2303 /*----------------------------------------------------------------------------
2304 | Returns 1 if the single-precision floating-point value `a' is equal to
2305 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2306 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2308 *----------------------------------------------------------------------------*/
2309
2310 int float32_eq( float32 a, float32 b STATUS_PARAM )
2311 {
2312     uint32_t av, bv;
2313     a = float32_squash_input_denormal(a STATUS_VAR);
2314     b = float32_squash_input_denormal(b STATUS_VAR);
2315
2316     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2317          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2318        ) {
2319         float_raise( float_flag_invalid STATUS_VAR);
2320         return 0;
2321     }
2322     av = float32_val(a);
2323     bv = float32_val(b);
2324     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2325 }
2326
2327 /*----------------------------------------------------------------------------
2328 | Returns 1 if the single-precision floating-point value `a' is less than
2329 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2330 | exception is raised if either operand is a NaN.  The comparison is performed
2331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2332 *----------------------------------------------------------------------------*/
2333
2334 int float32_le( float32 a, float32 b STATUS_PARAM )
2335 {
2336     flag aSign, bSign;
2337     uint32_t av, bv;
2338     a = float32_squash_input_denormal(a STATUS_VAR);
2339     b = float32_squash_input_denormal(b STATUS_VAR);
2340
2341     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2342          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2343        ) {
2344         float_raise( float_flag_invalid STATUS_VAR);
2345         return 0;
2346     }
2347     aSign = extractFloat32Sign( a );
2348     bSign = extractFloat32Sign( b );
2349     av = float32_val(a);
2350     bv = float32_val(b);
2351     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2352     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2353
2354 }
2355
2356 /*----------------------------------------------------------------------------
2357 | Returns 1 if the single-precision floating-point value `a' is less than
2358 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2359 | raised if either operand is a NaN.  The comparison is performed according
2360 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2361 *----------------------------------------------------------------------------*/
2362
2363 int float32_lt( float32 a, float32 b STATUS_PARAM )
2364 {
2365     flag aSign, bSign;
2366     uint32_t av, bv;
2367     a = float32_squash_input_denormal(a STATUS_VAR);
2368     b = float32_squash_input_denormal(b STATUS_VAR);
2369
2370     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2371          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2372        ) {
2373         float_raise( float_flag_invalid STATUS_VAR);
2374         return 0;
2375     }
2376     aSign = extractFloat32Sign( a );
2377     bSign = extractFloat32Sign( b );
2378     av = float32_val(a);
2379     bv = float32_val(b);
2380     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2381     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2382
2383 }
2384
2385 /*----------------------------------------------------------------------------
2386 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2387 | be compared, and 0 otherwise.  The invalid exception is raised if either
2388 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2389 | Standard for Binary Floating-Point Arithmetic.
2390 *----------------------------------------------------------------------------*/
2391
2392 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2393 {
2394     a = float32_squash_input_denormal(a STATUS_VAR);
2395     b = float32_squash_input_denormal(b STATUS_VAR);
2396
2397     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2398          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2399        ) {
2400         float_raise( float_flag_invalid STATUS_VAR);
2401         return 1;
2402     }
2403     return 0;
2404 }
2405
2406 /*----------------------------------------------------------------------------
2407 | Returns 1 if the single-precision floating-point value `a' is equal to
2408 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2409 | exception.  The comparison is performed according to the IEC/IEEE Standard
2410 | for Binary Floating-Point Arithmetic.
2411 *----------------------------------------------------------------------------*/
2412
2413 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2414 {
2415     a = float32_squash_input_denormal(a STATUS_VAR);
2416     b = float32_squash_input_denormal(b STATUS_VAR);
2417
2418     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2419          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2420        ) {
2421         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2422             float_raise( float_flag_invalid STATUS_VAR);
2423         }
2424         return 0;
2425     }
2426     return ( float32_val(a) == float32_val(b) ) ||
2427             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2428 }
2429
2430 /*----------------------------------------------------------------------------
2431 | Returns 1 if the single-precision floating-point value `a' is less than or
2432 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2433 | cause an exception.  Otherwise, the comparison is performed according to the
2434 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2435 *----------------------------------------------------------------------------*/
2436
2437 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2438 {
2439     flag aSign, bSign;
2440     uint32_t av, bv;
2441     a = float32_squash_input_denormal(a STATUS_VAR);
2442     b = float32_squash_input_denormal(b STATUS_VAR);
2443
2444     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2445          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2446        ) {
2447         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2448             float_raise( float_flag_invalid STATUS_VAR);
2449         }
2450         return 0;
2451     }
2452     aSign = extractFloat32Sign( a );
2453     bSign = extractFloat32Sign( b );
2454     av = float32_val(a);
2455     bv = float32_val(b);
2456     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2457     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2458
2459 }
2460
2461 /*----------------------------------------------------------------------------
2462 | Returns 1 if the single-precision floating-point value `a' is less than
2463 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2464 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2465 | Standard for Binary Floating-Point Arithmetic.
2466 *----------------------------------------------------------------------------*/
2467
2468 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2469 {
2470     flag aSign, bSign;
2471     uint32_t av, bv;
2472     a = float32_squash_input_denormal(a STATUS_VAR);
2473     b = float32_squash_input_denormal(b STATUS_VAR);
2474
2475     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2476          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2477        ) {
2478         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2479             float_raise( float_flag_invalid STATUS_VAR);
2480         }
2481         return 0;
2482     }
2483     aSign = extractFloat32Sign( a );
2484     bSign = extractFloat32Sign( b );
2485     av = float32_val(a);
2486     bv = float32_val(b);
2487     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2488     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2489
2490 }
2491
2492 /*----------------------------------------------------------------------------
2493 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2494 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2495 | comparison is performed according to the IEC/IEEE Standard for Binary
2496 | Floating-Point Arithmetic.
2497 *----------------------------------------------------------------------------*/
2498
2499 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2500 {
2501     a = float32_squash_input_denormal(a STATUS_VAR);
2502     b = float32_squash_input_denormal(b STATUS_VAR);
2503
2504     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2505          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2506        ) {
2507         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2508             float_raise( float_flag_invalid STATUS_VAR);
2509         }
2510         return 1;
2511     }
2512     return 0;
2513 }
2514
2515 /*----------------------------------------------------------------------------
2516 | Returns the result of converting the double-precision floating-point value
2517 | `a' to the 32-bit two's complement integer format.  The conversion is
2518 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2519 | Arithmetic---which means in particular that the conversion is rounded
2520 | according to the current rounding mode.  If `a' is a NaN, the largest
2521 | positive integer is returned.  Otherwise, if the conversion overflows, the
2522 | largest integer with the same sign as `a' is returned.
2523 *----------------------------------------------------------------------------*/
2524
2525 int32 float64_to_int32( float64 a STATUS_PARAM )
2526 {
2527     flag aSign;
2528     int16 aExp, shiftCount;
2529     uint64_t aSig;
2530     a = float64_squash_input_denormal(a STATUS_VAR);
2531
2532     aSig = extractFloat64Frac( a );
2533     aExp = extractFloat64Exp( a );
2534     aSign = extractFloat64Sign( a );
2535     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2536     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2537     shiftCount = 0x42C - aExp;
2538     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2539     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2540
2541 }
2542
2543 /*----------------------------------------------------------------------------
2544 | Returns the result of converting the double-precision floating-point value
2545 | `a' to the 32-bit two's complement integer format.  The conversion is
2546 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2547 | Arithmetic, except that the conversion is always rounded toward zero.
2548 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2549 | the conversion overflows, the largest integer with the same sign as `a' is
2550 | returned.
2551 *----------------------------------------------------------------------------*/
2552
2553 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2554 {
2555     flag aSign;
2556     int16 aExp, shiftCount;
2557     uint64_t aSig, savedASig;
2558     int32 z;
2559     a = float64_squash_input_denormal(a STATUS_VAR);
2560
2561     aSig = extractFloat64Frac( a );
2562     aExp = extractFloat64Exp( a );
2563     aSign = extractFloat64Sign( a );
2564     if ( 0x41E < aExp ) {
2565         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2566         goto invalid;
2567     }
2568     else if ( aExp < 0x3FF ) {
2569         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2570         return 0;
2571     }
2572     aSig |= LIT64( 0x0010000000000000 );
2573     shiftCount = 0x433 - aExp;
2574     savedASig = aSig;
2575     aSig >>= shiftCount;
2576     z = aSig;
2577     if ( aSign ) z = - z;
2578     if ( ( z < 0 ) ^ aSign ) {
2579  invalid:
2580         float_raise( float_flag_invalid STATUS_VAR);
2581         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2582     }
2583     if ( ( aSig<<shiftCount ) != savedASig ) {
2584         STATUS(float_exception_flags) |= float_flag_inexact;
2585     }
2586     return z;
2587
2588 }
2589
2590 /*----------------------------------------------------------------------------
2591 | Returns the result of converting the double-precision floating-point value
2592 | `a' to the 16-bit two's complement integer format.  The conversion is
2593 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2594 | Arithmetic, except that the conversion is always rounded toward zero.
2595 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2596 | the conversion overflows, the largest integer with the same sign as `a' is
2597 | returned.
2598 *----------------------------------------------------------------------------*/
2599
2600 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2601 {
2602     flag aSign;
2603     int16 aExp, shiftCount;
2604     uint64_t aSig, savedASig;
2605     int32 z;
2606
2607     aSig = extractFloat64Frac( a );
2608     aExp = extractFloat64Exp( a );
2609     aSign = extractFloat64Sign( a );
2610     if ( 0x40E < aExp ) {
2611         if ( ( aExp == 0x7FF ) && aSig ) {
2612             aSign = 0;
2613         }
2614         goto invalid;
2615     }
2616     else if ( aExp < 0x3FF ) {
2617         if ( aExp || aSig ) {
2618             STATUS(float_exception_flags) |= float_flag_inexact;
2619         }
2620         return 0;
2621     }
2622     aSig |= LIT64( 0x0010000000000000 );
2623     shiftCount = 0x433 - aExp;
2624     savedASig = aSig;
2625     aSig >>= shiftCount;
2626     z = aSig;
2627     if ( aSign ) {
2628         z = - z;
2629     }
2630     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2631  invalid:
2632         float_raise( float_flag_invalid STATUS_VAR);
2633         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2634     }
2635     if ( ( aSig<<shiftCount ) != savedASig ) {
2636         STATUS(float_exception_flags) |= float_flag_inexact;
2637     }
2638     return z;
2639 }
2640
2641 /*----------------------------------------------------------------------------
2642 | Returns the result of converting the double-precision floating-point value
2643 | `a' to the 64-bit two's complement integer format.  The conversion is
2644 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2645 | Arithmetic---which means in particular that the conversion is rounded
2646 | according to the current rounding mode.  If `a' is a NaN, the largest
2647 | positive integer is returned.  Otherwise, if the conversion overflows, the
2648 | largest integer with the same sign as `a' is returned.
2649 *----------------------------------------------------------------------------*/
2650
2651 int64 float64_to_int64( float64 a STATUS_PARAM )
2652 {
2653     flag aSign;
2654     int16 aExp, shiftCount;
2655     uint64_t aSig, aSigExtra;
2656     a = float64_squash_input_denormal(a STATUS_VAR);
2657
2658     aSig = extractFloat64Frac( a );
2659     aExp = extractFloat64Exp( a );
2660     aSign = extractFloat64Sign( a );
2661     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2662     shiftCount = 0x433 - aExp;
2663     if ( shiftCount <= 0 ) {
2664         if ( 0x43E < aExp ) {
2665             float_raise( float_flag_invalid STATUS_VAR);
2666             if (    ! aSign
2667                  || (    ( aExp == 0x7FF )
2668                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2669                ) {
2670                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2671             }
2672             return (int64_t) LIT64( 0x8000000000000000 );
2673         }
2674         aSigExtra = 0;
2675         aSig <<= - shiftCount;
2676     }
2677     else {
2678         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2679     }
2680     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2681
2682 }
2683
2684 /*----------------------------------------------------------------------------
2685 | Returns the result of converting the double-precision floating-point value
2686 | `a' to the 64-bit two's complement integer format.  The conversion is
2687 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2688 | Arithmetic, except that the conversion is always rounded toward zero.
2689 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2690 | the conversion overflows, the largest integer with the same sign as `a' is
2691 | returned.
2692 *----------------------------------------------------------------------------*/
2693
2694 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2695 {
2696     flag aSign;
2697     int16 aExp, shiftCount;
2698     uint64_t aSig;
2699     int64 z;
2700     a = float64_squash_input_denormal(a STATUS_VAR);
2701
2702     aSig = extractFloat64Frac( a );
2703     aExp = extractFloat64Exp( a );
2704     aSign = extractFloat64Sign( a );
2705     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2706     shiftCount = aExp - 0x433;
2707     if ( 0 <= shiftCount ) {
2708         if ( 0x43E <= aExp ) {
2709             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2710                 float_raise( float_flag_invalid STATUS_VAR);
2711                 if (    ! aSign
2712                      || (    ( aExp == 0x7FF )
2713                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2714                    ) {
2715                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2716                 }
2717             }
2718             return (int64_t) LIT64( 0x8000000000000000 );
2719         }
2720         z = aSig<<shiftCount;
2721     }
2722     else {
2723         if ( aExp < 0x3FE ) {
2724             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2725             return 0;
2726         }
2727         z = aSig>>( - shiftCount );
2728         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2729             STATUS(float_exception_flags) |= float_flag_inexact;
2730         }
2731     }
2732     if ( aSign ) z = - z;
2733     return z;
2734
2735 }
2736
2737 /*----------------------------------------------------------------------------
2738 | Returns the result of converting the double-precision floating-point value
2739 | `a' to the single-precision floating-point format.  The conversion is
2740 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2741 | Arithmetic.
2742 *----------------------------------------------------------------------------*/
2743
2744 float32 float64_to_float32( float64 a STATUS_PARAM )
2745 {
2746     flag aSign;
2747     int16 aExp;
2748     uint64_t aSig;
2749     uint32_t zSig;
2750     a = float64_squash_input_denormal(a STATUS_VAR);
2751
2752     aSig = extractFloat64Frac( a );
2753     aExp = extractFloat64Exp( a );
2754     aSign = extractFloat64Sign( a );
2755     if ( aExp == 0x7FF ) {
2756         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2757         return packFloat32( aSign, 0xFF, 0 );
2758     }
2759     shift64RightJamming( aSig, 22, &aSig );
2760     zSig = aSig;
2761     if ( aExp || zSig ) {
2762         zSig |= 0x40000000;
2763         aExp -= 0x381;
2764     }
2765     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2766
2767 }
2768
2769
2770 /*----------------------------------------------------------------------------
2771 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2772 | half-precision floating-point value, returning the result.  After being
2773 | shifted into the proper positions, the three fields are simply added
2774 | together to form the result.  This means that any integer portion of `zSig'
2775 | will be added into the exponent.  Since a properly normalized significand
2776 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2777 | than the desired result exponent whenever `zSig' is a complete, normalized
2778 | significand.
2779 *----------------------------------------------------------------------------*/
2780 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2781 {
2782     return make_float16(
2783         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2784 }
2785
2786 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2787    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2788
2789 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2790 {
2791     flag aSign;
2792     int16 aExp;
2793     uint32_t aSig;
2794
2795     aSign = extractFloat16Sign(a);
2796     aExp = extractFloat16Exp(a);
2797     aSig = extractFloat16Frac(a);
2798
2799     if (aExp == 0x1f && ieee) {
2800         if (aSig) {
2801             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2802         }
2803         return packFloat32(aSign, 0xff, aSig << 13);
2804     }
2805     if (aExp == 0) {
2806         int8 shiftCount;
2807
2808         if (aSig == 0) {
2809             return packFloat32(aSign, 0, 0);
2810         }
2811
2812         shiftCount = countLeadingZeros32( aSig ) - 21;
2813         aSig = aSig << shiftCount;
2814         aExp = -shiftCount;
2815     }
2816     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2817 }
2818
2819 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2820 {
2821     flag aSign;
2822     int16 aExp;
2823     uint32_t aSig;
2824     uint32_t mask;
2825     uint32_t increment;
2826     int8 roundingMode;
2827     a = float32_squash_input_denormal(a STATUS_VAR);
2828
2829     aSig = extractFloat32Frac( a );
2830     aExp = extractFloat32Exp( a );
2831     aSign = extractFloat32Sign( a );
2832     if ( aExp == 0xFF ) {
2833         if (aSig) {
2834             /* Input is a NaN */
2835             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2836             if (!ieee) {
2837                 return packFloat16(aSign, 0, 0);
2838             }
2839             return r;
2840         }
2841         /* Infinity */
2842         if (!ieee) {
2843             float_raise(float_flag_invalid STATUS_VAR);
2844             return packFloat16(aSign, 0x1f, 0x3ff);
2845         }
2846         return packFloat16(aSign, 0x1f, 0);
2847     }
2848     if (aExp == 0 && aSig == 0) {
2849         return packFloat16(aSign, 0, 0);
2850     }
2851     /* Decimal point between bits 22 and 23.  */
2852     aSig |= 0x00800000;
2853     aExp -= 0x7f;
2854     if (aExp < -14) {
2855         mask = 0x00ffffff;
2856         if (aExp >= -24) {
2857             mask >>= 25 + aExp;
2858         }
2859     } else {
2860         mask = 0x00001fff;
2861     }
2862     if (aSig & mask) {
2863         float_raise( float_flag_underflow STATUS_VAR );
2864         roundingMode = STATUS(float_rounding_mode);
2865         switch (roundingMode) {
2866         case float_round_nearest_even:
2867             increment = (mask + 1) >> 1;
2868             if ((aSig & mask) == increment) {
2869                 increment = aSig & (increment << 1);
2870             }
2871             break;
2872         case float_round_up:
2873             increment = aSign ? 0 : mask;
2874             break;
2875         case float_round_down:
2876             increment = aSign ? mask : 0;
2877             break;
2878         default: /* round_to_zero */
2879             increment = 0;
2880             break;
2881         }
2882         aSig += increment;
2883         if (aSig >= 0x01000000) {
2884             aSig >>= 1;
2885             aExp++;
2886         }
2887     } else if (aExp < -14
2888           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2889         float_raise( float_flag_underflow STATUS_VAR);
2890     }
2891
2892     if (ieee) {
2893         if (aExp > 15) {
2894             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2895             return packFloat16(aSign, 0x1f, 0);
2896         }
2897     } else {
2898         if (aExp > 16) {
2899             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2900             return packFloat16(aSign, 0x1f, 0x3ff);
2901         }
2902     }
2903     if (aExp < -24) {
2904         return packFloat16(aSign, 0, 0);
2905     }
2906     if (aExp < -14) {
2907         aSig >>= -14 - aExp;
2908         aExp = -14;
2909     }
2910     return packFloat16(aSign, aExp + 14, aSig >> 13);
2911 }
2912
2913 /*----------------------------------------------------------------------------
2914 | Returns the result of converting the double-precision floating-point value
2915 | `a' to the extended double-precision floating-point format.  The conversion
2916 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2917 | Arithmetic.
2918 *----------------------------------------------------------------------------*/
2919
2920 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2921 {
2922     flag aSign;
2923     int16 aExp;
2924     uint64_t aSig;
2925
2926     a = float64_squash_input_denormal(a STATUS_VAR);
2927     aSig = extractFloat64Frac( a );
2928     aExp = extractFloat64Exp( a );
2929     aSign = extractFloat64Sign( a );
2930     if ( aExp == 0x7FF ) {
2931         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2932         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2933     }
2934     if ( aExp == 0 ) {
2935         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2936         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2937     }
2938     return
2939         packFloatx80(
2940             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2941
2942 }
2943
2944 /*----------------------------------------------------------------------------
2945 | Returns the result of converting the double-precision floating-point value
2946 | `a' to the quadruple-precision floating-point format.  The conversion is
2947 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2948 | Arithmetic.
2949 *----------------------------------------------------------------------------*/
2950
2951 float128 float64_to_float128( float64 a STATUS_PARAM )
2952 {
2953     flag aSign;
2954     int16 aExp;
2955     uint64_t aSig, zSig0, zSig1;
2956
2957     a = float64_squash_input_denormal(a STATUS_VAR);
2958     aSig = extractFloat64Frac( a );
2959     aExp = extractFloat64Exp( a );
2960     aSign = extractFloat64Sign( a );
2961     if ( aExp == 0x7FF ) {
2962         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2963         return packFloat128( aSign, 0x7FFF, 0, 0 );
2964     }
2965     if ( aExp == 0 ) {
2966         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2967         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2968         --aExp;
2969     }
2970     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2971     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2972
2973 }
2974
2975 /*----------------------------------------------------------------------------
2976 | Rounds the double-precision floating-point value `a' to an integer, and
2977 | returns the result as a double-precision floating-point value.  The
2978 | operation is performed according to the IEC/IEEE Standard for Binary
2979 | Floating-Point Arithmetic.
2980 *----------------------------------------------------------------------------*/
2981
2982 float64 float64_round_to_int( float64 a STATUS_PARAM )
2983 {
2984     flag aSign;
2985     int16 aExp;
2986     uint64_t lastBitMask, roundBitsMask;
2987     int8 roundingMode;
2988     uint64_t z;
2989     a = float64_squash_input_denormal(a STATUS_VAR);
2990
2991     aExp = extractFloat64Exp( a );
2992     if ( 0x433 <= aExp ) {
2993         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2994             return propagateFloat64NaN( a, a STATUS_VAR );
2995         }
2996         return a;
2997     }
2998     if ( aExp < 0x3FF ) {
2999         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3000         STATUS(float_exception_flags) |= float_flag_inexact;
3001         aSign = extractFloat64Sign( a );
3002         switch ( STATUS(float_rounding_mode) ) {
3003          case float_round_nearest_even:
3004             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3005                 return packFloat64( aSign, 0x3FF, 0 );
3006             }
3007             break;
3008          case float_round_down:
3009             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3010          case float_round_up:
3011             return make_float64(
3012             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3013         }
3014         return packFloat64( aSign, 0, 0 );
3015     }
3016     lastBitMask = 1;
3017     lastBitMask <<= 0x433 - aExp;
3018     roundBitsMask = lastBitMask - 1;
3019     z = float64_val(a);
3020     roundingMode = STATUS(float_rounding_mode);
3021     if ( roundingMode == float_round_nearest_even ) {
3022         z += lastBitMask>>1;
3023         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3024     }
3025     else if ( roundingMode != float_round_to_zero ) {
3026         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3027             z += roundBitsMask;
3028         }
3029     }
3030     z &= ~ roundBitsMask;
3031     if ( z != float64_val(a) )
3032         STATUS(float_exception_flags) |= float_flag_inexact;
3033     return make_float64(z);
3034
3035 }
3036
3037 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3038 {
3039     int oldmode;
3040     float64 res;
3041     oldmode = STATUS(float_rounding_mode);
3042     STATUS(float_rounding_mode) = float_round_to_zero;
3043     res = float64_round_to_int(a STATUS_VAR);
3044     STATUS(float_rounding_mode) = oldmode;
3045     return res;
3046 }
3047
3048 /*----------------------------------------------------------------------------
3049 | Returns the result of adding the absolute values of the double-precision
3050 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3051 | before being returned.  `zSign' is ignored if the result is a NaN.
3052 | The addition is performed according to the IEC/IEEE Standard for Binary
3053 | Floating-Point Arithmetic.
3054 *----------------------------------------------------------------------------*/
3055
3056 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3057 {
3058     int16 aExp, bExp, zExp;
3059     uint64_t aSig, bSig, zSig;
3060     int16 expDiff;
3061
3062     aSig = extractFloat64Frac( a );
3063     aExp = extractFloat64Exp( a );
3064     bSig = extractFloat64Frac( b );
3065     bExp = extractFloat64Exp( b );
3066     expDiff = aExp - bExp;
3067     aSig <<= 9;
3068     bSig <<= 9;
3069     if ( 0 < expDiff ) {
3070         if ( aExp == 0x7FF ) {
3071             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3072             return a;
3073         }
3074         if ( bExp == 0 ) {
3075             --expDiff;
3076         }
3077         else {
3078             bSig |= LIT64( 0x2000000000000000 );
3079         }
3080         shift64RightJamming( bSig, expDiff, &bSig );
3081         zExp = aExp;
3082     }
3083     else if ( expDiff < 0 ) {
3084         if ( bExp == 0x7FF ) {
3085             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3086             return packFloat64( zSign, 0x7FF, 0 );
3087         }
3088         if ( aExp == 0 ) {
3089             ++expDiff;
3090         }
3091         else {
3092             aSig |= LIT64( 0x2000000000000000 );
3093         }
3094         shift64RightJamming( aSig, - expDiff, &aSig );
3095         zExp = bExp;
3096     }
3097     else {
3098         if ( aExp == 0x7FF ) {
3099             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3100             return a;
3101         }
3102         if ( aExp == 0 ) {
3103             if (STATUS(flush_to_zero)) {
3104                 if (aSig | bSig) {
3105                     float_raise(float_flag_output_denormal STATUS_VAR);
3106                 }
3107                 return packFloat64(zSign, 0, 0);
3108             }
3109             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3110         }
3111         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3112         zExp = aExp;
3113         goto roundAndPack;
3114     }
3115     aSig |= LIT64( 0x2000000000000000 );
3116     zSig = ( aSig + bSig )<<1;
3117     --zExp;
3118     if ( (int64_t) zSig < 0 ) {
3119         zSig = aSig + bSig;
3120         ++zExp;
3121     }
3122  roundAndPack:
3123     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3124
3125 }
3126
3127 /*----------------------------------------------------------------------------
3128 | Returns the result of subtracting the absolute values of the double-
3129 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3130 | difference is negated before being returned.  `zSign' is ignored if the
3131 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3132 | Standard for Binary Floating-Point Arithmetic.
3133 *----------------------------------------------------------------------------*/
3134
3135 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3136 {
3137     int16 aExp, bExp, zExp;
3138     uint64_t aSig, bSig, zSig;
3139     int16 expDiff;
3140
3141     aSig = extractFloat64Frac( a );
3142     aExp = extractFloat64Exp( a );
3143     bSig = extractFloat64Frac( b );
3144     bExp = extractFloat64Exp( b );
3145     expDiff = aExp - bExp;
3146     aSig <<= 10;
3147     bSig <<= 10;
3148     if ( 0 < expDiff ) goto aExpBigger;
3149     if ( expDiff < 0 ) goto bExpBigger;
3150     if ( aExp == 0x7FF ) {
3151         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3152         float_raise( float_flag_invalid STATUS_VAR);
3153         return float64_default_nan;
3154     }
3155     if ( aExp == 0 ) {
3156         aExp = 1;
3157         bExp = 1;
3158     }
3159     if ( bSig < aSig ) goto aBigger;
3160     if ( aSig < bSig ) goto bBigger;
3161     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3162  bExpBigger:
3163     if ( bExp == 0x7FF ) {
3164         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3165         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3166     }
3167     if ( aExp == 0 ) {
3168         ++expDiff;
3169     }
3170     else {
3171         aSig |= LIT64( 0x4000000000000000 );
3172     }
3173     shift64RightJamming( aSig, - expDiff, &aSig );
3174     bSig |= LIT64( 0x4000000000000000 );
3175  bBigger:
3176     zSig = bSig - aSig;
3177     zExp = bExp;
3178     zSign ^= 1;
3179     goto normalizeRoundAndPack;
3180  aExpBigger:
3181     if ( aExp == 0x7FF ) {
3182         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3183         return a;
3184     }
3185     if ( bExp == 0 ) {
3186         --expDiff;
3187     }
3188     else {
3189         bSig |= LIT64( 0x4000000000000000 );
3190     }
3191     shift64RightJamming( bSig, expDiff, &bSig );
3192     aSig |= LIT64( 0x4000000000000000 );
3193  aBigger:
3194     zSig = aSig - bSig;
3195     zExp = aExp;
3196  normalizeRoundAndPack:
3197     --zExp;
3198     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3199
3200 }
3201
3202 /*----------------------------------------------------------------------------
3203 | Returns the result of adding the double-precision floating-point values `a'
3204 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3205 | Binary Floating-Point Arithmetic.
3206 *----------------------------------------------------------------------------*/
3207
3208 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3209 {
3210     flag aSign, bSign;
3211     a = float64_squash_input_denormal(a STATUS_VAR);
3212     b = float64_squash_input_denormal(b STATUS_VAR);
3213
3214     aSign = extractFloat64Sign( a );
3215     bSign = extractFloat64Sign( b );
3216     if ( aSign == bSign ) {
3217         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3218     }
3219     else {
3220         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3221     }
3222
3223 }
3224
3225 /*----------------------------------------------------------------------------
3226 | Returns the result of subtracting the double-precision floating-point values
3227 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3228 | for Binary Floating-Point Arithmetic.
3229 *----------------------------------------------------------------------------*/
3230
3231 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3232 {
3233     flag aSign, bSign;
3234     a = float64_squash_input_denormal(a STATUS_VAR);
3235     b = float64_squash_input_denormal(b STATUS_VAR);
3236
3237     aSign = extractFloat64Sign( a );
3238     bSign = extractFloat64Sign( b );
3239     if ( aSign == bSign ) {
3240         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3241     }
3242     else {
3243         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3244     }
3245
3246 }
3247
3248 /*----------------------------------------------------------------------------
3249 | Returns the result of multiplying the double-precision floating-point values
3250 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3251 | for Binary Floating-Point Arithmetic.
3252 *----------------------------------------------------------------------------*/
3253
3254 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3255 {
3256     flag aSign, bSign, zSign;
3257     int16 aExp, bExp, zExp;
3258     uint64_t aSig, bSig, zSig0, zSig1;
3259
3260     a = float64_squash_input_denormal(a STATUS_VAR);
3261     b = float64_squash_input_denormal(b STATUS_VAR);
3262
3263     aSig = extractFloat64Frac( a );
3264     aExp = extractFloat64Exp( a );
3265     aSign = extractFloat64Sign( a );
3266     bSig = extractFloat64Frac( b );
3267     bExp = extractFloat64Exp( b );
3268     bSign = extractFloat64Sign( b );
3269     zSign = aSign ^ bSign;
3270     if ( aExp == 0x7FF ) {
3271         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3272             return propagateFloat64NaN( a, b STATUS_VAR );
3273         }
3274         if ( ( bExp | bSig ) == 0 ) {
3275             float_raise( float_flag_invalid STATUS_VAR);
3276             return float64_default_nan;
3277         }
3278         return packFloat64( zSign, 0x7FF, 0 );
3279     }
3280     if ( bExp == 0x7FF ) {
3281         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3282         if ( ( aExp | aSig ) == 0 ) {
3283             float_raise( float_flag_invalid STATUS_VAR);
3284             return float64_default_nan;
3285         }
3286         return packFloat64( zSign, 0x7FF, 0 );
3287     }
3288     if ( aExp == 0 ) {
3289         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3290         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3291     }
3292     if ( bExp == 0 ) {
3293         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3294         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3295     }
3296     zExp = aExp + bExp - 0x3FF;
3297     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3298     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3299     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3300     zSig0 |= ( zSig1 != 0 );
3301     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3302         zSig0 <<= 1;
3303         --zExp;
3304     }
3305     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3306
3307 }
3308
3309 /*----------------------------------------------------------------------------
3310 | Returns the result of dividing the double-precision floating-point value `a'
3311 | by the corresponding value `b'.  The operation is performed according to
3312 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3313 *----------------------------------------------------------------------------*/
3314
3315 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3316 {
3317     flag aSign, bSign, zSign;
3318     int16 aExp, bExp, zExp;
3319     uint64_t aSig, bSig, zSig;
3320     uint64_t rem0, rem1;
3321     uint64_t term0, term1;
3322     a = float64_squash_input_denormal(a STATUS_VAR);
3323     b = float64_squash_input_denormal(b STATUS_VAR);
3324
3325     aSig = extractFloat64Frac( a );
3326     aExp = extractFloat64Exp( a );
3327     aSign = extractFloat64Sign( a );
3328     bSig = extractFloat64Frac( b );
3329     bExp = extractFloat64Exp( b );
3330     bSign = extractFloat64Sign( b );
3331     zSign = aSign ^ bSign;
3332     if ( aExp == 0x7FF ) {
3333         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3334         if ( bExp == 0x7FF ) {
3335             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3336             float_raise( float_flag_invalid STATUS_VAR);
3337             return float64_default_nan;
3338         }
3339         return packFloat64( zSign, 0x7FF, 0 );
3340     }
3341     if ( bExp == 0x7FF ) {
3342         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3343         return packFloat64( zSign, 0, 0 );
3344     }
3345     if ( bExp == 0 ) {
3346         if ( bSig == 0 ) {
3347             if ( ( aExp | aSig ) == 0 ) {
3348                 float_raise( float_flag_invalid STATUS_VAR);
3349                 return float64_default_nan;
3350             }
3351             float_raise( float_flag_divbyzero STATUS_VAR);
3352             return packFloat64( zSign, 0x7FF, 0 );
3353         }
3354         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3355     }
3356     if ( aExp == 0 ) {
3357         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3358         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3359     }
3360     zExp = aExp - bExp + 0x3FD;
3361     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3362     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3363     if ( bSig <= ( aSig + aSig ) ) {
3364         aSig >>= 1;
3365         ++zExp;
3366     }
3367     zSig = estimateDiv128To64( aSig, 0, bSig );
3368     if ( ( zSig & 0x1FF ) <= 2 ) {
3369         mul64To128( bSig, zSig, &term0, &term1 );
3370         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3371         while ( (int64_t) rem0 < 0 ) {
3372             --zSig;
3373             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3374         }
3375         zSig |= ( rem1 != 0 );
3376     }
3377     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3378
3379 }
3380
3381 /*----------------------------------------------------------------------------
3382 | Returns the remainder of the double-precision floating-point value `a'
3383 | with respect to the corresponding value `b'.  The operation is performed
3384 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3385 *----------------------------------------------------------------------------*/
3386
3387 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3388 {
3389     flag aSign, zSign;
3390     int16 aExp, bExp, expDiff;
3391     uint64_t aSig, bSig;
3392     uint64_t q, alternateASig;
3393     int64_t sigMean;
3394
3395     a = float64_squash_input_denormal(a STATUS_VAR);
3396     b = float64_squash_input_denormal(b STATUS_VAR);
3397     aSig = extractFloat64Frac( a );
3398     aExp = extractFloat64Exp( a );
3399     aSign = extractFloat64Sign( a );
3400     bSig = extractFloat64Frac( b );
3401     bExp = extractFloat64Exp( b );
3402     if ( aExp == 0x7FF ) {
3403         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3404             return propagateFloat64NaN( a, b STATUS_VAR );
3405         }
3406         float_raise( float_flag_invalid STATUS_VAR);
3407         return float64_default_nan;
3408     }
3409     if ( bExp == 0x7FF ) {
3410         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3411         return a;
3412     }
3413     if ( bExp == 0 ) {
3414         if ( bSig == 0 ) {
3415             float_raise( float_flag_invalid STATUS_VAR);
3416             return float64_default_nan;
3417         }
3418         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3419     }
3420     if ( aExp == 0 ) {
3421         if ( aSig == 0 ) return a;
3422         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3423     }
3424     expDiff = aExp - bExp;
3425     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3426     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3427     if ( expDiff < 0 ) {
3428         if ( expDiff < -1 ) return a;
3429         aSig >>= 1;
3430     }
3431     q = ( bSig <= aSig );
3432     if ( q ) aSig -= bSig;
3433     expDiff -= 64;
3434     while ( 0 < expDiff ) {
3435         q = estimateDiv128To64( aSig, 0, bSig );
3436         q = ( 2 < q ) ? q - 2 : 0;
3437         aSig = - ( ( bSig>>2 ) * q );
3438         expDiff -= 62;
3439     }
3440     expDiff += 64;
3441     if ( 0 < expDiff ) {
3442         q = estimateDiv128To64( aSig, 0, bSig );
3443         q = ( 2 < q ) ? q - 2 : 0;
3444         q >>= 64 - expDiff;
3445         bSig >>= 2;
3446         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3447     }
3448     else {
3449         aSig >>= 2;
3450         bSig >>= 2;
3451     }
3452     do {
3453         alternateASig = aSig;
3454         ++q;
3455         aSig -= bSig;
3456     } while ( 0 <= (int64_t) aSig );
3457     sigMean = aSig + alternateASig;
3458     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3459         aSig = alternateASig;
3460     }
3461     zSign = ( (int64_t) aSig < 0 );
3462     if ( zSign ) aSig = - aSig;
3463     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3464
3465 }
3466
3467 /*----------------------------------------------------------------------------
3468 | Returns the square root of the double-precision floating-point value `a'.
3469 | The operation is performed according to the IEC/IEEE Standard for Binary
3470 | Floating-Point Arithmetic.
3471 *----------------------------------------------------------------------------*/
3472
3473 float64 float64_sqrt( float64 a STATUS_PARAM )
3474 {
3475     flag aSign;
3476     int16 aExp, zExp;
3477     uint64_t aSig, zSig, doubleZSig;
3478     uint64_t rem0, rem1, term0, term1;
3479     a = float64_squash_input_denormal(a STATUS_VAR);
3480
3481     aSig = extractFloat64Frac( a );
3482     aExp = extractFloat64Exp( a );
3483     aSign = extractFloat64Sign( a );
3484     if ( aExp == 0x7FF ) {
3485         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3486         if ( ! aSign ) return a;
3487         float_raise( float_flag_invalid STATUS_VAR);
3488         return float64_default_nan;
3489     }
3490     if ( aSign ) {
3491         if ( ( aExp | aSig ) == 0 ) return a;
3492         float_raise( float_flag_invalid STATUS_VAR);
3493         return float64_default_nan;
3494     }
3495     if ( aExp == 0 ) {
3496         if ( aSig == 0 ) return float64_zero;
3497         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3498     }
3499     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3500     aSig |= LIT64( 0x0010000000000000 );
3501     zSig = estimateSqrt32( aExp, aSig>>21 );
3502     aSig <<= 9 - ( aExp & 1 );
3503     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3504     if ( ( zSig & 0x1FF ) <= 5 ) {
3505         doubleZSig = zSig<<1;
3506         mul64To128( zSig, zSig, &term0, &term1 );
3507         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3508         while ( (int64_t) rem0 < 0 ) {
3509             --zSig;
3510             doubleZSig -= 2;
3511             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3512         }
3513         zSig |= ( ( rem0 | rem1 ) != 0 );
3514     }
3515     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3516
3517 }
3518
3519 /*----------------------------------------------------------------------------
3520 | Returns the binary log of the double-precision floating-point value `a'.
3521 | The operation is performed according to the IEC/IEEE Standard for Binary
3522 | Floating-Point Arithmetic.
3523 *----------------------------------------------------------------------------*/
3524 float64 float64_log2( float64 a STATUS_PARAM )
3525 {
3526     flag aSign, zSign;
3527     int16 aExp;
3528     uint64_t aSig, aSig0, aSig1, zSig, i;
3529     a = float64_squash_input_denormal(a STATUS_VAR);
3530
3531     aSig = extractFloat64Frac( a );
3532     aExp = extractFloat64Exp( a );
3533     aSign = extractFloat64Sign( a );
3534
3535     if ( aExp == 0 ) {
3536         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3537         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3538     }
3539     if ( aSign ) {
3540         float_raise( float_flag_invalid STATUS_VAR);
3541         return float64_default_nan;
3542     }
3543     if ( aExp == 0x7FF ) {
3544         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3545         return a;
3546     }
3547
3548     aExp -= 0x3FF;
3549     aSig |= LIT64( 0x0010000000000000 );
3550     zSign = aExp < 0;
3551     zSig = (uint64_t)aExp << 52;
3552     for (i = 1LL << 51; i > 0; i >>= 1) {
3553         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3554         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3555         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3556             aSig >>= 1;
3557             zSig |= i;
3558         }
3559     }
3560
3561     if ( zSign )
3562         zSig = -zSig;
3563     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3564 }
3565
3566 /*----------------------------------------------------------------------------
3567 | Returns 1 if the double-precision floating-point value `a' is equal to the
3568 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3569 | if either operand is a NaN.  Otherwise, the comparison is performed
3570 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3571 *----------------------------------------------------------------------------*/
3572
3573 int float64_eq( float64 a, float64 b STATUS_PARAM )
3574 {
3575     uint64_t av, bv;
3576     a = float64_squash_input_denormal(a STATUS_VAR);
3577     b = float64_squash_input_denormal(b STATUS_VAR);
3578
3579     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3580          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3581        ) {
3582         float_raise( float_flag_invalid STATUS_VAR);
3583         return 0;
3584     }
3585     av = float64_val(a);
3586     bv = float64_val(b);
3587     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3588
3589 }
3590
3591 /*----------------------------------------------------------------------------
3592 | Returns 1 if the double-precision floating-point value `a' is less than or
3593 | equal to the corresponding value `b', and 0 otherwise.  The invalid
3594 | exception is raised if either operand is a NaN.  The comparison is performed
3595 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3596 *----------------------------------------------------------------------------*/
3597
3598 int float64_le( float64 a, float64 b STATUS_PARAM )
3599 {
3600     flag aSign, bSign;
3601     uint64_t av, bv;
3602     a = float64_squash_input_denormal(a STATUS_VAR);
3603     b = float64_squash_input_denormal(b STATUS_VAR);
3604
3605     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3606          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3607        ) {
3608         float_raise( float_flag_invalid STATUS_VAR);
3609         return 0;
3610     }
3611     aSign = extractFloat64Sign( a );
3612     bSign = extractFloat64Sign( b );
3613     av = float64_val(a);
3614     bv = float64_val(b);
3615     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3616     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3617
3618 }
3619
3620 /*----------------------------------------------------------------------------
3621 | Returns 1 if the double-precision floating-point value `a' is less than
3622 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3623 | raised if either operand is a NaN.  The comparison is performed according
3624 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3625 *----------------------------------------------------------------------------*/
3626
3627 int float64_lt( float64 a, float64 b STATUS_PARAM )
3628 {
3629     flag aSign, bSign;
3630     uint64_t av, bv;
3631
3632     a = float64_squash_input_denormal(a STATUS_VAR);
3633     b = float64_squash_input_denormal(b STATUS_VAR);
3634     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3635          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3636        ) {
3637         float_raise( float_flag_invalid STATUS_VAR);
3638         return 0;
3639     }
3640     aSign = extractFloat64Sign( a );
3641     bSign = extractFloat64Sign( b );
3642     av = float64_val(a);
3643     bv = float64_val(b);
3644     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3645     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3646
3647 }
3648
3649 /*----------------------------------------------------------------------------
3650 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3651 | be compared, and 0 otherwise.  The invalid exception is raised if either
3652 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3653 | Standard for Binary Floating-Point Arithmetic.
3654 *----------------------------------------------------------------------------*/
3655
3656 int float64_unordered( float64 a, float64 b STATUS_PARAM )
3657 {
3658     a = float64_squash_input_denormal(a STATUS_VAR);
3659     b = float64_squash_input_denormal(b STATUS_VAR);
3660
3661     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3662          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3663        ) {
3664         float_raise( float_flag_invalid STATUS_VAR);
3665         return 1;
3666     }
3667     return 0;
3668 }
3669
3670 /*----------------------------------------------------------------------------
3671 | Returns 1 if the double-precision floating-point value `a' is equal to the
3672 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3673 | exception.The comparison is performed according to the IEC/IEEE Standard
3674 | for Binary Floating-Point Arithmetic.
3675 *----------------------------------------------------------------------------*/
3676
3677 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
3678 {
3679     uint64_t av, bv;
3680     a = float64_squash_input_denormal(a STATUS_VAR);
3681     b = float64_squash_input_denormal(b STATUS_VAR);
3682
3683     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3684          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3685        ) {
3686         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3687             float_raise( float_flag_invalid STATUS_VAR);
3688         }
3689         return 0;
3690     }
3691     av = float64_val(a);
3692     bv = float64_val(b);
3693     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3694
3695 }
3696
3697 /*----------------------------------------------------------------------------
3698 | Returns 1 if the double-precision floating-point value `a' is less than or
3699 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3700 | cause an exception.  Otherwise, the comparison is performed according to the
3701 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3702 *----------------------------------------------------------------------------*/
3703
3704 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3705 {
3706     flag aSign, bSign;
3707     uint64_t av, bv;
3708     a = float64_squash_input_denormal(a STATUS_VAR);
3709     b = float64_squash_input_denormal(b STATUS_VAR);
3710
3711     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3712          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3713        ) {
3714         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3715             float_raise( float_flag_invalid STATUS_VAR);
3716         }
3717         return 0;
3718     }
3719     aSign = extractFloat64Sign( a );
3720     bSign = extractFloat64Sign( b );
3721     av = float64_val(a);
3722     bv = float64_val(b);
3723     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3724     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3725
3726 }
3727
3728 /*----------------------------------------------------------------------------
3729 | Returns 1 if the double-precision floating-point value `a' is less than
3730 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3731 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3732 | Standard for Binary Floating-Point Arithmetic.
3733 *----------------------------------------------------------------------------*/
3734
3735 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3736 {
3737     flag aSign, bSign;
3738     uint64_t av, bv;
3739     a = float64_squash_input_denormal(a STATUS_VAR);
3740     b = float64_squash_input_denormal(b STATUS_VAR);
3741
3742     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3743          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3744        ) {
3745         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3746             float_raise( float_flag_invalid STATUS_VAR);
3747         }
3748         return 0;
3749     }
3750     aSign = extractFloat64Sign( a );
3751     bSign = extractFloat64Sign( b );
3752     av = float64_val(a);
3753     bv = float64_val(b);
3754     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3755     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3756
3757 }
3758
3759 /*----------------------------------------------------------------------------
3760 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3761 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3762 | comparison is performed according to the IEC/IEEE Standard for Binary
3763 | Floating-Point Arithmetic.
3764 *----------------------------------------------------------------------------*/
3765
3766 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3767 {
3768     a = float64_squash_input_denormal(a STATUS_VAR);
3769     b = float64_squash_input_denormal(b STATUS_VAR);
3770
3771     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3772          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3773        ) {
3774         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3775             float_raise( float_flag_invalid STATUS_VAR);
3776         }
3777         return 1;
3778     }
3779     return 0;
3780 }
3781
3782 /*----------------------------------------------------------------------------
3783 | Returns the result of converting the extended double-precision floating-
3784 | point value `a' to the 32-bit two's complement integer format.  The
3785 | conversion is performed according to the IEC/IEEE Standard for Binary
3786 | Floating-Point Arithmetic---which means in particular that the conversion
3787 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3788 | largest positive integer is returned.  Otherwise, if the conversion
3789 | overflows, the largest integer with the same sign as `a' is returned.
3790 *----------------------------------------------------------------------------*/
3791
3792 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3793 {
3794     flag aSign;
3795     int32 aExp, shiftCount;
3796     uint64_t aSig;
3797
3798     aSig = extractFloatx80Frac( a );
3799     aExp = extractFloatx80Exp( a );
3800     aSign = extractFloatx80Sign( a );
3801     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3802     shiftCount = 0x4037 - aExp;
3803     if ( shiftCount <= 0 ) shiftCount = 1;
3804     shift64RightJamming( aSig, shiftCount, &aSig );
3805     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3806
3807 }
3808
3809 /*----------------------------------------------------------------------------
3810 | Returns the result of converting the extended double-precision floating-
3811 | point value `a' to the 32-bit two's complement integer format.  The
3812 | conversion is performed according to the IEC/IEEE Standard for Binary
3813 | Floating-Point Arithmetic, except that the conversion is always rounded
3814 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3815 | Otherwise, if the conversion overflows, the largest integer with the same
3816 | sign as `a' is returned.
3817 *----------------------------------------------------------------------------*/
3818
3819 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3820 {
3821     flag aSign;
3822     int32 aExp, shiftCount;
3823     uint64_t aSig, savedASig;
3824     int32 z;
3825
3826     aSig = extractFloatx80Frac( a );
3827     aExp = extractFloatx80Exp( a );
3828     aSign = extractFloatx80Sign( a );
3829     if ( 0x401E < aExp ) {
3830         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3831         goto invalid;
3832     }
3833     else if ( aExp < 0x3FFF ) {
3834         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3835         return 0;
3836     }
3837     shiftCount = 0x403E - aExp;
3838     savedASig = aSig;
3839     aSig >>= shiftCount;
3840     z = aSig;
3841     if ( aSign ) z = - z;
3842     if ( ( z < 0 ) ^ aSign ) {
3843  invalid:
3844         float_raise( float_flag_invalid STATUS_VAR);
3845         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3846     }
3847     if ( ( aSig<<shiftCount ) != savedASig ) {
3848         STATUS(float_exception_flags) |= float_flag_inexact;
3849     }
3850     return z;
3851
3852 }
3853
3854 /*----------------------------------------------------------------------------
3855 | Returns the result of converting the extended double-precision floating-
3856 | point value `a' to the 64-bit two's complement integer format.  The
3857 | conversion is performed according to the IEC/IEEE Standard for Binary
3858 | Floating-Point Arithmetic---which means in particular that the conversion
3859 | is rounded according to the current rounding mode.  If `a' is a NaN,
3860 | the largest positive integer is returned.  Otherwise, if the conversion
3861 | overflows, the largest integer with the same sign as `a' is returned.
3862 *----------------------------------------------------------------------------*/
3863
3864 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3865 {
3866     flag aSign;
3867     int32 aExp, shiftCount;
3868     uint64_t aSig, aSigExtra;
3869
3870     aSig = extractFloatx80Frac( a );
3871     aExp = extractFloatx80Exp( a );
3872     aSign = extractFloatx80Sign( a );
3873     shiftCount = 0x403E - aExp;
3874     if ( shiftCount <= 0 ) {
3875         if ( shiftCount ) {
3876             float_raise( float_flag_invalid STATUS_VAR);
3877             if (    ! aSign
3878                  || (    ( aExp == 0x7FFF )
3879                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3880                ) {
3881                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3882             }
3883             return (int64_t) LIT64( 0x8000000000000000 );
3884         }
3885         aSigExtra = 0;
3886     }
3887     else {
3888         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3889     }
3890     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3891
3892 }
3893
3894 /*----------------------------------------------------------------------------
3895 | Returns the result of converting the extended double-precision floating-
3896 | point value `a' to the 64-bit two's complement integer format.  The
3897 | conversion is performed according to the IEC/IEEE Standard for Binary
3898 | Floating-Point Arithmetic, except that the conversion is always rounded
3899 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3900 | Otherwise, if the conversion overflows, the largest integer with the same
3901 | sign as `a' is returned.
3902 *----------------------------------------------------------------------------*/
3903
3904 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3905 {
3906     flag aSign;
3907     int32 aExp, shiftCount;
3908     uint64_t aSig;
3909     int64 z;
3910
3911     aSig = extractFloatx80Frac( a );
3912     aExp = extractFloatx80Exp( a );
3913     aSign = extractFloatx80Sign( a );
3914     shiftCount = aExp - 0x403E;
3915     if ( 0 <= shiftCount ) {
3916         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3917         if ( ( a.high != 0xC03E ) || aSig ) {
3918             float_raise( float_flag_invalid STATUS_VAR);
3919             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3920                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3921             }
3922         }
3923         return (int64_t) LIT64( 0x8000000000000000 );
3924     }
3925     else if ( aExp < 0x3FFF ) {
3926         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3927         return 0;
3928     }
3929     z = aSig>>( - shiftCount );
3930     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3931         STATUS(float_exception_flags) |= float_flag_inexact;
3932     }
3933     if ( aSign ) z = - z;
3934     return z;
3935
3936 }
3937
3938 /*----------------------------------------------------------------------------
3939 | Returns the result of converting the extended double-precision floating-
3940 | point value `a' to the single-precision floating-point format.  The
3941 | conversion is performed according to the IEC/IEEE Standard for Binary
3942 | Floating-Point Arithmetic.
3943 *----------------------------------------------------------------------------*/
3944
3945 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3946 {
3947     flag aSign;
3948     int32 aExp;
3949     uint64_t aSig;
3950
3951     aSig = extractFloatx80Frac( a );
3952     aExp = extractFloatx80Exp( a );
3953     aSign = extractFloatx80Sign( a );
3954     if ( aExp == 0x7FFF ) {
3955         if ( (uint64_t) ( aSig<<1 ) ) {
3956             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3957         }
3958         return packFloat32( aSign, 0xFF, 0 );
3959     }
3960     shift64RightJamming( aSig, 33, &aSig );
3961     if ( aExp || aSig ) aExp -= 0x3F81;
3962     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3963
3964 }
3965
3966 /*----------------------------------------------------------------------------
3967 | Returns the result of converting the extended double-precision floating-
3968 | point value `a' to the double-precision floating-point format.  The
3969 | conversion is performed according to the IEC/IEEE Standard for Binary
3970 | Floating-Point Arithmetic.
3971 *----------------------------------------------------------------------------*/
3972
3973 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3974 {
3975     flag aSign;
3976     int32 aExp;
3977     uint64_t aSig, zSig;
3978
3979     aSig = extractFloatx80Frac( a );
3980     aExp = extractFloatx80Exp( a );
3981     aSign = extractFloatx80Sign( a );
3982     if ( aExp == 0x7FFF ) {
3983         if ( (uint64_t) ( aSig<<1 ) ) {
3984             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3985         }
3986         return packFloat64( aSign, 0x7FF, 0 );
3987     }
3988     shift64RightJamming( aSig, 1, &zSig );
3989     if ( aExp || aSig ) aExp -= 0x3C01;
3990     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3991
3992 }
3993
3994 /*----------------------------------------------------------------------------
3995 | Returns the result of converting the extended double-precision floating-
3996 | point value `a' to the quadruple-precision floating-point format.  The
3997 | conversion is performed according to the IEC/IEEE Standard for Binary
3998 | Floating-Point Arithmetic.
3999 *----------------------------------------------------------------------------*/
4000
4001 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4002 {
4003     flag aSign;
4004     int16 aExp;
4005     uint64_t aSig, zSig0, zSig1;
4006
4007     aSig = extractFloatx80Frac( a );
4008     aExp = extractFloatx80Exp( a );
4009     aSign = extractFloatx80Sign( a );
4010     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4011         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4012     }
4013     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4014     return packFloat128( aSign, aExp, zSig0, zSig1 );
4015
4016 }
4017
4018 /*----------------------------------------------------------------------------
4019 | Rounds the extended double-precision floating-point value `a' to an integer,
4020 | and returns the result as an extended quadruple-precision floating-point
4021 | value.  The operation is performed according to the IEC/IEEE Standard for
4022 | Binary Floating-Point Arithmetic.
4023 *----------------------------------------------------------------------------*/
4024
4025 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4026 {
4027     flag aSign;
4028     int32 aExp;
4029     uint64_t lastBitMask, roundBitsMask;
4030     int8 roundingMode;
4031     floatx80 z;
4032
4033     aExp = extractFloatx80Exp( a );
4034     if ( 0x403E <= aExp ) {
4035         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4036             return propagateFloatx80NaN( a, a STATUS_VAR );
4037         }
4038         return a;
4039     }
4040     if ( aExp < 0x3FFF ) {
4041         if (    ( aExp == 0 )
4042              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4043             return a;
4044         }
4045         STATUS(float_exception_flags) |= float_flag_inexact;
4046         aSign = extractFloatx80Sign( a );
4047         switch ( STATUS(float_rounding_mode) ) {
4048          case float_round_nearest_even:
4049             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4050                ) {
4051                 return
4052                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4053             }
4054             break;
4055          case float_round_down:
4056             return
4057                   aSign ?
4058                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4059                 : packFloatx80( 0, 0, 0 );
4060          case float_round_up:
4061             return
4062                   aSign ? packFloatx80( 1, 0, 0 )
4063                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4064         }
4065         return packFloatx80( aSign, 0, 0 );
4066     }
4067     lastBitMask = 1;
4068     lastBitMask <<= 0x403E - aExp;
4069     roundBitsMask = lastBitMask - 1;
4070     z = a;
4071     roundingMode = STATUS(float_rounding_mode);
4072     if ( roundingMode == float_round_nearest_even ) {
4073         z.low += lastBitMask>>1;
4074         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4075     }
4076     else if ( roundingMode != float_round_to_zero ) {
4077         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4078             z.low += roundBitsMask;
4079         }
4080     }
4081     z.low &= ~ roundBitsMask;
4082     if ( z.low == 0 ) {
4083         ++z.high;
4084         z.low = LIT64( 0x8000000000000000 );
4085     }
4086     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4087     return z;
4088
4089 }
4090
4091 /*----------------------------------------------------------------------------
4092 | Returns the result of adding the absolute values of the extended double-
4093 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4094 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4095 | The addition is performed according to the IEC/IEEE Standard for Binary
4096 | Floating-Point Arithmetic.
4097 *----------------------------------------------------------------------------*/
4098
4099 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4100 {
4101     int32 aExp, bExp, zExp;
4102     uint64_t aSig, bSig, zSig0, zSig1;
4103     int32 expDiff;
4104
4105     aSig = extractFloatx80Frac( a );
4106     aExp = extractFloatx80Exp( a );
4107     bSig = extractFloatx80Frac( b );
4108     bExp = extractFloatx80Exp( b );
4109     expDiff = aExp - bExp;
4110     if ( 0 < expDiff ) {
4111         if ( aExp == 0x7FFF ) {
4112             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4113             return a;
4114         }
4115         if ( bExp == 0 ) --expDiff;
4116         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4117         zExp = aExp;
4118     }
4119     else if ( expDiff < 0 ) {
4120         if ( bExp == 0x7FFF ) {
4121             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4122             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4123         }
4124         if ( aExp == 0 ) ++expDiff;
4125         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4126         zExp = bExp;
4127     }
4128     else {
4129         if ( aExp == 0x7FFF ) {
4130             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4131                 return propagateFloatx80NaN( a, b STATUS_VAR );
4132             }
4133             return a;
4134         }
4135         zSig1 = 0;
4136         zSig0 = aSig + bSig;
4137         if ( aExp == 0 ) {
4138             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4139             goto roundAndPack;
4140         }
4141         zExp = aExp;
4142         goto shiftRight1;
4143     }
4144     zSig0 = aSig + bSig;
4145     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4146  shiftRight1:
4147     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4148     zSig0 |= LIT64( 0x8000000000000000 );
4149     ++zExp;
4150  roundAndPack:
4151     return
4152         roundAndPackFloatx80(
4153             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4154
4155 }
4156
4157 /*----------------------------------------------------------------------------
4158 | Returns the result of subtracting the absolute values of the extended
4159 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4160 | difference is negated before being returned.  `zSign' is ignored if the
4161 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4162 | Standard for Binary Floating-Point Arithmetic.
4163 *----------------------------------------------------------------------------*/
4164
4165 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4166 {
4167     int32 aExp, bExp, zExp;
4168     uint64_t aSig, bSig, zSig0, zSig1;
4169     int32 expDiff;
4170     floatx80 z;
4171
4172     aSig = extractFloatx80Frac( a );
4173     aExp = extractFloatx80Exp( a );
4174     bSig = extractFloatx80Frac( b );
4175     bExp = extractFloatx80Exp( b );
4176     expDiff = aExp - bExp;
4177     if ( 0 < expDiff ) goto aExpBigger;
4178     if ( expDiff < 0 ) goto bExpBigger;
4179     if ( aExp == 0x7FFF ) {
4180         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4181             return propagateFloatx80NaN( a, b STATUS_VAR );
4182         }
4183         float_raise( float_flag_invalid STATUS_VAR);
4184         z.low = floatx80_default_nan_low;
4185         z.high = floatx80_default_nan_high;
4186         return z;
4187     }
4188     if ( aExp == 0 ) {
4189         aExp = 1;
4190         bExp = 1;
4191     }
4192     zSig1 = 0;
4193     if ( bSig < aSig ) goto aBigger;
4194     if ( aSig < bSig ) goto bBigger;
4195     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4196  bExpBigger:
4197     if ( bExp == 0x7FFF ) {
4198         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4199         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4200     }
4201     if ( aExp == 0 ) ++expDiff;
4202     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4203  bBigger:
4204     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4205     zExp = bExp;
4206     zSign ^= 1;
4207     goto normalizeRoundAndPack;
4208  aExpBigger:
4209     if ( aExp == 0x7FFF ) {
4210         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4211         return a;
4212     }
4213     if ( bExp == 0 ) --expDiff;
4214     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4215  aBigger:
4216     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4217     zExp = aExp;
4218  normalizeRoundAndPack:
4219     return
4220         normalizeRoundAndPackFloatx80(
4221             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4222
4223 }
4224
4225 /*----------------------------------------------------------------------------
4226 | Returns the result of adding the extended double-precision floating-point
4227 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4228 | Standard for Binary Floating-Point Arithmetic.
4229 *----------------------------------------------------------------------------*/
4230
4231 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4232 {
4233     flag aSign, bSign;
4234
4235     aSign = extractFloatx80Sign( a );
4236     bSign = extractFloatx80Sign( b );
4237     if ( aSign == bSign ) {
4238         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4239     }
4240     else {
4241         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4242     }
4243
4244 }
4245
4246 /*----------------------------------------------------------------------------
4247 | Returns the result of subtracting the extended double-precision floating-
4248 | point values `a' and `b'.  The operation is performed according to the
4249 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4250 *----------------------------------------------------------------------------*/
4251
4252 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4253 {
4254     flag aSign, bSign;
4255
4256     aSign = extractFloatx80Sign( a );
4257     bSign = extractFloatx80Sign( b );
4258     if ( aSign == bSign ) {
4259         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4260     }
4261     else {
4262         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4263     }
4264
4265 }
4266
4267 /*----------------------------------------------------------------------------
4268 | Returns the result of multiplying the extended double-precision floating-
4269 | point values `a' and `b'.  The operation is performed according to the
4270 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4271 *----------------------------------------------------------------------------*/
4272
4273 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4274 {
4275     flag aSign, bSign, zSign;
4276     int32 aExp, bExp, zExp;
4277     uint64_t aSig, bSig, zSig0, zSig1;
4278     floatx80 z;
4279
4280     aSig = extractFloatx80Frac( a );
4281     aExp = extractFloatx80Exp( a );
4282     aSign = extractFloatx80Sign( a );
4283     bSig = extractFloatx80Frac( b );
4284     bExp = extractFloatx80Exp( b );
4285     bSign = extractFloatx80Sign( b );
4286     zSign = aSign ^ bSign;
4287     if ( aExp == 0x7FFF ) {
4288         if (    (uint64_t) ( aSig<<1 )
4289              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4290             return propagateFloatx80NaN( a, b STATUS_VAR );
4291         }
4292         if ( ( bExp | bSig ) == 0 ) goto invalid;
4293         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4294     }
4295     if ( bExp == 0x7FFF ) {
4296         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4297         if ( ( aExp | aSig ) == 0 ) {
4298  invalid:
4299             float_raise( float_flag_invalid STATUS_VAR);
4300             z.low = floatx80_default_nan_low;
4301             z.high = floatx80_default_nan_high;
4302             return z;
4303         }
4304         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4305     }
4306     if ( aExp == 0 ) {
4307         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4308         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4309     }
4310     if ( bExp == 0 ) {
4311         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4312         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4313     }
4314     zExp = aExp + bExp - 0x3FFE;
4315     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4316     if ( 0 < (int64_t) zSig0 ) {
4317         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4318         --zExp;
4319     }
4320     return
4321         roundAndPackFloatx80(
4322             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4323
4324 }
4325
4326 /*----------------------------------------------------------------------------
4327 | Returns the result of dividing the extended double-precision floating-point
4328 | value `a' by the corresponding value `b'.  The operation is performed
4329 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4330 *----------------------------------------------------------------------------*/
4331
4332 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4333 {
4334     flag aSign, bSign, zSign;
4335     int32 aExp, bExp, zExp;
4336     uint64_t aSig, bSig, zSig0, zSig1;
4337     uint64_t rem0, rem1, rem2, term0, term1, term2;
4338     floatx80 z;
4339
4340     aSig = extractFloatx80Frac( a );
4341     aExp = extractFloatx80Exp( a );
4342     aSign = extractFloatx80Sign( a );
4343     bSig = extractFloatx80Frac( b );
4344     bExp = extractFloatx80Exp( b );
4345     bSign = extractFloatx80Sign( b );
4346     zSign = aSign ^ bSign;
4347     if ( aExp == 0x7FFF ) {
4348         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4349         if ( bExp == 0x7FFF ) {
4350             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4351             goto invalid;
4352         }
4353         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4354     }
4355     if ( bExp == 0x7FFF ) {
4356         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4357         return packFloatx80( zSign, 0, 0 );
4358     }
4359     if ( bExp == 0 ) {
4360         if ( bSig == 0 ) {
4361             if ( ( aExp | aSig ) == 0 ) {
4362  invalid:
4363                 float_raise( float_flag_invalid STATUS_VAR);
4364                 z.low = floatx80_default_nan_low;
4365                 z.high = floatx80_default_nan_high;
4366                 return z;
4367             }
4368             float_raise( float_flag_divbyzero STATUS_VAR);
4369             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4370         }
4371         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4372     }
4373     if ( aExp == 0 ) {
4374         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4375         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4376     }
4377     zExp = aExp - bExp + 0x3FFE;
4378     rem1 = 0;
4379     if ( bSig <= aSig ) {
4380         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4381         ++zExp;
4382     }
4383     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4384     mul64To128( bSig, zSig0, &term0, &term1 );
4385     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4386     while ( (int64_t) rem0 < 0 ) {
4387         --zSig0;
4388         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4389     }
4390     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4391     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4392         mul64To128( bSig, zSig1, &term1, &term2 );
4393         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4394         while ( (int64_t) rem1 < 0 ) {
4395             --zSig1;
4396             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4397         }
4398         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4399     }
4400     return
4401         roundAndPackFloatx80(
4402             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4403
4404 }
4405
4406 /*----------------------------------------------------------------------------
4407 | Returns the remainder of the extended double-precision floating-point value
4408 | `a' with respect to the corresponding value `b'.  The operation is performed
4409 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4410 *----------------------------------------------------------------------------*/
4411
4412 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4413 {
4414     flag aSign, zSign;
4415     int32 aExp, bExp, expDiff;
4416     uint64_t aSig0, aSig1, bSig;
4417     uint64_t q, term0, term1, alternateASig0, alternateASig1;
4418     floatx80 z;
4419
4420     aSig0 = extractFloatx80Frac( a );
4421     aExp = extractFloatx80Exp( a );
4422     aSign = extractFloatx80Sign( a );
4423     bSig = extractFloatx80Frac( b );
4424     bExp = extractFloatx80Exp( b );
4425     if ( aExp == 0x7FFF ) {
4426         if (    (uint64_t) ( aSig0<<1 )
4427              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4428             return propagateFloatx80NaN( a, b STATUS_VAR );
4429         }
4430         goto invalid;
4431     }
4432     if ( bExp == 0x7FFF ) {
4433         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4434         return a;
4435     }
4436     if ( bExp == 0 ) {
4437         if ( bSig == 0 ) {
4438  invalid:
4439             float_raise( float_flag_invalid STATUS_VAR);
4440             z.low = floatx80_default_nan_low;
4441             z.high = floatx80_default_nan_high;
4442             return z;
4443         }
4444         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4445     }
4446     if ( aExp == 0 ) {
4447         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4448         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4449     }
4450     bSig |= LIT64( 0x8000000000000000 );
4451     zSign = aSign;
4452     expDiff = aExp - bExp;
4453     aSig1 = 0;
4454     if ( expDiff < 0 ) {
4455         if ( expDiff < -1 ) return a;
4456         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4457         expDiff = 0;
4458     }
4459     q = ( bSig <= aSig0 );
4460     if ( q ) aSig0 -= bSig;
4461     expDiff -= 64;
4462     while ( 0 < expDiff ) {
4463         q = estimateDiv128To64( aSig0, aSig1, bSig );
4464         q = ( 2 < q ) ? q - 2 : 0;
4465         mul64To128( bSig, q, &term0, &term1 );
4466         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4467         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4468         expDiff -= 62;
4469     }
4470     expDiff += 64;
4471     if ( 0 < expDiff ) {
4472         q = estimateDiv128To64( aSig0, aSig1, bSig );
4473         q = ( 2 < q ) ? q - 2 : 0;
4474         q >>= 64 - expDiff;
4475         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4476         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4477         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4478         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4479             ++q;
4480             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4481         }
4482     }
4483     else {
4484         term1 = 0;
4485         term0 = bSig;
4486     }
4487     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4488     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4489          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4490               && ( q & 1 ) )
4491        ) {
4492         aSig0 = alternateASig0;
4493         aSig1 = alternateASig1;
4494         zSign = ! zSign;
4495     }
4496     return
4497         normalizeRoundAndPackFloatx80(
4498             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4499
4500 }
4501
4502 /*----------------------------------------------------------------------------
4503 | Returns the square root of the extended double-precision floating-point
4504 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4505 | for Binary Floating-Point Arithmetic.
4506 *----------------------------------------------------------------------------*/
4507
4508 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4509 {
4510     flag aSign;
4511     int32 aExp, zExp;
4512     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4513     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4514     floatx80 z;
4515
4516     aSig0 = extractFloatx80Frac( a );
4517     aExp = extractFloatx80Exp( a );
4518     aSign = extractFloatx80Sign( a );
4519     if ( aExp == 0x7FFF ) {
4520         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4521         if ( ! aSign ) return a;
4522         goto invalid;
4523     }
4524     if ( aSign ) {
4525         if ( ( aExp | aSig0 ) == 0 ) return a;
4526  invalid:
4527         float_raise( float_flag_invalid STATUS_VAR);
4528         z.low = floatx80_default_nan_low;
4529         z.high = floatx80_default_nan_high;
4530         return z;
4531     }
4532     if ( aExp == 0 ) {
4533         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4534         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4535     }
4536     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4537     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4538     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4539     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4540     doubleZSig0 = zSig0<<1;
4541     mul64To128( zSig0, zSig0, &term0, &term1 );
4542     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4543     while ( (int64_t) rem0 < 0 ) {
4544         --zSig0;
4545         doubleZSig0 -= 2;
4546         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4547     }
4548     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4549     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4550         if ( zSig1 == 0 ) zSig1 = 1;
4551         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4552         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4553         mul64To128( zSig1, zSig1, &term2, &term3 );
4554         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4555         while ( (int64_t) rem1 < 0 ) {
4556             --zSig1;
4557             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4558             term3 |= 1;
4559             term2 |= doubleZSig0;
4560             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4561         }
4562         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4563     }
4564     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4565     zSig0 |= doubleZSig0;
4566     return
4567         roundAndPackFloatx80(
4568             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4569
4570 }
4571
4572 /*----------------------------------------------------------------------------
4573 | Returns 1 if the extended double-precision floating-point value `a' is equal
4574 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4575 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4576 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4577 *----------------------------------------------------------------------------*/
4578
4579 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4580 {
4581
4582     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4583               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4584          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4585               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4586        ) {
4587         float_raise( float_flag_invalid STATUS_VAR);
4588         return 0;
4589     }
4590     return
4591            ( a.low == b.low )
4592         && (    ( a.high == b.high )
4593              || (    ( a.low == 0 )
4594                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4595            );
4596
4597 }
4598
4599 /*----------------------------------------------------------------------------
4600 | Returns 1 if the extended double-precision floating-point value `a' is
4601 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4602 | invalid exception is raised if either operand is a NaN.  The comparison is
4603 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4604 | Arithmetic.
4605 *----------------------------------------------------------------------------*/
4606
4607 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4608 {
4609     flag aSign, bSign;
4610
4611     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4612               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4613          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4614               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4615        ) {
4616         float_raise( float_flag_invalid STATUS_VAR);
4617         return 0;
4618     }
4619     aSign = extractFloatx80Sign( a );
4620     bSign = extractFloatx80Sign( b );
4621     if ( aSign != bSign ) {
4622         return
4623                aSign
4624             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4625                  == 0 );
4626     }
4627     return
4628           aSign ? le128( b.high, b.low, a.high, a.low )
4629         : le128( a.high, a.low, b.high, b.low );
4630
4631 }
4632
4633 /*----------------------------------------------------------------------------
4634 | Returns 1 if the extended double-precision floating-point value `a' is
4635 | less than the corresponding value `b', and 0 otherwise.  The invalid
4636 | exception is raised if either operand is a NaN.  The comparison is performed
4637 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4638 *----------------------------------------------------------------------------*/
4639
4640 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4641 {
4642     flag aSign, bSign;
4643
4644     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4645               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4646          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4647               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4648        ) {
4649         float_raise( float_flag_invalid STATUS_VAR);
4650         return 0;
4651     }
4652     aSign = extractFloatx80Sign( a );
4653     bSign = extractFloatx80Sign( b );
4654     if ( aSign != bSign ) {
4655         return
4656                aSign
4657             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4658                  != 0 );
4659     }
4660     return
4661           aSign ? lt128( b.high, b.low, a.high, a.low )
4662         : lt128( a.high, a.low, b.high, b.low );
4663
4664 }
4665
4666 /*----------------------------------------------------------------------------
4667 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4668 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
4669 | either operand is a NaN.   The comparison is performed according to the
4670 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4671 *----------------------------------------------------------------------------*/
4672 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4673 {
4674     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4675               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4676          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4677               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4678        ) {
4679         float_raise( float_flag_invalid STATUS_VAR);
4680         return 1;
4681     }
4682     return 0;
4683 }
4684
4685 /*----------------------------------------------------------------------------
4686 | Returns 1 if the extended double-precision floating-point value `a' is
4687 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4688 | cause an exception.  The comparison is performed according to the IEC/IEEE
4689 | Standard for Binary Floating-Point Arithmetic.
4690 *----------------------------------------------------------------------------*/
4691
4692 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4693 {
4694
4695     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4696               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4697          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4698               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4699        ) {
4700         if (    floatx80_is_signaling_nan( a )
4701              || floatx80_is_signaling_nan( b ) ) {
4702             float_raise( float_flag_invalid STATUS_VAR);
4703         }
4704         return 0;
4705     }
4706     return
4707            ( a.low == b.low )
4708         && (    ( a.high == b.high )
4709              || (    ( a.low == 0 )
4710                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4711            );
4712
4713 }
4714
4715 /*----------------------------------------------------------------------------
4716 | Returns 1 if the extended double-precision floating-point value `a' is less
4717 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4718 | do not cause an exception.  Otherwise, the comparison is performed according
4719 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4720 *----------------------------------------------------------------------------*/
4721
4722 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4723 {
4724     flag aSign, bSign;
4725
4726     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4727               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4728          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4729               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4730        ) {
4731         if (    floatx80_is_signaling_nan( a )
4732              || floatx80_is_signaling_nan( b ) ) {
4733             float_raise( float_flag_invalid STATUS_VAR);
4734         }
4735         return 0;
4736     }
4737     aSign = extractFloatx80Sign( a );
4738     bSign = extractFloatx80Sign( b );
4739     if ( aSign != bSign ) {
4740         return
4741                aSign
4742             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4743                  == 0 );
4744     }
4745     return
4746           aSign ? le128( b.high, b.low, a.high, a.low )
4747         : le128( a.high, a.low, b.high, b.low );
4748
4749 }
4750
4751 /*----------------------------------------------------------------------------
4752 | Returns 1 if the extended double-precision floating-point value `a' is less
4753 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4754 | an exception.  Otherwise, the comparison is performed according to the
4755 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4756 *----------------------------------------------------------------------------*/
4757
4758 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4759 {
4760     flag aSign, bSign;
4761
4762     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4763               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4764          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4765               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4766        ) {
4767         if (    floatx80_is_signaling_nan( a )
4768              || floatx80_is_signaling_nan( b ) ) {
4769             float_raise( float_flag_invalid STATUS_VAR);
4770         }
4771         return 0;
4772     }
4773     aSign = extractFloatx80Sign( a );
4774     bSign = extractFloatx80Sign( b );
4775     if ( aSign != bSign ) {
4776         return
4777                aSign
4778             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4779                  != 0 );
4780     }
4781     return
4782           aSign ? lt128( b.high, b.low, a.high, a.low )
4783         : lt128( a.high, a.low, b.high, b.low );
4784
4785 }
4786
4787 /*----------------------------------------------------------------------------
4788 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4789 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
4790 | The comparison is performed according to the IEC/IEEE Standard for Binary
4791 | Floating-Point Arithmetic.
4792 *----------------------------------------------------------------------------*/
4793 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4794 {
4795     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4796               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4797          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4798               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4799        ) {
4800         if (    floatx80_is_signaling_nan( a )
4801              || floatx80_is_signaling_nan( b ) ) {
4802             float_raise( float_flag_invalid STATUS_VAR);
4803         }
4804         return 1;
4805     }
4806     return 0;
4807 }
4808
4809 /*----------------------------------------------------------------------------
4810 | Returns the result of converting the quadruple-precision floating-point
4811 | value `a' to the 32-bit two's complement integer format.  The conversion
4812 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4813 | Arithmetic---which means in particular that the conversion is rounded
4814 | according to the current rounding mode.  If `a' is a NaN, the largest
4815 | positive integer is returned.  Otherwise, if the conversion overflows, the
4816 | largest integer with the same sign as `a' is returned.
4817 *----------------------------------------------------------------------------*/
4818
4819 int32 float128_to_int32( float128 a STATUS_PARAM )
4820 {
4821     flag aSign;
4822     int32 aExp, shiftCount;
4823     uint64_t aSig0, aSig1;
4824
4825     aSig1 = extractFloat128Frac1( a );
4826     aSig0 = extractFloat128Frac0( a );
4827     aExp = extractFloat128Exp( a );
4828     aSign = extractFloat128Sign( a );
4829     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4830     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4831     aSig0 |= ( aSig1 != 0 );
4832     shiftCount = 0x4028 - aExp;
4833     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4834     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4835
4836 }
4837
4838 /*----------------------------------------------------------------------------
4839 | Returns the result of converting the quadruple-precision floating-point
4840 | value `a' to the 32-bit two's complement integer format.  The conversion
4841 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4842 | Arithmetic, except that the conversion is always rounded toward zero.  If
4843 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4844 | conversion overflows, the largest integer with the same sign as `a' is
4845 | returned.
4846 *----------------------------------------------------------------------------*/
4847
4848 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4849 {
4850     flag aSign;
4851     int32 aExp, shiftCount;
4852     uint64_t aSig0, aSig1, savedASig;
4853     int32 z;
4854
4855     aSig1 = extractFloat128Frac1( a );
4856     aSig0 = extractFloat128Frac0( a );
4857     aExp = extractFloat128Exp( a );
4858     aSign = extractFloat128Sign( a );
4859     aSig0 |= ( aSig1 != 0 );
4860     if ( 0x401E < aExp ) {
4861         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4862         goto invalid;
4863     }
4864     else if ( aExp < 0x3FFF ) {
4865         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4866         return 0;
4867     }
4868     aSig0 |= LIT64( 0x0001000000000000 );
4869     shiftCount = 0x402F - aExp;
4870     savedASig = aSig0;
4871     aSig0 >>= shiftCount;
4872     z = aSig0;
4873     if ( aSign ) z = - z;
4874     if ( ( z < 0 ) ^ aSign ) {
4875  invalid:
4876         float_raise( float_flag_invalid STATUS_VAR);
4877         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4878     }
4879     if ( ( aSig0<<shiftCount ) != savedASig ) {
4880         STATUS(float_exception_flags) |= float_flag_inexact;
4881     }
4882     return z;
4883
4884 }
4885
4886 /*----------------------------------------------------------------------------
4887 | Returns the result of converting the quadruple-precision floating-point
4888 | value `a' to the 64-bit two's complement integer format.  The conversion
4889 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4890 | Arithmetic---which means in particular that the conversion is rounded
4891 | according to the current rounding mode.  If `a' is a NaN, the largest
4892 | positive integer is returned.  Otherwise, if the conversion overflows, the
4893 | largest integer with the same sign as `a' is returned.
4894 *----------------------------------------------------------------------------*/
4895
4896 int64 float128_to_int64( float128 a STATUS_PARAM )
4897 {
4898     flag aSign;
4899     int32 aExp, shiftCount;
4900     uint64_t aSig0, aSig1;
4901
4902     aSig1 = extractFloat128Frac1( a );
4903     aSig0 = extractFloat128Frac0( a );
4904     aExp = extractFloat128Exp( a );
4905     aSign = extractFloat128Sign( a );
4906     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4907     shiftCount = 0x402F - aExp;
4908     if ( shiftCount <= 0 ) {
4909         if ( 0x403E < aExp ) {
4910             float_raise( float_flag_invalid STATUS_VAR);
4911             if (    ! aSign
4912                  || (    ( aExp == 0x7FFF )
4913                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4914                     )
4915                ) {
4916                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4917             }
4918             return (int64_t) LIT64( 0x8000000000000000 );
4919         }
4920         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4921     }
4922     else {
4923         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4924     }
4925     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4926
4927 }
4928
4929 /*----------------------------------------------------------------------------
4930 | Returns the result of converting the quadruple-precision floating-point
4931 | value `a' to the 64-bit two's complement integer format.  The conversion
4932 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4933 | Arithmetic, except that the conversion is always rounded toward zero.
4934 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4935 | the conversion overflows, the largest integer with the same sign as `a' is
4936 | returned.
4937 *----------------------------------------------------------------------------*/
4938
4939 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4940 {
4941     flag aSign;
4942     int32 aExp, shiftCount;
4943     uint64_t aSig0, aSig1;
4944     int64 z;
4945
4946     aSig1 = extractFloat128Frac1( a );
4947     aSig0 = extractFloat128Frac0( a );
4948     aExp = extractFloat128Exp( a );
4949     aSign = extractFloat128Sign( a );
4950     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4951     shiftCount = aExp - 0x402F;
4952     if ( 0 < shiftCount ) {
4953         if ( 0x403E <= aExp ) {
4954             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4955             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4956                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4957                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4958             }
4959             else {
4960                 float_raise( float_flag_invalid STATUS_VAR);
4961                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4962                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4963                 }
4964             }
4965             return (int64_t) LIT64( 0x8000000000000000 );
4966         }
4967         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4968         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4969             STATUS(float_exception_flags) |= float_flag_inexact;
4970         }
4971     }
4972     else {
4973         if ( aExp < 0x3FFF ) {
4974             if ( aExp | aSig0 | aSig1 ) {
4975                 STATUS(float_exception_flags) |= float_flag_inexact;
4976             }
4977             return 0;
4978         }
4979         z = aSig0>>( - shiftCount );
4980         if (    aSig1
4981              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4982             STATUS(float_exception_flags) |= float_flag_inexact;
4983         }
4984     }
4985     if ( aSign ) z = - z;
4986     return z;
4987
4988 }
4989
4990 /*----------------------------------------------------------------------------
4991 | Returns the result of converting the quadruple-precision floating-point
4992 | value `a' to the single-precision floating-point format.  The conversion
4993 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4994 | Arithmetic.
4995 *----------------------------------------------------------------------------*/
4996
4997 float32 float128_to_float32( float128 a STATUS_PARAM )
4998 {
4999     flag aSign;
5000     int32 aExp;
5001     uint64_t aSig0, aSig1;
5002     uint32_t zSig;
5003
5004     aSig1 = extractFloat128Frac1( a );
5005     aSig0 = extractFloat128Frac0( a );
5006     aExp = extractFloat128Exp( a );
5007     aSign = extractFloat128Sign( a );
5008     if ( aExp == 0x7FFF ) {
5009         if ( aSig0 | aSig1 ) {
5010             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5011         }
5012         return packFloat32( aSign, 0xFF, 0 );
5013     }
5014     aSig0 |= ( aSig1 != 0 );
5015     shift64RightJamming( aSig0, 18, &aSig0 );
5016     zSig = aSig0;
5017     if ( aExp || zSig ) {
5018         zSig |= 0x40000000;
5019         aExp -= 0x3F81;
5020     }
5021     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5022
5023 }
5024
5025 /*----------------------------------------------------------------------------
5026 | Returns the result of converting the quadruple-precision floating-point
5027 | value `a' to the double-precision floating-point format.  The conversion
5028 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5029 | Arithmetic.
5030 *----------------------------------------------------------------------------*/
5031
5032 float64 float128_to_float64( float128 a STATUS_PARAM )
5033 {
5034     flag aSign;
5035     int32 aExp;
5036     uint64_t aSig0, aSig1;
5037
5038     aSig1 = extractFloat128Frac1( a );
5039     aSig0 = extractFloat128Frac0( a );
5040     aExp = extractFloat128Exp( a );
5041     aSign = extractFloat128Sign( a );
5042     if ( aExp == 0x7FFF ) {
5043         if ( aSig0 | aSig1 ) {
5044             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5045         }
5046         return packFloat64( aSign, 0x7FF, 0 );
5047     }
5048     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5049     aSig0 |= ( aSig1 != 0 );
5050     if ( aExp || aSig0 ) {
5051         aSig0 |= LIT64( 0x4000000000000000 );
5052         aExp -= 0x3C01;
5053     }
5054     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5055
5056 }
5057
5058 /*----------------------------------------------------------------------------
5059 | Returns the result of converting the quadruple-precision floating-point
5060 | value `a' to the extended double-precision floating-point format.  The
5061 | conversion is performed according to the IEC/IEEE Standard for Binary
5062 | Floating-Point Arithmetic.
5063 *----------------------------------------------------------------------------*/
5064
5065 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5066 {
5067     flag aSign;
5068     int32 aExp;
5069     uint64_t aSig0, aSig1;
5070
5071     aSig1 = extractFloat128Frac1( a );
5072     aSig0 = extractFloat128Frac0( a );
5073     aExp = extractFloat128Exp( a );
5074     aSign = extractFloat128Sign( a );
5075     if ( aExp == 0x7FFF ) {
5076         if ( aSig0 | aSig1 ) {
5077             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5078         }
5079         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5080     }
5081     if ( aExp == 0 ) {
5082         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5083         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5084     }
5085     else {
5086         aSig0 |= LIT64( 0x0001000000000000 );
5087     }
5088     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5089     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5090
5091 }
5092
5093 /*----------------------------------------------------------------------------
5094 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5095 | returns the result as a quadruple-precision floating-point value.  The
5096 | operation is performed according to the IEC/IEEE Standard for Binary
5097 | Floating-Point Arithmetic.
5098 *----------------------------------------------------------------------------*/
5099
5100 float128 float128_round_to_int( float128 a STATUS_PARAM )
5101 {
5102     flag aSign;
5103     int32 aExp;
5104     uint64_t lastBitMask, roundBitsMask;
5105     int8 roundingMode;
5106     float128 z;
5107
5108     aExp = extractFloat128Exp( a );
5109     if ( 0x402F <= aExp ) {
5110         if ( 0x406F <= aExp ) {
5111             if (    ( aExp == 0x7FFF )
5112                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5113                ) {
5114                 return propagateFloat128NaN( a, a STATUS_VAR );
5115             }
5116             return a;
5117         }
5118         lastBitMask = 1;
5119         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5120         roundBitsMask = lastBitMask - 1;
5121         z = a;
5122         roundingMode = STATUS(float_rounding_mode);
5123         if ( roundingMode == float_round_nearest_even ) {
5124             if ( lastBitMask ) {
5125                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5126                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5127             }
5128             else {
5129                 if ( (int64_t) z.low < 0 ) {
5130                     ++z.high;
5131                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5132                 }
5133             }
5134         }
5135         else if ( roundingMode != float_round_to_zero ) {
5136             if (   extractFloat128Sign( z )
5137                  ^ ( roundingMode == float_round_up ) ) {
5138                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5139             }
5140         }
5141         z.low &= ~ roundBitsMask;
5142     }
5143     else {
5144         if ( aExp < 0x3FFF ) {
5145             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5146             STATUS(float_exception_flags) |= float_flag_inexact;
5147             aSign = extractFloat128Sign( a );
5148             switch ( STATUS(float_rounding_mode) ) {
5149              case float_round_nearest_even:
5150                 if (    ( aExp == 0x3FFE )
5151                      && (   extractFloat128Frac0( a )
5152                           | extractFloat128Frac1( a ) )
5153                    ) {
5154                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5155                 }
5156                 break;
5157              case float_round_down:
5158                 return
5159                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5160                     : packFloat128( 0, 0, 0, 0 );
5161              case float_round_up:
5162                 return
5163                       aSign ? packFloat128( 1, 0, 0, 0 )
5164                     : packFloat128( 0, 0x3FFF, 0, 0 );
5165             }
5166             return packFloat128( aSign, 0, 0, 0 );
5167         }
5168         lastBitMask = 1;
5169         lastBitMask <<= 0x402F - aExp;
5170         roundBitsMask = lastBitMask - 1;
5171         z.low = 0;
5172         z.high = a.high;
5173         roundingMode = STATUS(float_rounding_mode);
5174         if ( roundingMode == float_round_nearest_even ) {
5175             z.high += lastBitMask>>1;
5176             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5177                 z.high &= ~ lastBitMask;
5178             }
5179         }
5180         else if ( roundingMode != float_round_to_zero ) {
5181             if (   extractFloat128Sign( z )
5182                  ^ ( roundingMode == float_round_up ) ) {
5183                 z.high |= ( a.low != 0 );
5184                 z.high += roundBitsMask;
5185             }
5186         }
5187         z.high &= ~ roundBitsMask;
5188     }
5189     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5190         STATUS(float_exception_flags) |= float_flag_inexact;
5191     }
5192     return z;
5193
5194 }
5195
5196 /*----------------------------------------------------------------------------
5197 | Returns the result of adding the absolute values of the quadruple-precision
5198 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5199 | before being returned.  `zSign' is ignored if the result is a NaN.
5200 | The addition is performed according to the IEC/IEEE Standard for Binary
5201 | Floating-Point Arithmetic.
5202 *----------------------------------------------------------------------------*/
5203
5204 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5205 {
5206     int32 aExp, bExp, zExp;
5207     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5208     int32 expDiff;
5209
5210     aSig1 = extractFloat128Frac1( a );
5211     aSig0 = extractFloat128Frac0( a );
5212     aExp = extractFloat128Exp( a );
5213     bSig1 = extractFloat128Frac1( b );
5214     bSig0 = extractFloat128Frac0( b );
5215     bExp = extractFloat128Exp( b );
5216     expDiff = aExp - bExp;
5217     if ( 0 < expDiff ) {
5218         if ( aExp == 0x7FFF ) {
5219             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5220             return a;
5221         }
5222         if ( bExp == 0 ) {
5223             --expDiff;
5224         }
5225         else {
5226             bSig0 |= LIT64( 0x0001000000000000 );
5227         }
5228         shift128ExtraRightJamming(
5229             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5230         zExp = aExp;
5231     }
5232     else if ( expDiff < 0 ) {
5233         if ( bExp == 0x7FFF ) {
5234             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5235             return packFloat128( zSign, 0x7FFF, 0, 0 );
5236         }
5237         if ( aExp == 0 ) {
5238             ++expDiff;
5239         }
5240         else {
5241             aSig0 |= LIT64( 0x0001000000000000 );
5242         }
5243         shift128ExtraRightJamming(
5244             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5245         zExp = bExp;
5246     }
5247     else {
5248         if ( aExp == 0x7FFF ) {
5249             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5250                 return propagateFloat128NaN( a, b STATUS_VAR );
5251             }
5252             return a;
5253         }
5254         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5255         if ( aExp == 0 ) {
5256             if (STATUS(flush_to_zero)) {
5257                 if (zSig0 | zSig1) {
5258                     float_raise(float_flag_output_denormal STATUS_VAR);
5259                 }
5260                 return packFloat128(zSign, 0, 0, 0);
5261             }
5262             return packFloat128( zSign, 0, zSig0, zSig1 );
5263         }
5264         zSig2 = 0;
5265         zSig0 |= LIT64( 0x0002000000000000 );
5266         zExp = aExp;
5267         goto shiftRight1;
5268     }
5269     aSig0 |= LIT64( 0x0001000000000000 );
5270     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5271     --zExp;
5272     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5273     ++zExp;
5274  shiftRight1:
5275     shift128ExtraRightJamming(
5276         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5277  roundAndPack:
5278     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5279
5280 }
5281
5282 /*----------------------------------------------------------------------------
5283 | Returns the result of subtracting the absolute values of the quadruple-
5284 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5285 | difference is negated before being returned.  `zSign' is ignored if the
5286 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5287 | Standard for Binary Floating-Point Arithmetic.
5288 *----------------------------------------------------------------------------*/
5289
5290 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5291 {
5292     int32 aExp, bExp, zExp;
5293     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5294     int32 expDiff;
5295     float128 z;
5296
5297     aSig1 = extractFloat128Frac1( a );
5298     aSig0 = extractFloat128Frac0( a );
5299     aExp = extractFloat128Exp( a );
5300     bSig1 = extractFloat128Frac1( b );
5301     bSig0 = extractFloat128Frac0( b );
5302     bExp = extractFloat128Exp( b );
5303     expDiff = aExp - bExp;
5304     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5305     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5306     if ( 0 < expDiff ) goto aExpBigger;
5307     if ( expDiff < 0 ) goto bExpBigger;
5308     if ( aExp == 0x7FFF ) {
5309         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5310             return propagateFloat128NaN( a, b STATUS_VAR );
5311         }
5312         float_raise( float_flag_invalid STATUS_VAR);
5313         z.low = float128_default_nan_low;
5314         z.high = float128_default_nan_high;
5315         return z;
5316     }
5317     if ( aExp == 0 ) {
5318         aExp = 1;
5319         bExp = 1;
5320     }
5321     if ( bSig0 < aSig0 ) goto aBigger;
5322     if ( aSig0 < bSig0 ) goto bBigger;
5323     if ( bSig1 < aSig1 ) goto aBigger;
5324     if ( aSig1 < bSig1 ) goto bBigger;
5325     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5326  bExpBigger:
5327     if ( bExp == 0x7FFF ) {
5328         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5329         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5330     }
5331     if ( aExp == 0 ) {
5332         ++expDiff;
5333     }
5334     else {
5335         aSig0 |= LIT64( 0x4000000000000000 );
5336     }
5337     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5338     bSig0 |= LIT64( 0x4000000000000000 );
5339  bBigger:
5340     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5341     zExp = bExp;
5342     zSign ^= 1;
5343     goto normalizeRoundAndPack;
5344  aExpBigger:
5345     if ( aExp == 0x7FFF ) {
5346         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5347         return a;
5348     }
5349     if ( bExp == 0 ) {
5350         --expDiff;
5351     }
5352     else {
5353         bSig0 |= LIT64( 0x4000000000000000 );
5354     }
5355     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5356     aSig0 |= LIT64( 0x4000000000000000 );
5357  aBigger:
5358     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5359     zExp = aExp;
5360  normalizeRoundAndPack:
5361     --zExp;
5362     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5363
5364 }
5365
5366 /*----------------------------------------------------------------------------
5367 | Returns the result of adding the quadruple-precision floating-point values
5368 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5369 | for Binary Floating-Point Arithmetic.
5370 *----------------------------------------------------------------------------*/
5371
5372 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5373 {
5374     flag aSign, bSign;
5375
5376     aSign = extractFloat128Sign( a );
5377     bSign = extractFloat128Sign( b );
5378     if ( aSign == bSign ) {
5379         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5380     }
5381     else {
5382         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5383     }
5384
5385 }
5386
5387 /*----------------------------------------------------------------------------
5388 | Returns the result of subtracting the quadruple-precision floating-point
5389 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5390 | Standard for Binary Floating-Point Arithmetic.
5391 *----------------------------------------------------------------------------*/
5392
5393 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5394 {
5395     flag aSign, bSign;
5396
5397     aSign = extractFloat128Sign( a );
5398     bSign = extractFloat128Sign( b );
5399     if ( aSign == bSign ) {
5400         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5401     }
5402     else {
5403         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5404     }
5405
5406 }
5407
5408 /*----------------------------------------------------------------------------
5409 | Returns the result of multiplying the quadruple-precision floating-point
5410 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5411 | Standard for Binary Floating-Point Arithmetic.
5412 *----------------------------------------------------------------------------*/
5413
5414 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5415 {
5416     flag aSign, bSign, zSign;
5417     int32 aExp, bExp, zExp;
5418     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5419     float128 z;
5420
5421     aSig1 = extractFloat128Frac1( a );
5422     aSig0 = extractFloat128Frac0( a );
5423     aExp = extractFloat128Exp( a );
5424     aSign = extractFloat128Sign( a );
5425     bSig1 = extractFloat128Frac1( b );
5426     bSig0 = extractFloat128Frac0( b );
5427     bExp = extractFloat128Exp( b );
5428     bSign = extractFloat128Sign( b );
5429     zSign = aSign ^ bSign;
5430     if ( aExp == 0x7FFF ) {
5431         if (    ( aSig0 | aSig1 )
5432              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5433             return propagateFloat128NaN( a, b STATUS_VAR );
5434         }
5435         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5436         return packFloat128( zSign, 0x7FFF, 0, 0 );
5437     }
5438     if ( bExp == 0x7FFF ) {
5439         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5440         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5441  invalid:
5442             float_raise( float_flag_invalid STATUS_VAR);
5443             z.low = float128_default_nan_low;
5444             z.high = float128_default_nan_high;
5445             return z;
5446         }
5447         return packFloat128( zSign, 0x7FFF, 0, 0 );
5448     }
5449     if ( aExp == 0 ) {
5450         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5451         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5452     }
5453     if ( bExp == 0 ) {
5454         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5455         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5456     }
5457     zExp = aExp + bExp - 0x4000;
5458     aSig0 |= LIT64( 0x0001000000000000 );
5459     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5460     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5461     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5462     zSig2 |= ( zSig3 != 0 );
5463     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5464         shift128ExtraRightJamming(
5465             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5466         ++zExp;
5467     }
5468     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5469
5470 }
5471
5472 /*----------------------------------------------------------------------------
5473 | Returns the result of dividing the quadruple-precision floating-point value
5474 | `a' by the corresponding value `b'.  The operation is performed according to
5475 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5476 *----------------------------------------------------------------------------*/
5477
5478 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5479 {
5480     flag aSign, bSign, zSign;
5481     int32 aExp, bExp, zExp;
5482     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5483     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5484     float128 z;
5485
5486     aSig1 = extractFloat128Frac1( a );
5487     aSig0 = extractFloat128Frac0( a );
5488     aExp = extractFloat128Exp( a );
5489     aSign = extractFloat128Sign( a );
5490     bSig1 = extractFloat128Frac1( b );
5491     bSig0 = extractFloat128Frac0( b );
5492     bExp = extractFloat128Exp( b );
5493     bSign = extractFloat128Sign( b );
5494     zSign = aSign ^ bSign;
5495     if ( aExp == 0x7FFF ) {
5496         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5497         if ( bExp == 0x7FFF ) {
5498             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5499             goto invalid;
5500         }
5501         return packFloat128( zSign, 0x7FFF, 0, 0 );
5502     }
5503     if ( bExp == 0x7FFF ) {
5504         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5505         return packFloat128( zSign, 0, 0, 0 );
5506     }
5507     if ( bExp == 0 ) {
5508         if ( ( bSig0 | bSig1 ) == 0 ) {
5509             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5510  invalid:
5511                 float_raise( float_flag_invalid STATUS_VAR);
5512                 z.low = float128_default_nan_low;
5513                 z.high = float128_default_nan_high;
5514                 return z;
5515             }
5516             float_raise( float_flag_divbyzero STATUS_VAR);
5517             return packFloat128( zSign, 0x7FFF, 0, 0 );
5518         }
5519         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5520     }
5521     if ( aExp == 0 ) {
5522         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5523         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5524     }
5525     zExp = aExp - bExp + 0x3FFD;
5526     shortShift128Left(
5527         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5528     shortShift128Left(
5529         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5530     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5531         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5532         ++zExp;
5533     }
5534     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5535     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5536     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5537     while ( (int64_t) rem0 < 0 ) {
5538         --zSig0;
5539         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5540     }
5541     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5542     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5543         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5544         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5545         while ( (int64_t) rem1 < 0 ) {
5546             --zSig1;
5547             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5548         }
5549         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5550     }
5551     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5552     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5553
5554 }
5555
5556 /*----------------------------------------------------------------------------
5557 | Returns the remainder of the quadruple-precision floating-point value `a'
5558 | with respect to the corresponding value `b'.  The operation is performed
5559 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5560 *----------------------------------------------------------------------------*/
5561
5562 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5563 {
5564     flag aSign, zSign;
5565     int32 aExp, bExp, expDiff;
5566     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5567     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5568     int64_t sigMean0;
5569     float128 z;
5570
5571     aSig1 = extractFloat128Frac1( a );
5572     aSig0 = extractFloat128Frac0( a );
5573     aExp = extractFloat128Exp( a );
5574     aSign = extractFloat128Sign( a );
5575     bSig1 = extractFloat128Frac1( b );
5576     bSig0 = extractFloat128Frac0( b );
5577     bExp = extractFloat128Exp( b );
5578     if ( aExp == 0x7FFF ) {
5579         if (    ( aSig0 | aSig1 )
5580              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5581             return propagateFloat128NaN( a, b STATUS_VAR );
5582         }
5583         goto invalid;
5584     }
5585     if ( bExp == 0x7FFF ) {
5586         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5587         return a;
5588     }
5589     if ( bExp == 0 ) {
5590         if ( ( bSig0 | bSig1 ) == 0 ) {
5591  invalid:
5592             float_raise( float_flag_invalid STATUS_VAR);
5593             z.low = float128_default_nan_low;
5594             z.high = float128_default_nan_high;
5595             return z;
5596         }
5597         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5598     }
5599     if ( aExp == 0 ) {
5600         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5601         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5602     }
5603     expDiff = aExp - bExp;
5604     if ( expDiff < -1 ) return a;
5605     shortShift128Left(
5606         aSig0 | LIT64( 0x0001000000000000 ),
5607         aSig1,
5608         15 - ( expDiff < 0 ),
5609         &aSig0,
5610         &aSig1
5611     );
5612     shortShift128Left(
5613         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5614     q = le128( bSig0, bSig1, aSig0, aSig1 );
5615     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5616     expDiff -= 64;
5617     while ( 0 < expDiff ) {
5618         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5619         q = ( 4 < q ) ? q - 4 : 0;
5620         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5621         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5622         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5623         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5624         expDiff -= 61;
5625     }
5626     if ( -64 < expDiff ) {
5627         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5628         q = ( 4 < q ) ? q - 4 : 0;
5629         q >>= - expDiff;
5630         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5631         expDiff += 52;
5632         if ( expDiff < 0 ) {
5633             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5634         }
5635         else {
5636             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5637         }
5638         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5639         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5640     }
5641     else {
5642         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5643         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5644     }
5645     do {
5646         alternateASig0 = aSig0;
5647         alternateASig1 = aSig1;
5648         ++q;
5649         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5650     } while ( 0 <= (int64_t) aSig0 );
5651     add128(
5652         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5653     if (    ( sigMean0 < 0 )
5654          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5655         aSig0 = alternateASig0;
5656         aSig1 = alternateASig1;
5657     }
5658     zSign = ( (int64_t) aSig0 < 0 );
5659     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5660     return
5661         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5662
5663 }
5664
5665 /*----------------------------------------------------------------------------
5666 | Returns the square root of the quadruple-precision floating-point value `a'.
5667 | The operation is performed according to the IEC/IEEE Standard for Binary
5668 | Floating-Point Arithmetic.
5669 *----------------------------------------------------------------------------*/
5670
5671 float128 float128_sqrt( float128 a STATUS_PARAM )
5672 {
5673     flag aSign;
5674     int32 aExp, zExp;
5675     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5676     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5677     float128 z;
5678
5679     aSig1 = extractFloat128Frac1( a );
5680     aSig0 = extractFloat128Frac0( a );
5681     aExp = extractFloat128Exp( a );
5682     aSign = extractFloat128Sign( a );
5683     if ( aExp == 0x7FFF ) {
5684         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5685         if ( ! aSign ) return a;
5686         goto invalid;
5687     }
5688     if ( aSign ) {
5689         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5690  invalid:
5691         float_raise( float_flag_invalid STATUS_VAR);
5692         z.low = float128_default_nan_low;
5693         z.high = float128_default_nan_high;
5694         return z;
5695     }
5696     if ( aExp == 0 ) {
5697         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5698         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5699     }
5700     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5701     aSig0 |= LIT64( 0x0001000000000000 );
5702     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5703     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5704     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5705     doubleZSig0 = zSig0<<1;
5706     mul64To128( zSig0, zSig0, &term0, &term1 );
5707     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5708     while ( (int64_t) rem0 < 0 ) {
5709         --zSig0;
5710         doubleZSig0 -= 2;
5711         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5712     }
5713     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5714     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5715         if ( zSig1 == 0 ) zSig1 = 1;
5716         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5717         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5718         mul64To128( zSig1, zSig1, &term2, &term3 );
5719         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5720         while ( (int64_t) rem1 < 0 ) {
5721             --zSig1;
5722             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5723             term3 |= 1;
5724             term2 |= doubleZSig0;
5725             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5726         }
5727         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5728     }
5729     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5730     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5731
5732 }
5733
5734 /*----------------------------------------------------------------------------
5735 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5736 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5737 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5738 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5739 *----------------------------------------------------------------------------*/
5740
5741 int float128_eq( float128 a, float128 b STATUS_PARAM )
5742 {
5743
5744     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5745               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5746          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5747               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5748        ) {
5749         float_raise( float_flag_invalid STATUS_VAR);
5750         return 0;
5751     }
5752     return
5753            ( a.low == b.low )
5754         && (    ( a.high == b.high )
5755              || (    ( a.low == 0 )
5756                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5757            );
5758
5759 }
5760
5761 /*----------------------------------------------------------------------------
5762 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5763 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
5764 | exception is raised if either operand is a NaN.  The comparison is performed
5765 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5766 *----------------------------------------------------------------------------*/
5767
5768 int float128_le( float128 a, float128 b STATUS_PARAM )
5769 {
5770     flag aSign, bSign;
5771
5772     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5773               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5774          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5775               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5776        ) {
5777         float_raise( float_flag_invalid STATUS_VAR);
5778         return 0;
5779     }
5780     aSign = extractFloat128Sign( a );
5781     bSign = extractFloat128Sign( b );
5782     if ( aSign != bSign ) {
5783         return
5784                aSign
5785             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5786                  == 0 );
5787     }
5788     return
5789           aSign ? le128( b.high, b.low, a.high, a.low )
5790         : le128( a.high, a.low, b.high, b.low );
5791
5792 }
5793
5794 /*----------------------------------------------------------------------------
5795 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5796 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5797 | raised if either operand is a NaN.  The comparison is performed according
5798 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5799 *----------------------------------------------------------------------------*/
5800
5801 int float128_lt( float128 a, float128 b STATUS_PARAM )
5802 {
5803     flag aSign, bSign;
5804
5805     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5806               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5807          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5808               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5809        ) {
5810         float_raise( float_flag_invalid STATUS_VAR);
5811         return 0;
5812     }
5813     aSign = extractFloat128Sign( a );
5814     bSign = extractFloat128Sign( b );
5815     if ( aSign != bSign ) {
5816         return
5817                aSign
5818             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5819                  != 0 );
5820     }
5821     return
5822           aSign ? lt128( b.high, b.low, a.high, a.low )
5823         : lt128( a.high, a.low, b.high, b.low );
5824
5825 }
5826
5827 /*----------------------------------------------------------------------------
5828 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5829 | be compared, and 0 otherwise.  The invalid exception is raised if either
5830 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5831 | Standard for Binary Floating-Point Arithmetic.
5832 *----------------------------------------------------------------------------*/
5833
5834 int float128_unordered( float128 a, float128 b STATUS_PARAM )
5835 {
5836     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5837               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5838          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5839               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5840        ) {
5841         float_raise( float_flag_invalid STATUS_VAR);
5842         return 1;
5843     }
5844     return 0;
5845 }
5846
5847 /*----------------------------------------------------------------------------
5848 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5849 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5850 | exception.  The comparison is performed according to the IEC/IEEE Standard
5851 | for Binary Floating-Point Arithmetic.
5852 *----------------------------------------------------------------------------*/
5853
5854 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
5855 {
5856
5857     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5858               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5859          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5860               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5861        ) {
5862         if (    float128_is_signaling_nan( a )
5863              || float128_is_signaling_nan( b ) ) {
5864             float_raise( float_flag_invalid STATUS_VAR);
5865         }
5866         return 0;
5867     }
5868     return
5869            ( a.low == b.low )
5870         && (    ( a.high == b.high )
5871              || (    ( a.low == 0 )
5872                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5873            );
5874
5875 }
5876
5877 /*----------------------------------------------------------------------------
5878 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5879 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5880 | cause an exception.  Otherwise, the comparison is performed according to the
5881 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5882 *----------------------------------------------------------------------------*/
5883
5884 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5885 {
5886     flag aSign, bSign;
5887
5888     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5889               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5890          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5891               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5892        ) {
5893         if (    float128_is_signaling_nan( a )
5894              || float128_is_signaling_nan( b ) ) {
5895             float_raise( float_flag_invalid STATUS_VAR);
5896         }
5897         return 0;
5898     }
5899     aSign = extractFloat128Sign( a );
5900     bSign = extractFloat128Sign( b );
5901     if ( aSign != bSign ) {
5902         return
5903                aSign
5904             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5905                  == 0 );
5906     }
5907     return
5908           aSign ? le128( b.high, b.low, a.high, a.low )
5909         : le128( a.high, a.low, b.high, b.low );
5910
5911 }
5912
5913 /*----------------------------------------------------------------------------
5914 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5915 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5916 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5917 | Standard for Binary Floating-Point Arithmetic.
5918 *----------------------------------------------------------------------------*/
5919
5920 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5921 {
5922     flag aSign, bSign;
5923
5924     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5925               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5926          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5927               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5928        ) {
5929         if (    float128_is_signaling_nan( a )
5930              || float128_is_signaling_nan( b ) ) {
5931             float_raise( float_flag_invalid STATUS_VAR);
5932         }
5933         return 0;
5934     }
5935     aSign = extractFloat128Sign( a );
5936     bSign = extractFloat128Sign( b );
5937     if ( aSign != bSign ) {
5938         return
5939                aSign
5940             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5941                  != 0 );
5942     }
5943     return
5944           aSign ? lt128( b.high, b.low, a.high, a.low )
5945         : lt128( a.high, a.low, b.high, b.low );
5946
5947 }
5948
5949 /*----------------------------------------------------------------------------
5950 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5951 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5952 | comparison is performed according to the IEC/IEEE Standard for Binary
5953 | Floating-Point Arithmetic.
5954 *----------------------------------------------------------------------------*/
5955
5956 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
5957 {
5958     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5959               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5960          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5961               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5962        ) {
5963         if (    float128_is_signaling_nan( a )
5964              || float128_is_signaling_nan( b ) ) {
5965             float_raise( float_flag_invalid STATUS_VAR);
5966         }
5967         return 1;
5968     }
5969     return 0;
5970 }
5971
5972 /* misc functions */
5973 float32 uint32_to_float32( uint32 a STATUS_PARAM )
5974 {
5975     return int64_to_float32(a STATUS_VAR);
5976 }
5977
5978 float64 uint32_to_float64( uint32 a STATUS_PARAM )
5979 {
5980     return int64_to_float64(a STATUS_VAR);
5981 }
5982
5983 uint32 float32_to_uint32( float32 a STATUS_PARAM )
5984 {
5985     int64_t v;
5986     uint32 res;
5987
5988     v = float32_to_int64(a STATUS_VAR);
5989     if (v < 0) {
5990         res = 0;
5991         float_raise( float_flag_invalid STATUS_VAR);
5992     } else if (v > 0xffffffff) {
5993         res = 0xffffffff;
5994         float_raise( float_flag_invalid STATUS_VAR);
5995     } else {
5996         res = v;
5997     }
5998     return res;
5999 }
6000
6001 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6002 {
6003     int64_t v;
6004     uint32 res;
6005
6006     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6007     if (v < 0) {
6008         res = 0;
6009         float_raise( float_flag_invalid STATUS_VAR);
6010     } else if (v > 0xffffffff) {
6011         res = 0xffffffff;
6012         float_raise( float_flag_invalid STATUS_VAR);
6013     } else {
6014         res = v;
6015     }
6016     return res;
6017 }
6018
6019 uint16 float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6020 {
6021     int64_t v;
6022     uint16 res;
6023
6024     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6025     if (v < 0) {
6026         res = 0;
6027         float_raise( float_flag_invalid STATUS_VAR);
6028     } else if (v > 0xffff) {
6029         res = 0xffff;
6030         float_raise( float_flag_invalid STATUS_VAR);
6031     } else {
6032         res = v;
6033     }
6034     return res;
6035 }
6036
6037 uint32 float64_to_uint32( float64 a STATUS_PARAM )
6038 {
6039     int64_t v;
6040     uint32 res;
6041
6042     v = float64_to_int64(a STATUS_VAR);
6043     if (v < 0) {
6044         res = 0;
6045         float_raise( float_flag_invalid STATUS_VAR);
6046     } else if (v > 0xffffffff) {
6047         res = 0xffffffff;
6048         float_raise( float_flag_invalid STATUS_VAR);
6049     } else {
6050         res = v;
6051     }
6052     return res;
6053 }
6054
6055 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6056 {
6057     int64_t v;
6058     uint32 res;
6059
6060     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6061     if (v < 0) {
6062         res = 0;
6063         float_raise( float_flag_invalid STATUS_VAR);
6064     } else if (v > 0xffffffff) {
6065         res = 0xffffffff;
6066         float_raise( float_flag_invalid STATUS_VAR);
6067     } else {
6068         res = v;
6069     }
6070     return res;
6071 }
6072
6073 uint16 float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6074 {
6075     int64_t v;
6076     uint16 res;
6077
6078     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6079     if (v < 0) {
6080         res = 0;
6081         float_raise( float_flag_invalid STATUS_VAR);
6082     } else if (v > 0xffff) {
6083         res = 0xffff;
6084         float_raise( float_flag_invalid STATUS_VAR);
6085     } else {
6086         res = v;
6087     }
6088     return res;
6089 }
6090
6091 /* FIXME: This looks broken.  */
6092 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6093 {
6094     int64_t v;
6095
6096     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6097     v += float64_val(a);
6098     v = float64_to_int64(make_float64(v) STATUS_VAR);
6099
6100     return v - INT64_MIN;
6101 }
6102
6103 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6104 {
6105     int64_t v;
6106
6107     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6108     v += float64_val(a);
6109     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6110
6111     return v - INT64_MIN;
6112 }
6113
6114 #define COMPARE(s, nan_exp)                                                  \
6115 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6116                                       int is_quiet STATUS_PARAM )            \
6117 {                                                                            \
6118     flag aSign, bSign;                                                       \
6119     uint ## s ## _t av, bv;                                                  \
6120     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6121     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6122                                                                              \
6123     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6124          extractFloat ## s ## Frac( a ) ) ||                                 \
6125         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6126           extractFloat ## s ## Frac( b ) )) {                                \
6127         if (!is_quiet ||                                                     \
6128             float ## s ## _is_signaling_nan( a ) ||                          \
6129             float ## s ## _is_signaling_nan( b ) ) {                         \
6130             float_raise( float_flag_invalid STATUS_VAR);                     \
6131         }                                                                    \
6132         return float_relation_unordered;                                     \
6133     }                                                                        \
6134     aSign = extractFloat ## s ## Sign( a );                                  \
6135     bSign = extractFloat ## s ## Sign( b );                                  \
6136     av = float ## s ## _val(a);                                              \
6137     bv = float ## s ## _val(b);                                              \
6138     if ( aSign != bSign ) {                                                  \
6139         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6140             /* zero case */                                                  \
6141             return float_relation_equal;                                     \
6142         } else {                                                             \
6143             return 1 - (2 * aSign);                                          \
6144         }                                                                    \
6145     } else {                                                                 \
6146         if (av == bv) {                                                      \
6147             return float_relation_equal;                                     \
6148         } else {                                                             \
6149             return 1 - 2 * (aSign ^ ( av < bv ));                            \
6150         }                                                                    \
6151     }                                                                        \
6152 }                                                                            \
6153                                                                              \
6154 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6155 {                                                                            \
6156     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6157 }                                                                            \
6158                                                                              \
6159 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6160 {                                                                            \
6161     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6162 }
6163
6164 COMPARE(32, 0xff)
6165 COMPARE(64, 0x7ff)
6166
6167 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6168                                       int is_quiet STATUS_PARAM )
6169 {
6170     flag aSign, bSign;
6171
6172     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6173           ( extractFloatx80Frac( a )<<1 ) ) ||
6174         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6175           ( extractFloatx80Frac( b )<<1 ) )) {
6176         if (!is_quiet ||
6177             floatx80_is_signaling_nan( a ) ||
6178             floatx80_is_signaling_nan( b ) ) {
6179             float_raise( float_flag_invalid STATUS_VAR);
6180         }
6181         return float_relation_unordered;
6182     }
6183     aSign = extractFloatx80Sign( a );
6184     bSign = extractFloatx80Sign( b );
6185     if ( aSign != bSign ) {
6186
6187         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6188              ( ( a.low | b.low ) == 0 ) ) {
6189             /* zero case */
6190             return float_relation_equal;
6191         } else {
6192             return 1 - (2 * aSign);
6193         }
6194     } else {
6195         if (a.low == b.low && a.high == b.high) {
6196             return float_relation_equal;
6197         } else {
6198             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6199         }
6200     }
6201 }
6202
6203 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6204 {
6205     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6206 }
6207
6208 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6209 {
6210     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6211 }
6212
6213 INLINE int float128_compare_internal( float128 a, float128 b,
6214                                       int is_quiet STATUS_PARAM )
6215 {
6216     flag aSign, bSign;
6217
6218     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6219           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6220         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6221           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6222         if (!is_quiet ||
6223             float128_is_signaling_nan( a ) ||
6224             float128_is_signaling_nan( b ) ) {
6225             float_raise( float_flag_invalid STATUS_VAR);
6226         }
6227         return float_relation_unordered;
6228     }
6229     aSign = extractFloat128Sign( a );
6230     bSign = extractFloat128Sign( b );
6231     if ( aSign != bSign ) {
6232         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6233             /* zero case */
6234             return float_relation_equal;
6235         } else {
6236             return 1 - (2 * aSign);
6237         }
6238     } else {
6239         if (a.low == b.low && a.high == b.high) {
6240             return float_relation_equal;
6241         } else {
6242             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6243         }
6244     }
6245 }
6246
6247 int float128_compare( float128 a, float128 b STATUS_PARAM )
6248 {
6249     return float128_compare_internal(a, b, 0 STATUS_VAR);
6250 }
6251
6252 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6253 {
6254     return float128_compare_internal(a, b, 1 STATUS_VAR);
6255 }
6256
6257 /* min() and max() functions. These can't be implemented as
6258  * 'compare and pick one input' because that would mishandle
6259  * NaNs and +0 vs -0.
6260  */
6261 #define MINMAX(s, nan_exp)                                              \
6262 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6263                                         int ismin STATUS_PARAM )        \
6264 {                                                                       \
6265     flag aSign, bSign;                                                  \
6266     uint ## s ## _t av, bv;                                             \
6267     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6268     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6269     if (float ## s ## _is_any_nan(a) ||                                 \
6270         float ## s ## _is_any_nan(b)) {                                 \
6271         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6272     }                                                                   \
6273     aSign = extractFloat ## s ## Sign(a);                               \
6274     bSign = extractFloat ## s ## Sign(b);                               \
6275     av = float ## s ## _val(a);                                         \
6276     bv = float ## s ## _val(b);                                         \
6277     if (aSign != bSign) {                                               \
6278         if (ismin) {                                                    \
6279             return aSign ? a : b;                                       \
6280         } else {                                                        \
6281             return aSign ? b : a;                                       \
6282         }                                                               \
6283     } else {                                                            \
6284         if (ismin) {                                                    \
6285             return (aSign ^ (av < bv)) ? a : b;                         \
6286         } else {                                                        \
6287             return (aSign ^ (av < bv)) ? b : a;                         \
6288         }                                                               \
6289     }                                                                   \
6290 }                                                                       \
6291                                                                         \
6292 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6293 {                                                                       \
6294     return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6295 }                                                                       \
6296                                                                         \
6297 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6298 {                                                                       \
6299     return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6300 }
6301
6302 MINMAX(32, 0xff)
6303 MINMAX(64, 0x7ff)
6304
6305
6306 /* Multiply A by 2 raised to the power N.  */
6307 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6308 {
6309     flag aSign;
6310     int16_t aExp;
6311     uint32_t aSig;
6312
6313     a = float32_squash_input_denormal(a STATUS_VAR);
6314     aSig = extractFloat32Frac( a );
6315     aExp = extractFloat32Exp( a );
6316     aSign = extractFloat32Sign( a );
6317
6318     if ( aExp == 0xFF ) {
6319         if ( aSig ) {
6320             return propagateFloat32NaN( a, a STATUS_VAR );
6321         }
6322         return a;
6323     }
6324     if ( aExp != 0 )
6325         aSig |= 0x00800000;
6326     else if ( aSig == 0 )
6327         return a;
6328
6329     if (n > 0x200) {
6330         n = 0x200;
6331     } else if (n < -0x200) {
6332         n = -0x200;
6333     }
6334
6335     aExp += n - 1;
6336     aSig <<= 7;
6337     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6338 }
6339
6340 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6341 {
6342     flag aSign;
6343     int16_t aExp;
6344     uint64_t aSig;
6345
6346     a = float64_squash_input_denormal(a STATUS_VAR);
6347     aSig = extractFloat64Frac( a );
6348     aExp = extractFloat64Exp( a );
6349     aSign = extractFloat64Sign( a );
6350
6351     if ( aExp == 0x7FF ) {
6352         if ( aSig ) {
6353             return propagateFloat64NaN( a, a STATUS_VAR );
6354         }
6355         return a;
6356     }
6357     if ( aExp != 0 )
6358         aSig |= LIT64( 0x0010000000000000 );
6359     else if ( aSig == 0 )
6360         return a;
6361
6362     if (n > 0x1000) {
6363         n = 0x1000;
6364     } else if (n < -0x1000) {
6365         n = -0x1000;
6366     }
6367
6368     aExp += n - 1;
6369     aSig <<= 10;
6370     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6371 }
6372
6373 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6374 {
6375     flag aSign;
6376     int32_t aExp;
6377     uint64_t aSig;
6378
6379     aSig = extractFloatx80Frac( a );
6380     aExp = extractFloatx80Exp( a );
6381     aSign = extractFloatx80Sign( a );
6382
6383     if ( aExp == 0x7FFF ) {
6384         if ( aSig<<1 ) {
6385             return propagateFloatx80NaN( a, a STATUS_VAR );
6386         }
6387         return a;
6388     }
6389
6390     if (aExp == 0 && aSig == 0)
6391         return a;
6392
6393     if (n > 0x10000) {
6394         n = 0x10000;
6395     } else if (n < -0x10000) {
6396         n = -0x10000;
6397     }
6398
6399     aExp += n;
6400     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6401                                           aSign, aExp, aSig, 0 STATUS_VAR );
6402 }
6403
6404 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6405 {
6406     flag aSign;
6407     int32_t aExp;
6408     uint64_t aSig0, aSig1;
6409
6410     aSig1 = extractFloat128Frac1( a );
6411     aSig0 = extractFloat128Frac0( a );
6412     aExp = extractFloat128Exp( a );
6413     aSign = extractFloat128Sign( a );
6414     if ( aExp == 0x7FFF ) {
6415         if ( aSig0 | aSig1 ) {
6416             return propagateFloat128NaN( a, a STATUS_VAR );
6417         }
6418         return a;
6419     }
6420     if ( aExp != 0 )
6421         aSig0 |= LIT64( 0x0001000000000000 );
6422     else if ( aSig0 == 0 && aSig1 == 0 )
6423         return a;
6424
6425     if (n > 0x10000) {
6426         n = 0x10000;
6427     } else if (n < -0x10000) {
6428         n = -0x10000;
6429     }
6430
6431     aExp += n - 1;
6432     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6433                                           STATUS_VAR );
6434
6435 }