fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * Derived from SoftFloat.
   5  */
   6
   7 /*============================================================================
   8
   9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  10 Package, Release 2b.
  11
  12 Written by John R. Hauser.  This work was made possible in part by the
  13 International Computer Science Institute, located at Suite 600, 1947 Center
  14 Street, Berkeley, California 94704.  Funding was partially provided by the
  15 National Science Foundation under grant MIP-9311980.  The original version
  16 of this code was written as part of a project to build a fixed-point vector
  17 processor in collaboration with the University of California at Berkeley,
  18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  20 arithmetic/SoftFloat.html'.
  21
  22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  30
  31 Derivative works are acceptable, even for commercial purposes, so long as
  32 (1) the source code for the derivative work includes prominent notice that
  33 the work is derivative, and (2) the source code includes prominent notice with
  34 these four paragraphs for those parts of this code that are retained.
  35
  36 =============================================================================*/
  37
  38 /* softfloat (and in particular the code in softfloat-specialize.h) is
  39  * target-dependent and needs the TARGET_* macros.
  40  */
  41 #include "config.h"
  42
  43 #include "fpu/softfloat.h"
  44
  45 /*----------------------------------------------------------------------------
  46 | Primitive arithmetic functions, including multi-word arithmetic, and
  47 | division and square root approximations.  (Can be specialized to target if
  48 | desired.)
  49 *----------------------------------------------------------------------------*/
  50 #include "softfloat-macros.h"
  51
  52 /*----------------------------------------------------------------------------
  53 | Functions and definitions to determine:  (1) whether tininess for underflow
  54 | is detected before or after rounding by default, (2) what (if anything)
  55 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  56 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  57 | are propagated from function inputs to output.  These details are target-
  58 | specific.
  59 *----------------------------------------------------------------------------*/
  60 #include "softfloat-specialize.h"
  61
  62 void set_float_rounding_mode(int val STATUS_PARAM)
  63 {
  64     STATUS(float_rounding_mode) = val;
  65 }
  66
  67 void set_float_exception_flags(int val STATUS_PARAM)
  68 {
  69     STATUS(float_exception_flags) = val;
  70 }
  71
  72 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  73 {
  74     STATUS(floatx80_rounding_precision) = val;
  75 }
  76
  77 /*----------------------------------------------------------------------------
  78 | Returns the fraction bits of the half-precision floating-point value `a'.
  79 *----------------------------------------------------------------------------*/
  80
  81 INLINE uint32_t extractFloat16Frac(float16 a)
  82 {
  83     return float16_val(a) & 0x3ff;
  84 }
  85
  86 /*----------------------------------------------------------------------------
  87 | Returns the exponent bits of the half-precision floating-point value `a'.
  88 *----------------------------------------------------------------------------*/
  89
  90 INLINE int_fast16_t extractFloat16Exp(float16 a)
  91 {
  92     return (float16_val(a) >> 10) & 0x1f;
  93 }
  94
/*----------------------------------------------------------------------------
| Returns the sign bit of the half-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
  98
  99 INLINE flag extractFloat16Sign(float16 a)
 100 {
 101     return float16_val(a)>>15;
 102 }
 103
 104 /*----------------------------------------------------------------------------
 105 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 106 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 107 | input.  If `zSign' is 1, the input is negated before being converted to an
 108 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 109 | is simply rounded to an integer, with the inexact exception raised if the
 110 | input cannot be represented exactly as an integer.  However, if the fixed-
 111 | point input is too large, the invalid exception is raised and the largest
 112 | positive or negative integer is returned.
 113 *----------------------------------------------------------------------------*/
 114
 115 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
 116 {
 117     int8 roundingMode;
 118     flag roundNearestEven;
 119     int8 roundIncrement, roundBits;
 120     int32_t z;
 121
 122     roundingMode = STATUS(float_rounding_mode);
 123     roundNearestEven = ( roundingMode == float_round_nearest_even );
 124     roundIncrement = 0x40;
 125     if ( ! roundNearestEven ) {
 126         if ( roundingMode == float_round_to_zero ) {
 127             roundIncrement = 0;
 128         }
 129         else {
 130             roundIncrement = 0x7F;
 131             if ( zSign ) {
 132                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 133             }
 134             else {
 135                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 136             }
 137         }
 138     }
 139     roundBits = absZ & 0x7F;
 140     absZ = ( absZ + roundIncrement )>>7;
 141     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 142     z = absZ;
 143     if ( zSign ) z = - z;
 144     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 145         float_raise( float_flag_invalid STATUS_VAR);
 146         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 147     }
 148     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 149     return z;
 150
 151 }
 152
 153 /*----------------------------------------------------------------------------
 154 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 155 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 156 | and returns the properly rounded 64-bit integer corresponding to the input.
 157 | If `zSign' is 1, the input is negated before being converted to an integer.
 158 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 159 | the inexact exception raised if the input cannot be represented exactly as
 160 | an integer.  However, if the fixed-point input is too large, the invalid
 161 | exception is raised and the largest positive or negative integer is
 162 | returned.
 163 *----------------------------------------------------------------------------*/
 164
 165 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
 166 {
 167     int8 roundingMode;
 168     flag roundNearestEven, increment;
 169     int64_t z;
 170
 171     roundingMode = STATUS(float_rounding_mode);
 172     roundNearestEven = ( roundingMode == float_round_nearest_even );
 173     increment = ( (int64_t) absZ1 < 0 );
 174     if ( ! roundNearestEven ) {
 175         if ( roundingMode == float_round_to_zero ) {
 176             increment = 0;
 177         }
 178         else {
 179             if ( zSign ) {
 180                 increment = ( roundingMode == float_round_down ) && absZ1;
 181             }
 182             else {
 183                 increment = ( roundingMode == float_round_up ) && absZ1;
 184             }
 185         }
 186     }
 187     if ( increment ) {
 188         ++absZ0;
 189         if ( absZ0 == 0 ) goto overflow;
 190         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 191     }
 192     z = absZ0;
 193     if ( zSign ) z = - z;
 194     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 195  overflow:
 196         float_raise( float_flag_invalid STATUS_VAR);
 197         return
 198               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 199             : LIT64( 0x7FFFFFFFFFFFFFFF );
 200     }
 201     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 202     return z;
 203
 204 }
 205
 206 /*----------------------------------------------------------------------------
 207 | Returns the fraction bits of the single-precision floating-point value `a'.
 208 *----------------------------------------------------------------------------*/
 209
 210 INLINE uint32_t extractFloat32Frac( float32 a )
 211 {
 212
 213     return float32_val(a) & 0x007FFFFF;
 214
 215 }
 216
 217 /*----------------------------------------------------------------------------
 218 | Returns the exponent bits of the single-precision floating-point value `a'.
 219 *----------------------------------------------------------------------------*/
 220
 221 INLINE int_fast16_t extractFloat32Exp(float32 a)
 222 {
 223
 224     return ( float32_val(a)>>23 ) & 0xFF;
 225
 226 }
 227
 228 /*----------------------------------------------------------------------------
 229 | Returns the sign bit of the single-precision floating-point value `a'.
 230 *----------------------------------------------------------------------------*/
 231
 232 INLINE flag extractFloat32Sign( float32 a )
 233 {
 234
 235     return float32_val(a)>>31;
 236
 237 }
 238
 239 /*----------------------------------------------------------------------------
 240 | If `a' is denormal and we are in flush-to-zero mode then set the
 241 | input-denormal exception and return zero. Otherwise just return the value.
 242 *----------------------------------------------------------------------------*/
 243 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 244 {
 245     if (STATUS(flush_inputs_to_zero)) {
 246         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 247             float_raise(float_flag_input_denormal STATUS_VAR);
 248             return make_float32(float32_val(a) & 0x80000000);
 249         }
 250     }
 251     return a;
 252 }
 253
 254 /*----------------------------------------------------------------------------
 255 | Normalizes the subnormal single-precision floating-point value represented
 256 | by the denormalized significand `aSig'.  The normalized exponent and
 257 | significand are stored at the locations pointed to by `zExpPtr' and
 258 | `zSigPtr', respectively.
 259 *----------------------------------------------------------------------------*/
 260
 261 static void
 262  normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
 263 {
 264     int8 shiftCount;
 265
 266     shiftCount = countLeadingZeros32( aSig ) - 8;
 267     *zSigPtr = aSig<<shiftCount;
 268     *zExpPtr = 1 - shiftCount;
 269
 270 }
 271
 272 /*----------------------------------------------------------------------------
 273 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 274 | single-precision floating-point value, returning the result.  After being
 275 | shifted into the proper positions, the three fields are simply added
 276 | together to form the result.  This means that any integer portion of `zSig'
 277 | will be added into the exponent.  Since a properly normalized significand
 278 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 279 | than the desired result exponent whenever `zSig' is a complete, normalized
 280 | significand.
 281 *----------------------------------------------------------------------------*/
 282
 283 INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
 284 {
 285
 286     return make_float32(
 287           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 288
 289 }
 290
 291 /*----------------------------------------------------------------------------
 292 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 293 | and significand `zSig', and returns the proper single-precision floating-
 294 | point value corresponding to the abstract input.  Ordinarily, the abstract
 295 | value is simply rounded and packed into the single-precision format, with
 296 | the inexact exception raised if the abstract input cannot be represented
 297 | exactly.  However, if the abstract value is too large, the overflow and
 298 | inexact exceptions are raised and an infinity or maximal finite value is
 299 | returned.  If the abstract value is too small, the input value is rounded to
 300 | a subnormal number, and the underflow and inexact exceptions are raised if
 301 | the abstract input cannot be represented exactly as a subnormal single-
 302 | precision floating-point number.
 303 |     The input significand `zSig' has its binary point between bits 30
 304 | and 29, which is 7 bits to the left of the usual location.  This shifted
 305 | significand must be normalized or smaller.  If `zSig' is not normalized,
 306 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 307 | and it must not require rounding.  In the usual case that `zSig' is
 308 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 309 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 310 | Binary Floating-Point Arithmetic.
 311 *----------------------------------------------------------------------------*/
 312
 313 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
 314 {
 315     int8 roundingMode;
 316     flag roundNearestEven;
 317     int8 roundIncrement, roundBits;
 318     flag isTiny;
 319
 320     roundingMode = STATUS(float_rounding_mode);
 321     roundNearestEven = ( roundingMode == float_round_nearest_even );
 322     roundIncrement = 0x40;
 323     if ( ! roundNearestEven ) {
 324         if ( roundingMode == float_round_to_zero ) {
 325             roundIncrement = 0;
 326         }
 327         else {
 328             roundIncrement = 0x7F;
 329             if ( zSign ) {
 330                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 331             }
 332             else {
 333                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 334             }
 335         }
 336     }
 337     roundBits = zSig & 0x7F;
 338     if ( 0xFD <= (uint16_t) zExp ) {
 339         if (    ( 0xFD < zExp )
 340              || (    ( zExp == 0xFD )
 341                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 342            ) {
 343             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 344             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 345         }
 346         if ( zExp < 0 ) {
 347             if (STATUS(flush_to_zero)) {
 348                 float_raise(float_flag_output_denormal STATUS_VAR);
 349                 return packFloat32(zSign, 0, 0);
 350             }
 351             isTiny =
 352                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 353                 || ( zExp < -1 )
 354                 || ( zSig + roundIncrement < 0x80000000 );
 355             shift32RightJamming( zSig, - zExp, &zSig );
 356             zExp = 0;
 357             roundBits = zSig & 0x7F;
 358             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 359         }
 360     }
 361     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 362     zSig = ( zSig + roundIncrement )>>7;
 363     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 364     if ( zSig == 0 ) zExp = 0;
 365     return packFloat32( zSign, zExp, zSig );
 366
 367 }
 368
 369 /*----------------------------------------------------------------------------
 370 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 371 | and significand `zSig', and returns the proper single-precision floating-
 372 | point value corresponding to the abstract input.  This routine is just like
 373 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 374 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 375 | floating-point exponent.
 376 *----------------------------------------------------------------------------*/
 377
 378 static float32
 379  normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
 380 {
 381     int8 shiftCount;
 382
 383     shiftCount = countLeadingZeros32( zSig ) - 1;
 384     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 385
 386 }
 387
 388 /*----------------------------------------------------------------------------
 389 | Returns the fraction bits of the double-precision floating-point value `a'.
 390 *----------------------------------------------------------------------------*/
 391
 392 INLINE uint64_t extractFloat64Frac( float64 a )
 393 {
 394
 395     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 396
 397 }
 398
 399 /*----------------------------------------------------------------------------
 400 | Returns the exponent bits of the double-precision floating-point value `a'.
 401 *----------------------------------------------------------------------------*/
 402
 403 INLINE int_fast16_t extractFloat64Exp(float64 a)
 404 {
 405
 406     return ( float64_val(a)>>52 ) & 0x7FF;
 407
 408 }
 409
 410 /*----------------------------------------------------------------------------
 411 | Returns the sign bit of the double-precision floating-point value `a'.
 412 *----------------------------------------------------------------------------*/
 413
 414 INLINE flag extractFloat64Sign( float64 a )
 415 {
 416
 417     return float64_val(a)>>63;
 418
 419 }
 420
 421 /*----------------------------------------------------------------------------
 422 | If `a' is denormal and we are in flush-to-zero mode then set the
 423 | input-denormal exception and return zero. Otherwise just return the value.
 424 *----------------------------------------------------------------------------*/
 425 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 426 {
 427     if (STATUS(flush_inputs_to_zero)) {
 428         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 429             float_raise(float_flag_input_denormal STATUS_VAR);
 430             return make_float64(float64_val(a) & (1ULL << 63));
 431         }
 432     }
 433     return a;
 434 }
 435
 436 /*----------------------------------------------------------------------------
 437 | Normalizes the subnormal double-precision floating-point value represented
 438 | by the denormalized significand `aSig'.  The normalized exponent and
 439 | significand are stored at the locations pointed to by `zExpPtr' and
 440 | `zSigPtr', respectively.
 441 *----------------------------------------------------------------------------*/
 442
 443 static void
 444  normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
 445 {
 446     int8 shiftCount;
 447
 448     shiftCount = countLeadingZeros64( aSig ) - 11;
 449     *zSigPtr = aSig<<shiftCount;
 450     *zExpPtr = 1 - shiftCount;
 451
 452 }
 453
 454 /*----------------------------------------------------------------------------
 455 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 456 | double-precision floating-point value, returning the result.  After being
 457 | shifted into the proper positions, the three fields are simply added
 458 | together to form the result.  This means that any integer portion of `zSig'
 459 | will be added into the exponent.  Since a properly normalized significand
 460 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 461 | than the desired result exponent whenever `zSig' is a complete, normalized
 462 | significand.
 463 *----------------------------------------------------------------------------*/
 464
 465 INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
 466 {
 467
 468     return make_float64(
 469         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 470
 471 }
 472
 473 /*----------------------------------------------------------------------------
 474 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 475 | and significand `zSig', and returns the proper double-precision floating-
 476 | point value corresponding to the abstract input.  Ordinarily, the abstract
 477 | value is simply rounded and packed into the double-precision format, with
 478 | the inexact exception raised if the abstract input cannot be represented
 479 | exactly.  However, if the abstract value is too large, the overflow and
 480 | inexact exceptions are raised and an infinity or maximal finite value is
 481 | returned.  If the abstract value is too small, the input value is rounded
 482 | to a subnormal number, and the underflow and inexact exceptions are raised
 483 | if the abstract input cannot be represented exactly as a subnormal double-
 484 | precision floating-point number.
 485 |     The input significand `zSig' has its binary point between bits 62
 486 | and 61, which is 10 bits to the left of the usual location.  This shifted
 487 | significand must be normalized or smaller.  If `zSig' is not normalized,
 488 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 489 | and it must not require rounding.  In the usual case that `zSig' is
 490 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 491 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 492 | Binary Floating-Point Arithmetic.
 493 *----------------------------------------------------------------------------*/
 494
 495 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
 496 {
 497     int8 roundingMode;
 498     flag roundNearestEven;
 499     int_fast16_t roundIncrement, roundBits;
 500     flag isTiny;
 501
 502     roundingMode = STATUS(float_rounding_mode);
 503     roundNearestEven = ( roundingMode == float_round_nearest_even );
 504     roundIncrement = 0x200;
 505     if ( ! roundNearestEven ) {
 506         if ( roundingMode == float_round_to_zero ) {
 507             roundIncrement = 0;
 508         }
 509         else {
 510             roundIncrement = 0x3FF;
 511             if ( zSign ) {
 512                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 513             }
 514             else {
 515                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 516             }
 517         }
 518     }
 519     roundBits = zSig & 0x3FF;
 520     if ( 0x7FD <= (uint16_t) zExp ) {
 521         if (    ( 0x7FD < zExp )
 522              || (    ( zExp == 0x7FD )
 523                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 524            ) {
 525             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 526             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 527         }
 528         if ( zExp < 0 ) {
 529             if (STATUS(flush_to_zero)) {
 530                 float_raise(float_flag_output_denormal STATUS_VAR);
 531                 return packFloat64(zSign, 0, 0);
 532             }
 533             isTiny =
 534                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 535                 || ( zExp < -1 )
 536                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 537             shift64RightJamming( zSig, - zExp, &zSig );
 538             zExp = 0;
 539             roundBits = zSig & 0x3FF;
 540             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 541         }
 542     }
 543     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 544     zSig = ( zSig + roundIncrement )>>10;
 545     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 546     if ( zSig == 0 ) zExp = 0;
 547     return packFloat64( zSign, zExp, zSig );
 548
 549 }
 550
 551 /*----------------------------------------------------------------------------
 552 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 553 | and significand `zSig', and returns the proper double-precision floating-
 554 | point value corresponding to the abstract input.  This routine is just like
 555 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 556 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 557 | floating-point exponent.
 558 *----------------------------------------------------------------------------*/
 559
 560 static float64
 561  normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
 562 {
 563     int8 shiftCount;
 564
 565     shiftCount = countLeadingZeros64( zSig ) - 1;
 566     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 567
 568 }
 569
 570 /*----------------------------------------------------------------------------
 571 | Returns the fraction bits of the extended double-precision floating-point
 572 | value `a'.
 573 *----------------------------------------------------------------------------*/
 574
 575 INLINE uint64_t extractFloatx80Frac( floatx80 a )
 576 {
 577
 578     return a.low;
 579
 580 }
 581
 582 /*----------------------------------------------------------------------------
 583 | Returns the exponent bits of the extended double-precision floating-point
 584 | value `a'.
 585 *----------------------------------------------------------------------------*/
 586
 587 INLINE int32 extractFloatx80Exp( floatx80 a )
 588 {
 589
 590     return a.high & 0x7FFF;
 591
 592 }
 593
 594 /*----------------------------------------------------------------------------
 595 | Returns the sign bit of the extended double-precision floating-point value
 596 | `a'.
 597 *----------------------------------------------------------------------------*/
 598
 599 INLINE flag extractFloatx80Sign( floatx80 a )
 600 {
 601
 602     return a.high>>15;
 603
 604 }
 605
 606 /*----------------------------------------------------------------------------
 607 | Normalizes the subnormal extended double-precision floating-point value
 608 | represented by the denormalized significand `aSig'.  The normalized exponent
 609 | and significand are stored at the locations pointed to by `zExpPtr' and
 610 | `zSigPtr', respectively.
 611 *----------------------------------------------------------------------------*/
 612
 613 static void
 614  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 615 {
 616     int8 shiftCount;
 617
 618     shiftCount = countLeadingZeros64( aSig );
 619     *zSigPtr = aSig<<shiftCount;
 620     *zExpPtr = 1 - shiftCount;
 621
 622 }
 623
 624 /*----------------------------------------------------------------------------
 625 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 626 | extended double-precision floating-point value, returning the result.
 627 *----------------------------------------------------------------------------*/
 628
 629 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 630 {
 631     floatx80 z;
 632
 633     z.low = zSig;
 634     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 635     return z;
 636
 637 }
 638
 639 /*----------------------------------------------------------------------------
 640 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 641 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 642 | and returns the proper extended double-precision floating-point value
 643 | corresponding to the abstract input.  Ordinarily, the abstract value is
 644 | rounded and packed into the extended double-precision format, with the
 645 | inexact exception raised if the abstract input cannot be represented
 646 | exactly.  However, if the abstract value is too large, the overflow and
 647 | inexact exceptions are raised and an infinity or maximal finite value is
 648 | returned.  If the abstract value is too small, the input value is rounded to
 649 | a subnormal number, and the underflow and inexact exceptions are raised if
 650 | the abstract input cannot be represented exactly as a subnormal extended
 651 | double-precision floating-point number.
 652 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 653 | number of bits as single or double precision, respectively.  Otherwise, the
 654 | result is rounded to the full precision of the extended double-precision
 655 | format.
 656 |     The input significand must be normalized or smaller.  If the input
 657 | significand is not normalized, `zExp' must be 0; in that case, the result
 658 | returned is a subnormal number, and it must not require rounding.  The
 659 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 660 | Floating-Point Arithmetic.
 661 *----------------------------------------------------------------------------*/
 662
/*
 * Rounds the abstract value (zSign, zExp, zSig0:zSig1) and packs it into a
 * floatx80.  `roundingPrecision' selects how many significand bits survive:
 * 64 or 32 take the reduced-precision path directly below; 80 and every
 * other value take the full-precision path at the `precision80' label.
 * The rounding mode, tininess-detection mode and flush-to-zero setting are
 * read through STATUS(); exceptions are reported through float_raise() or
 * by OR-ing bits into STATUS(float_exception_flags).
 */
static floatx80
 roundAndPackFloatx80(
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 STATUS_PARAM)
{
    int8 roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64 roundIncrement, roundMask, roundBits;

    roundingMode = STATUS(float_rounding_mode);
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /* roundMask covers the bits of zSig0 that are discarded at the selected
       precision; roundIncrement starts out as half of the lowest kept bit. */
    if ( roundingPrecision == 64 ) {
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        goto precision80;
    }
    /* Fold any nonzero low word into bit 0 of zSig0 as a sticky bit. */
    zSig0 |= ( zSig1 != 0 );
    /* Directed rounding: add nothing when the mode truncates for this sign,
       otherwise add the whole mask so any discarded bit carries upward. */
    if ( ! roundNearestEven ) {
        if ( roundingMode == float_round_to_zero ) {
            roundIncrement = 0;
        }
        else {
            roundIncrement = roundMask;
            if ( zSign ) {
                if ( roundingMode == float_round_up ) roundIncrement = 0;
            }
            else {
                if ( roundingMode == float_round_down ) roundIncrement = 0;
            }
        }
    }
    roundBits = zSig0 & roundMask;
    /* The unsigned comparison is true exactly when zExp <= 0 or
       zExp >= 0x7FFE, i.e. for every exponent needing special handling. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /* Overflow: exponent too large, or exponent 0x7FFE and adding the
           round increment wraps zSig0 around. */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            /* With flush-to-zero enabled the result is a signed zero. */
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tiny if detected before rounding, if zExp is negative, or if
               adding the round increment does not wrap zSig0 around. */
            isTiny =
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            /* Recompute the discarded bits from the shifted significand. */
            roundBits = zSig0 & roundMask;
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
            zSig0 += roundIncrement;
            /* Bit 63 set after rounding: the exponent becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /* From here roundIncrement holds the lowest kept bit.  On an
               exact tie in nearest-even mode that bit is cleared as well. */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    /* Reduced-precision path for ordinary exponents. */
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    zSig0 += roundIncrement;
    /* A sum smaller than the increment means the addition wrapped: restart
       the significand at 0x8000000000000000 and bump the exponent. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    /* A zero significand is packed with a zero exponent. */
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full-precision path: zSig1 holds the bits below the kept significand.
       Nearest-even increments on its top bit; the directed modes increment
       only when zSig1 is nonzero and the mode rounds away from zero for
       this sign. */
    increment = ( (int64_t) zSig1 < 0 );
    if ( ! roundNearestEven ) {
        if ( roundingMode == float_round_to_zero ) {
            increment = 0;
        }
        else {
            if ( zSign ) {
                increment = ( roundingMode == float_round_down ) && zSig1;
            }
            else {
                increment = ( roundingMode == float_round_up ) && zSig1;
            }
        }
    }
    /* Same range test as above: true for zExp <= 0 or zExp >= 0x7FFE. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* No bits are discarded at full precision, so the finite result
               below (~roundMask) is an all-ones significand. */
            roundMask = 0;
 overflow:
            /* Shared by both paths; the reduced-precision path arrives here
               by goto with its own roundMask still set. */
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
            /* Modes that round toward zero for this sign return exponent
               0x7FFE with every kept significand bit set; the others return
               exponent 0x7FFF with only the integer bit set (the
               extended-precision infinity encoding). */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( zExp <= 0 ) {
            /* NOTE(review): unlike the reduced-precision path, this branch
               does not consult STATUS(flush_to_zero) - confirm that this is
               intended. */
            isTiny =
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
            /* The rounding decision is taken again on the shifted bits. */
            if ( roundNearestEven ) {
                increment = ( (int64_t) zSig1 < 0 );
            }
            else {
                if ( zSign ) {
                    increment = ( roundingMode == float_round_down ) && zSig1;
                }
                else {
                    increment = ( roundingMode == float_round_up ) && zSig1;
                }
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie in nearest-even mode: clear bit 0. */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Bit 63 set after rounding: the exponent becomes 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    /* Full-precision path for ordinary exponents. */
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* The increment wrapped the significand: carry into the
               exponent. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Exact tie in nearest-even mode: clear bit 0. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
 825
 826 /*----------------------------------------------------------------------------
 827 | Takes an abstract floating-point value having sign `zSign', exponent
 828 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 829 | and returns the proper extended double-precision floating-point value
 830 | corresponding to the abstract input.  This routine is just like
 831 | `roundAndPackFloatx80' except that the input significand does not have to be
 832 | normalized.
 833 *----------------------------------------------------------------------------*/
 834
 835 static floatx80
 836  normalizeRoundAndPackFloatx80(
 837      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 838  STATUS_PARAM)
 839 {
 840     int8 shiftCount;
 841
 842     if ( zSig0 == 0 ) {
 843         zSig0 = zSig1;
 844         zSig1 = 0;
 845         zExp -= 64;
 846     }
 847     shiftCount = countLeadingZeros64( zSig0 );
 848     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 849     zExp -= shiftCount;
 850     return
 851         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 852
 853 }
 854
 855 /*----------------------------------------------------------------------------
 856 | Returns the least-significant 64 fraction bits of the quadruple-precision
 857 | floating-point value `a'.
 858 *----------------------------------------------------------------------------*/
 859
 860 INLINE uint64_t extractFloat128Frac1( float128 a )
 861 {
 862
 863     return a.low;
 864
 865 }
 866
 867 /*----------------------------------------------------------------------------
 868 | Returns the most-significant 48 fraction bits of the quadruple-precision
 869 | floating-point value `a'.
 870 *----------------------------------------------------------------------------*/
 871
 872 INLINE uint64_t extractFloat128Frac0( float128 a )
 873 {
 874
 875     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 876
 877 }
 878
 879 /*----------------------------------------------------------------------------
 880 | Returns the exponent bits of the quadruple-precision floating-point value
 881 | `a'.
 882 *----------------------------------------------------------------------------*/
 883
 884 INLINE int32 extractFloat128Exp( float128 a )
 885 {
 886
 887     return ( a.high>>48 ) & 0x7FFF;
 888
 889 }
 890
 891 /*----------------------------------------------------------------------------
 892 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 893 *----------------------------------------------------------------------------*/
 894
 895 INLINE flag extractFloat128Sign( float128 a )
 896 {
 897
 898     return a.high>>63;
 899
 900 }
 901
 902 /*----------------------------------------------------------------------------
 903 | Normalizes the subnormal quadruple-precision floating-point value
 904 | represented by the denormalized significand formed by the concatenation of
 905 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 906 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 907 | significand are stored at the location pointed to by `zSig0Ptr', and the
 908 | least significant 64 bits of the normalized significand are stored at the
 909 | location pointed to by `zSig1Ptr'.
 910 *----------------------------------------------------------------------------*/
 911
 912 static void
 913  normalizeFloat128Subnormal(
 914      uint64_t aSig0,
 915      uint64_t aSig1,
 916      int32 *zExpPtr,
 917      uint64_t *zSig0Ptr,
 918      uint64_t *zSig1Ptr
 919  )
 920 {
 921     int8 shiftCount;
 922
 923     if ( aSig0 == 0 ) {
 924         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 925         if ( shiftCount < 0 ) {
 926             *zSig0Ptr = aSig1>>( - shiftCount );
 927             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 928         }
 929         else {
 930             *zSig0Ptr = aSig1<<shiftCount;
 931             *zSig1Ptr = 0;
 932         }
 933         *zExpPtr = - shiftCount - 63;
 934     }
 935     else {
 936         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 937         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 938         *zExpPtr = 1 - shiftCount;
 939     }
 940
 941 }
 942
 943 /*----------------------------------------------------------------------------
 944 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 945 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 946 | floating-point value, returning the result.  After being shifted into the
 947 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
| added together to form the most significant 64 bits of the result.  This
 949 | means that any integer portion of `zSig0' will be added into the exponent.
 950 | Since a properly normalized significand will have an integer portion equal
 951 | to 1, the `zExp' input should be 1 less than the desired result exponent
 952 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 953 | significand.
 954 *----------------------------------------------------------------------------*/
 955
 956 INLINE float128
 957  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
 958 {
 959     float128 z;
 960
 961     z.low = zSig1;
 962     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
 963     return z;
 964
 965 }
 966
 967 /*----------------------------------------------------------------------------
 968 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 969 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 970 | and `zSig2', and returns the proper quadruple-precision floating-point value
 971 | corresponding to the abstract input.  Ordinarily, the abstract value is
 972 | simply rounded and packed into the quadruple-precision format, with the
 973 | inexact exception raised if the abstract input cannot be represented
 974 | exactly.  However, if the abstract value is too large, the overflow and
 975 | inexact exceptions are raised and an infinity or maximal finite value is
 976 | returned.  If the abstract value is too small, the input value is rounded to
 977 | a subnormal number, and the underflow and inexact exceptions are raised if
 978 | the abstract input cannot be represented exactly as a subnormal quadruple-
 979 | precision floating-point number.
 980 |     The input significand must be normalized or smaller.  If the input
 981 | significand is not normalized, `zExp' must be 0; in that case, the result
 982 | returned is a subnormal number, and it must not require rounding.  In the
 983 | usual case that the input significand is normalized, `zExp' must be 1 less
 984 | than the ``true'' floating-point exponent.  The handling of underflow and
 985 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 986 *----------------------------------------------------------------------------*/
 987
 988 static float128
 989  roundAndPackFloat128(
 990      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
 991 {
 992     int8 roundingMode;
 993     flag roundNearestEven, increment, isTiny;
 994
 995     roundingMode = STATUS(float_rounding_mode);
 996     roundNearestEven = ( roundingMode == float_round_nearest_even );
 997     increment = ( (int64_t) zSig2 < 0 );
 998     if ( ! roundNearestEven ) {
 999         if ( roundingMode == float_round_to_zero ) {
1000             increment = 0;
1001         }
1002         else {
1003             if ( zSign ) {
1004                 increment = ( roundingMode == float_round_down ) && zSig2;
1005             }
1006             else {
1007                 increment = ( roundingMode == float_round_up ) && zSig2;
1008             }
1009         }
1010     }
1011     if ( 0x7FFD <= (uint32_t) zExp ) {
1012         if (    ( 0x7FFD < zExp )
1013              || (    ( zExp == 0x7FFD )
1014                   && eq128(
1015                          LIT64( 0x0001FFFFFFFFFFFF ),
1016                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1017                          zSig0,
1018                          zSig1
1019                      )
1020                   && increment
1021                 )
1022            ) {
1023             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1024             if (    ( roundingMode == float_round_to_zero )
1025                  || ( zSign && ( roundingMode == float_round_up ) )
1026                  || ( ! zSign && ( roundingMode == float_round_down ) )
1027                ) {
1028                 return
1029                     packFloat128(
1030                         zSign,
1031                         0x7FFE,
1032                         LIT64( 0x0000FFFFFFFFFFFF ),
1033                         LIT64( 0xFFFFFFFFFFFFFFFF )
1034                     );
1035             }
1036             return packFloat128( zSign, 0x7FFF, 0, 0 );
1037         }
1038         if ( zExp < 0 ) {
1039             if (STATUS(flush_to_zero)) {
1040                 float_raise(float_flag_output_denormal STATUS_VAR);
1041                 return packFloat128(zSign, 0, 0, 0);
1042             }
1043             isTiny =
1044                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1045                 || ( zExp < -1 )
1046                 || ! increment
1047                 || lt128(
1048                        zSig0,
1049                        zSig1,
1050                        LIT64( 0x0001FFFFFFFFFFFF ),
1051                        LIT64( 0xFFFFFFFFFFFFFFFF )
1052                    );
1053             shift128ExtraRightJamming(
1054                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1055             zExp = 0;
1056             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1057             if ( roundNearestEven ) {
1058                 increment = ( (int64_t) zSig2 < 0 );
1059             }
1060             else {
1061                 if ( zSign ) {
1062                     increment = ( roundingMode == float_round_down ) && zSig2;
1063                 }
1064                 else {
1065                     increment = ( roundingMode == float_round_up ) && zSig2;
1066                 }
1067             }
1068         }
1069     }
1070     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1071     if ( increment ) {
1072         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1073         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1074     }
1075     else {
1076         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1077     }
1078     return packFloat128( zSign, zExp, zSig0, zSig1 );
1079
1080 }
1081
1082 /*----------------------------------------------------------------------------
1083 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1085 | returns the proper quadruple-precision floating-point value corresponding
1086 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1087 | except that the input significand has fewer bits and does not have to be
1088 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1089 | point exponent.
1090 *----------------------------------------------------------------------------*/
1091
1092 static float128
1093  normalizeRoundAndPackFloat128(
1094      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1095 {
1096     int8 shiftCount;
1097     uint64_t zSig2;
1098
1099     if ( zSig0 == 0 ) {
1100         zSig0 = zSig1;
1101         zSig1 = 0;
1102         zExp -= 64;
1103     }
1104     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105     if ( 0 <= shiftCount ) {
1106         zSig2 = 0;
1107         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1108     }
1109     else {
1110         shift128ExtraRightJamming(
1111             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1112     }
1113     zExp -= shiftCount;
1114     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1115
1116 }
1117
1118 /*----------------------------------------------------------------------------
1119 | Returns the result of converting the 32-bit two's complement integer `a'
1120 | to the single-precision floating-point format.  The conversion is performed
1121 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122 *----------------------------------------------------------------------------*/
1123
1124 float32 int32_to_float32( int32 a STATUS_PARAM )
1125 {
1126     flag zSign;
1127
1128     if ( a == 0 ) return float32_zero;
1129     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1130     zSign = ( a < 0 );
1131     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1132
1133 }
1134
1135 /*----------------------------------------------------------------------------
1136 | Returns the result of converting the 32-bit two's complement integer `a'
1137 | to the double-precision floating-point format.  The conversion is performed
1138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1140
1141 float64 int32_to_float64( int32 a STATUS_PARAM )
1142 {
1143     flag zSign;
1144     uint32 absA;
1145     int8 shiftCount;
1146     uint64_t zSig;
1147
1148     if ( a == 0 ) return float64_zero;
1149     zSign = ( a < 0 );
1150     absA = zSign ? - a : a;
1151     shiftCount = countLeadingZeros32( absA ) + 21;
1152     zSig = absA;
1153     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1154
1155 }
1156
1157 /*----------------------------------------------------------------------------
1158 | Returns the result of converting the 32-bit two's complement integer `a'
1159 | to the extended double-precision floating-point format.  The conversion
1160 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1161 | Arithmetic.
1162 *----------------------------------------------------------------------------*/
1163
1164 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1165 {
1166     flag zSign;
1167     uint32 absA;
1168     int8 shiftCount;
1169     uint64_t zSig;
1170
1171     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1172     zSign = ( a < 0 );
1173     absA = zSign ? - a : a;
1174     shiftCount = countLeadingZeros32( absA ) + 32;
1175     zSig = absA;
1176     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1177
1178 }
1179
1180 /*----------------------------------------------------------------------------
1181 | Returns the result of converting the 32-bit two's complement integer `a' to
1182 | the quadruple-precision floating-point format.  The conversion is performed
1183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184 *----------------------------------------------------------------------------*/
1185
1186 float128 int32_to_float128( int32 a STATUS_PARAM )
1187 {
1188     flag zSign;
1189     uint32 absA;
1190     int8 shiftCount;
1191     uint64_t zSig0;
1192
1193     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1194     zSign = ( a < 0 );
1195     absA = zSign ? - a : a;
1196     shiftCount = countLeadingZeros32( absA ) + 17;
1197     zSig0 = absA;
1198     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1199
1200 }
1201
1202 /*----------------------------------------------------------------------------
1203 | Returns the result of converting the 64-bit two's complement integer `a'
1204 | to the single-precision floating-point format.  The conversion is performed
1205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1206 *----------------------------------------------------------------------------*/
1207
1208 float32 int64_to_float32( int64 a STATUS_PARAM )
1209 {
1210     flag zSign;
1211     uint64 absA;
1212     int8 shiftCount;
1213
1214     if ( a == 0 ) return float32_zero;
1215     zSign = ( a < 0 );
1216     absA = zSign ? - a : a;
1217     shiftCount = countLeadingZeros64( absA ) - 40;
1218     if ( 0 <= shiftCount ) {
1219         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1220     }
1221     else {
1222         shiftCount += 7;
1223         if ( shiftCount < 0 ) {
1224             shift64RightJamming( absA, - shiftCount, &absA );
1225         }
1226         else {
1227             absA <<= shiftCount;
1228         }
1229         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1230     }
1231
1232 }
1233
1234 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1235 {
1236     int8 shiftCount;
1237
1238     if ( a == 0 ) return float32_zero;
1239     shiftCount = countLeadingZeros64( a ) - 40;
1240     if ( 0 <= shiftCount ) {
1241         return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1242     }
1243     else {
1244         shiftCount += 7;
1245         if ( shiftCount < 0 ) {
1246             shift64RightJamming( a, - shiftCount, &a );
1247         }
1248         else {
1249             a <<= shiftCount;
1250         }
1251         return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1252     }
1253 }
1254
1255 /*----------------------------------------------------------------------------
1256 | Returns the result of converting the 64-bit two's complement integer `a'
1257 | to the double-precision floating-point format.  The conversion is performed
1258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259 *----------------------------------------------------------------------------*/
1260
1261 float64 int64_to_float64( int64 a STATUS_PARAM )
1262 {
1263     flag zSign;
1264
1265     if ( a == 0 ) return float64_zero;
1266     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1267         return packFloat64( 1, 0x43E, 0 );
1268     }
1269     zSign = ( a < 0 );
1270     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1271
1272 }
1273
1274 float64 uint64_to_float64(uint64 a STATUS_PARAM)
1275 {
1276     int exp =  0x43C;
1277
1278     if (a == 0) {
1279         return float64_zero;
1280     }
1281     if ((int64_t)a < 0) {
1282         shift64RightJamming(a, 1, &a);
1283         exp += 1;
1284     }
1285     return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1286 }
1287
1288 /*----------------------------------------------------------------------------
1289 | Returns the result of converting the 64-bit two's complement integer `a'
1290 | to the extended double-precision floating-point format.  The conversion
1291 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1292 | Arithmetic.
1293 *----------------------------------------------------------------------------*/
1294
1295 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1296 {
1297     flag zSign;
1298     uint64 absA;
1299     int8 shiftCount;
1300
1301     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1302     zSign = ( a < 0 );
1303     absA = zSign ? - a : a;
1304     shiftCount = countLeadingZeros64( absA );
1305     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1306
1307 }
1308
1309 /*----------------------------------------------------------------------------
1310 | Returns the result of converting the 64-bit two's complement integer `a' to
1311 | the quadruple-precision floating-point format.  The conversion is performed
1312 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313 *----------------------------------------------------------------------------*/
1314
1315 float128 int64_to_float128( int64 a STATUS_PARAM )
1316 {
1317     flag zSign;
1318     uint64 absA;
1319     int8 shiftCount;
1320     int32 zExp;
1321     uint64_t zSig0, zSig1;
1322
1323     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324     zSign = ( a < 0 );
1325     absA = zSign ? - a : a;
1326     shiftCount = countLeadingZeros64( absA ) + 49;
1327     zExp = 0x406E - shiftCount;
1328     if ( 64 <= shiftCount ) {
1329         zSig1 = 0;
1330         zSig0 = absA;
1331         shiftCount -= 64;
1332     }
1333     else {
1334         zSig1 = absA;
1335         zSig0 = 0;
1336     }
1337     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338     return packFloat128( zSign, zExp, zSig0, zSig1 );
1339
1340 }
1341
1342 float128 uint64_to_float128(uint64 a STATUS_PARAM)
1343 {
1344     if (a == 0) {
1345         return float128_zero;
1346     }
1347     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1348 }
1349
1350 /*----------------------------------------------------------------------------
1351 | Returns the result of converting the single-precision floating-point value
1352 | `a' to the 32-bit two's complement integer format.  The conversion is
1353 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1354 | Arithmetic---which means in particular that the conversion is rounded
1355 | according to the current rounding mode.  If `a' is a NaN, the largest
1356 | positive integer is returned.  Otherwise, if the conversion overflows, the
1357 | largest integer with the same sign as `a' is returned.
1358 *----------------------------------------------------------------------------*/
1359
1360 int32 float32_to_int32( float32 a STATUS_PARAM )
1361 {
1362     flag aSign;
1363     int_fast16_t aExp, shiftCount;
1364     uint32_t aSig;
1365     uint64_t aSig64;
1366
1367     a = float32_squash_input_denormal(a STATUS_VAR);
1368     aSig = extractFloat32Frac( a );
1369     aExp = extractFloat32Exp( a );
1370     aSign = extractFloat32Sign( a );
1371     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1372     if ( aExp ) aSig |= 0x00800000;
1373     shiftCount = 0xAF - aExp;
1374     aSig64 = aSig;
1375     aSig64 <<= 32;
1376     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1377     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1378
1379 }
1380
1381 /*----------------------------------------------------------------------------
1382 | Returns the result of converting the single-precision floating-point value
1383 | `a' to the 32-bit two's complement integer format.  The conversion is
1384 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1385 | Arithmetic, except that the conversion is always rounded toward zero.
1386 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1387 | the conversion overflows, the largest integer with the same sign as `a' is
1388 | returned.
1389 *----------------------------------------------------------------------------*/
1390
1391 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1392 {
1393     flag aSign;
1394     int_fast16_t aExp, shiftCount;
1395     uint32_t aSig;
1396     int32_t z;
1397     a = float32_squash_input_denormal(a STATUS_VAR);
1398
1399     aSig = extractFloat32Frac( a );
1400     aExp = extractFloat32Exp( a );
1401     aSign = extractFloat32Sign( a );
1402     shiftCount = aExp - 0x9E;
1403     if ( 0 <= shiftCount ) {
1404         if ( float32_val(a) != 0xCF000000 ) {
1405             float_raise( float_flag_invalid STATUS_VAR);
1406             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1407         }
1408         return (int32_t) 0x80000000;
1409     }
1410     else if ( aExp <= 0x7E ) {
1411         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1412         return 0;
1413     }
1414     aSig = ( aSig | 0x00800000 )<<8;
1415     z = aSig>>( - shiftCount );
1416     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1417         STATUS(float_exception_flags) |= float_flag_inexact;
1418     }
1419     if ( aSign ) z = - z;
1420     return z;
1421
1422 }
1423
1424 /*----------------------------------------------------------------------------
1425 | Returns the result of converting the single-precision floating-point value
1426 | `a' to the 16-bit two's complement integer format.  The conversion is
1427 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1428 | Arithmetic, except that the conversion is always rounded toward zero.
1429 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1430 | the conversion overflows, the largest integer with the same sign as `a' is
1431 | returned.
1432 *----------------------------------------------------------------------------*/
1433
1434 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1435 {
1436     flag aSign;
1437     int_fast16_t aExp, shiftCount;
1438     uint32_t aSig;
1439     int32 z;
1440
1441     aSig = extractFloat32Frac( a );
1442     aExp = extractFloat32Exp( a );
1443     aSign = extractFloat32Sign( a );
1444     shiftCount = aExp - 0x8E;
1445     if ( 0 <= shiftCount ) {
1446         if ( float32_val(a) != 0xC7000000 ) {
1447             float_raise( float_flag_invalid STATUS_VAR);
1448             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1449                 return 0x7FFF;
1450             }
1451         }
1452         return (int32_t) 0xffff8000;
1453     }
1454     else if ( aExp <= 0x7E ) {
1455         if ( aExp | aSig ) {
1456             STATUS(float_exception_flags) |= float_flag_inexact;
1457         }
1458         return 0;
1459     }
1460     shiftCount -= 0x10;
1461     aSig = ( aSig | 0x00800000 )<<8;
1462     z = aSig>>( - shiftCount );
1463     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1464         STATUS(float_exception_flags) |= float_flag_inexact;
1465     }
1466     if ( aSign ) {
1467         z = - z;
1468     }
1469     return z;
1470
1471 }
1472
1473 /*----------------------------------------------------------------------------
1474 | Returns the result of converting the single-precision floating-point value
1475 | `a' to the 64-bit two's complement integer format.  The conversion is
1476 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1477 | Arithmetic---which means in particular that the conversion is rounded
1478 | according to the current rounding mode.  If `a' is a NaN, the largest
1479 | positive integer is returned.  Otherwise, if the conversion overflows, the
1480 | largest integer with the same sign as `a' is returned.
1481 *----------------------------------------------------------------------------*/
1482
1483 int64 float32_to_int64( float32 a STATUS_PARAM )
1484 {
1485     flag aSign;
1486     int_fast16_t aExp, shiftCount;
1487     uint32_t aSig;
1488     uint64_t aSig64, aSigExtra;
1489     a = float32_squash_input_denormal(a STATUS_VAR);
1490
1491     aSig = extractFloat32Frac( a );
1492     aExp = extractFloat32Exp( a );
1493     aSign = extractFloat32Sign( a );
1494     shiftCount = 0xBE - aExp;
1495     if ( shiftCount < 0 ) {
1496         float_raise( float_flag_invalid STATUS_VAR);
1497         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1498             return LIT64( 0x7FFFFFFFFFFFFFFF );
1499         }
1500         return (int64_t) LIT64( 0x8000000000000000 );
1501     }
1502     if ( aExp ) aSig |= 0x00800000;
1503     aSig64 = aSig;
1504     aSig64 <<= 40;
1505     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1506     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1507
1508 }
1509
1510 /*----------------------------------------------------------------------------
1511 | Returns the result of converting the single-precision floating-point value
1512 | `a' to the 64-bit two's complement integer format.  The conversion is
1513 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1514 | Arithmetic, except that the conversion is always rounded toward zero.  If
1515 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1516 | conversion overflows, the largest integer with the same sign as `a' is
1517 | returned.
1518 *----------------------------------------------------------------------------*/
1519
1520 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1521 {
1522     flag aSign;
1523     int_fast16_t aExp, shiftCount;
1524     uint32_t aSig;
1525     uint64_t aSig64;
1526     int64 z;
1527     a = float32_squash_input_denormal(a STATUS_VAR);
1528
1529     aSig = extractFloat32Frac( a );
1530     aExp = extractFloat32Exp( a );
1531     aSign = extractFloat32Sign( a );
1532     shiftCount = aExp - 0xBE;
1533     if ( 0 <= shiftCount ) {
1534         if ( float32_val(a) != 0xDF000000 ) {
1535             float_raise( float_flag_invalid STATUS_VAR);
1536             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1537                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1538             }
1539         }
1540         return (int64_t) LIT64( 0x8000000000000000 );
1541     }
1542     else if ( aExp <= 0x7E ) {
1543         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1544         return 0;
1545     }
1546     aSig64 = aSig | 0x00800000;
1547     aSig64 <<= 40;
1548     z = aSig64>>( - shiftCount );
1549     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1550         STATUS(float_exception_flags) |= float_flag_inexact;
1551     }
1552     if ( aSign ) z = - z;
1553     return z;
1554
1555 }
1556
1557 /*----------------------------------------------------------------------------
1558 | Returns the result of converting the single-precision floating-point value
1559 | `a' to the double-precision floating-point format.  The conversion is
1560 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1561 | Arithmetic.
1562 *----------------------------------------------------------------------------*/
1563
1564 float64 float32_to_float64( float32 a STATUS_PARAM )
1565 {
1566     flag aSign;
1567     int_fast16_t aExp;
1568     uint32_t aSig;
1569     a = float32_squash_input_denormal(a STATUS_VAR);
1570
1571     aSig = extractFloat32Frac( a );
1572     aExp = extractFloat32Exp( a );
1573     aSign = extractFloat32Sign( a );
1574     if ( aExp == 0xFF ) {
1575         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1576         return packFloat64( aSign, 0x7FF, 0 );
1577     }
1578     if ( aExp == 0 ) {
1579         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1580         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1581         --aExp;
1582     }
1583     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1584
1585 }
1586
1587 /*----------------------------------------------------------------------------
1588 | Returns the result of converting the single-precision floating-point value
1589 | `a' to the extended double-precision floating-point format.  The conversion
1590 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1591 | Arithmetic.
1592 *----------------------------------------------------------------------------*/
1593
1594 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1595 {
1596     flag aSign;
1597     int_fast16_t aExp;
1598     uint32_t aSig;
1599
1600     a = float32_squash_input_denormal(a STATUS_VAR);
1601     aSig = extractFloat32Frac( a );
1602     aExp = extractFloat32Exp( a );
1603     aSign = extractFloat32Sign( a );
1604     if ( aExp == 0xFF ) {
1605         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1606         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1607     }
1608     if ( aExp == 0 ) {
1609         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1610         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1611     }
1612     aSig |= 0x00800000;
1613     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1614
1615 }
1616
1617 /*----------------------------------------------------------------------------
1618 | Returns the result of converting the single-precision floating-point value
1619 | `a' to the double-precision floating-point format.  The conversion is
1620 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1621 | Arithmetic.
1622 *----------------------------------------------------------------------------*/
1623
1624 float128 float32_to_float128( float32 a STATUS_PARAM )
1625 {
1626     flag aSign;
1627     int_fast16_t aExp;
1628     uint32_t aSig;
1629
1630     a = float32_squash_input_denormal(a STATUS_VAR);
1631     aSig = extractFloat32Frac( a );
1632     aExp = extractFloat32Exp( a );
1633     aSign = extractFloat32Sign( a );
1634     if ( aExp == 0xFF ) {
1635         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1636         return packFloat128( aSign, 0x7FFF, 0, 0 );
1637     }
1638     if ( aExp == 0 ) {
1639         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641         --aExp;
1642     }
1643     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1644
1645 }
1646
1647 /*----------------------------------------------------------------------------
1648 | Rounds the single-precision floating-point value `a' to an integer, and
1649 | returns the result as a single-precision floating-point value.  The
1650 | operation is performed according to the IEC/IEEE Standard for Binary
1651 | Floating-Point Arithmetic.
1652 *----------------------------------------------------------------------------*/
1653
1654 float32 float32_round_to_int( float32 a STATUS_PARAM)
1655 {
1656     flag aSign;
1657     int_fast16_t aExp;
1658     uint32_t lastBitMask, roundBitsMask;
1659     int8 roundingMode;
1660     uint32_t z;
1661     a = float32_squash_input_denormal(a STATUS_VAR);
1662
1663     aExp = extractFloat32Exp( a );
1664     if ( 0x96 <= aExp ) {
1665         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1666             return propagateFloat32NaN( a, a STATUS_VAR );
1667         }
1668         return a;
1669     }
1670     if ( aExp <= 0x7E ) {
1671         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1672         STATUS(float_exception_flags) |= float_flag_inexact;
1673         aSign = extractFloat32Sign( a );
1674         switch ( STATUS(float_rounding_mode) ) {
1675          case float_round_nearest_even:
1676             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1677                 return packFloat32( aSign, 0x7F, 0 );
1678             }
1679             break;
1680          case float_round_down:
1681             return make_float32(aSign ? 0xBF800000 : 0);
1682          case float_round_up:
1683             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1684         }
1685         return packFloat32( aSign, 0, 0 );
1686     }
1687     lastBitMask = 1;
1688     lastBitMask <<= 0x96 - aExp;
1689     roundBitsMask = lastBitMask - 1;
1690     z = float32_val(a);
1691     roundingMode = STATUS(float_rounding_mode);
1692     if ( roundingMode == float_round_nearest_even ) {
1693         z += lastBitMask>>1;
1694         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1695     }
1696     else if ( roundingMode != float_round_to_zero ) {
1697         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1698             z += roundBitsMask;
1699         }
1700     }
1701     z &= ~ roundBitsMask;
1702     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1703     return make_float32(z);
1704
1705 }
1706
1707 /*----------------------------------------------------------------------------
1708 | Returns the result of adding the absolute values of the single-precision
1709 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1710 | before being returned.  `zSign' is ignored if the result is a NaN.
1711 | The addition is performed according to the IEC/IEEE Standard for Binary
1712 | Floating-Point Arithmetic.
1713 *----------------------------------------------------------------------------*/
1714
1715 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1716 {
1717     int_fast16_t aExp, bExp, zExp;
1718     uint32_t aSig, bSig, zSig;
1719     int_fast16_t expDiff;
1720
1721     aSig = extractFloat32Frac( a );
1722     aExp = extractFloat32Exp( a );
1723     bSig = extractFloat32Frac( b );
1724     bExp = extractFloat32Exp( b );
1725     expDiff = aExp - bExp;
1726     aSig <<= 6;
1727     bSig <<= 6;
1728     if ( 0 < expDiff ) {
1729         if ( aExp == 0xFF ) {
1730             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1731             return a;
1732         }
1733         if ( bExp == 0 ) {
1734             --expDiff;
1735         }
1736         else {
1737             bSig |= 0x20000000;
1738         }
1739         shift32RightJamming( bSig, expDiff, &bSig );
1740         zExp = aExp;
1741     }
1742     else if ( expDiff < 0 ) {
1743         if ( bExp == 0xFF ) {
1744             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1745             return packFloat32( zSign, 0xFF, 0 );
1746         }
1747         if ( aExp == 0 ) {
1748             ++expDiff;
1749         }
1750         else {
1751             aSig |= 0x20000000;
1752         }
1753         shift32RightJamming( aSig, - expDiff, &aSig );
1754         zExp = bExp;
1755     }
1756     else {
1757         if ( aExp == 0xFF ) {
1758             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1759             return a;
1760         }
1761         if ( aExp == 0 ) {
1762             if (STATUS(flush_to_zero)) {
1763                 if (aSig | bSig) {
1764                     float_raise(float_flag_output_denormal STATUS_VAR);
1765                 }
1766                 return packFloat32(zSign, 0, 0);
1767             }
1768             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1769         }
1770         zSig = 0x40000000 + aSig + bSig;
1771         zExp = aExp;
1772         goto roundAndPack;
1773     }
1774     aSig |= 0x20000000;
1775     zSig = ( aSig + bSig )<<1;
1776     --zExp;
1777     if ( (int32_t) zSig < 0 ) {
1778         zSig = aSig + bSig;
1779         ++zExp;
1780     }
1781  roundAndPack:
1782     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1783
1784 }
1785
1786 /*----------------------------------------------------------------------------
1787 | Returns the result of subtracting the absolute values of the single-
1788 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1789 | difference is negated before being returned.  `zSign' is ignored if the
1790 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1791 | Standard for Binary Floating-Point Arithmetic.
1792 *----------------------------------------------------------------------------*/
1793
1794 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1795 {
1796     int_fast16_t aExp, bExp, zExp;
1797     uint32_t aSig, bSig, zSig;
1798     int_fast16_t expDiff;
1799
1800     aSig = extractFloat32Frac( a );
1801     aExp = extractFloat32Exp( a );
1802     bSig = extractFloat32Frac( b );
1803     bExp = extractFloat32Exp( b );
1804     expDiff = aExp - bExp;
1805     aSig <<= 7;
1806     bSig <<= 7;
1807     if ( 0 < expDiff ) goto aExpBigger;
1808     if ( expDiff < 0 ) goto bExpBigger;
1809     if ( aExp == 0xFF ) {
1810         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1811         float_raise( float_flag_invalid STATUS_VAR);
1812         return float32_default_nan;
1813     }
1814     if ( aExp == 0 ) {
1815         aExp = 1;
1816         bExp = 1;
1817     }
1818     if ( bSig < aSig ) goto aBigger;
1819     if ( aSig < bSig ) goto bBigger;
1820     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1821  bExpBigger:
1822     if ( bExp == 0xFF ) {
1823         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1824         return packFloat32( zSign ^ 1, 0xFF, 0 );
1825     }
1826     if ( aExp == 0 ) {
1827         ++expDiff;
1828     }
1829     else {
1830         aSig |= 0x40000000;
1831     }
1832     shift32RightJamming( aSig, - expDiff, &aSig );
1833     bSig |= 0x40000000;
1834  bBigger:
1835     zSig = bSig - aSig;
1836     zExp = bExp;
1837     zSign ^= 1;
1838     goto normalizeRoundAndPack;
1839  aExpBigger:
1840     if ( aExp == 0xFF ) {
1841         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1842         return a;
1843     }
1844     if ( bExp == 0 ) {
1845         --expDiff;
1846     }
1847     else {
1848         bSig |= 0x40000000;
1849     }
1850     shift32RightJamming( bSig, expDiff, &bSig );
1851     aSig |= 0x40000000;
1852  aBigger:
1853     zSig = aSig - bSig;
1854     zExp = aExp;
1855  normalizeRoundAndPack:
1856     --zExp;
1857     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1858
1859 }
1860
1861 /*----------------------------------------------------------------------------
1862 | Returns the result of adding the single-precision floating-point values `a'
1863 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1864 | Binary Floating-Point Arithmetic.
1865 *----------------------------------------------------------------------------*/
1866
1867 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1868 {
1869     flag aSign, bSign;
1870     a = float32_squash_input_denormal(a STATUS_VAR);
1871     b = float32_squash_input_denormal(b STATUS_VAR);
1872
1873     aSign = extractFloat32Sign( a );
1874     bSign = extractFloat32Sign( b );
1875     if ( aSign == bSign ) {
1876         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1877     }
1878     else {
1879         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1880     }
1881
1882 }
1883
1884 /*----------------------------------------------------------------------------
1885 | Returns the result of subtracting the single-precision floating-point values
1886 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1887 | for Binary Floating-Point Arithmetic.
1888 *----------------------------------------------------------------------------*/
1889
1890 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1891 {
1892     flag aSign, bSign;
1893     a = float32_squash_input_denormal(a STATUS_VAR);
1894     b = float32_squash_input_denormal(b STATUS_VAR);
1895
1896     aSign = extractFloat32Sign( a );
1897     bSign = extractFloat32Sign( b );
1898     if ( aSign == bSign ) {
1899         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1900     }
1901     else {
1902         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1903     }
1904
1905 }
1906
1907 /*----------------------------------------------------------------------------
1908 | Returns the result of multiplying the single-precision floating-point values
1909 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1910 | for Binary Floating-Point Arithmetic.
1911 *----------------------------------------------------------------------------*/
1912
1913 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1914 {
1915     flag aSign, bSign, zSign;
1916     int_fast16_t aExp, bExp, zExp;
1917     uint32_t aSig, bSig;
1918     uint64_t zSig64;
1919     uint32_t zSig;
1920
1921     a = float32_squash_input_denormal(a STATUS_VAR);
1922     b = float32_squash_input_denormal(b STATUS_VAR);
1923
1924     aSig = extractFloat32Frac( a );
1925     aExp = extractFloat32Exp( a );
1926     aSign = extractFloat32Sign( a );
1927     bSig = extractFloat32Frac( b );
1928     bExp = extractFloat32Exp( b );
1929     bSign = extractFloat32Sign( b );
1930     zSign = aSign ^ bSign;
1931     if ( aExp == 0xFF ) {
1932         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1933             return propagateFloat32NaN( a, b STATUS_VAR );
1934         }
1935         if ( ( bExp | bSig ) == 0 ) {
1936             float_raise( float_flag_invalid STATUS_VAR);
1937             return float32_default_nan;
1938         }
1939         return packFloat32( zSign, 0xFF, 0 );
1940     }
1941     if ( bExp == 0xFF ) {
1942         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1943         if ( ( aExp | aSig ) == 0 ) {
1944             float_raise( float_flag_invalid STATUS_VAR);
1945             return float32_default_nan;
1946         }
1947         return packFloat32( zSign, 0xFF, 0 );
1948     }
1949     if ( aExp == 0 ) {
1950         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1951         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1952     }
1953     if ( bExp == 0 ) {
1954         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1955         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1956     }
1957     zExp = aExp + bExp - 0x7F;
1958     aSig = ( aSig | 0x00800000 )<<7;
1959     bSig = ( bSig | 0x00800000 )<<8;
1960     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1961     zSig = zSig64;
1962     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1963         zSig <<= 1;
1964         --zExp;
1965     }
1966     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1967
1968 }
1969
1970 /*----------------------------------------------------------------------------
1971 | Returns the result of dividing the single-precision floating-point value `a'
1972 | by the corresponding value `b'.  The operation is performed according to the
1973 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1974 *----------------------------------------------------------------------------*/
1975
1976 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1977 {
1978     flag aSign, bSign, zSign;
1979     int_fast16_t aExp, bExp, zExp;
1980     uint32_t aSig, bSig, zSig;
1981     a = float32_squash_input_denormal(a STATUS_VAR);
1982     b = float32_squash_input_denormal(b STATUS_VAR);
1983
1984     aSig = extractFloat32Frac( a );
1985     aExp = extractFloat32Exp( a );
1986     aSign = extractFloat32Sign( a );
1987     bSig = extractFloat32Frac( b );
1988     bExp = extractFloat32Exp( b );
1989     bSign = extractFloat32Sign( b );
1990     zSign = aSign ^ bSign;
1991     if ( aExp == 0xFF ) {
1992         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1993         if ( bExp == 0xFF ) {
1994             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1995             float_raise( float_flag_invalid STATUS_VAR);
1996             return float32_default_nan;
1997         }
1998         return packFloat32( zSign, 0xFF, 0 );
1999     }
2000     if ( bExp == 0xFF ) {
2001         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2002         return packFloat32( zSign, 0, 0 );
2003     }
2004     if ( bExp == 0 ) {
2005         if ( bSig == 0 ) {
2006             if ( ( aExp | aSig ) == 0 ) {
2007                 float_raise( float_flag_invalid STATUS_VAR);
2008                 return float32_default_nan;
2009             }
2010             float_raise( float_flag_divbyzero STATUS_VAR);
2011             return packFloat32( zSign, 0xFF, 0 );
2012         }
2013         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2014     }
2015     if ( aExp == 0 ) {
2016         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2017         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2018     }
2019     zExp = aExp - bExp + 0x7D;
2020     aSig = ( aSig | 0x00800000 )<<7;
2021     bSig = ( bSig | 0x00800000 )<<8;
2022     if ( bSig <= ( aSig + aSig ) ) {
2023         aSig >>= 1;
2024         ++zExp;
2025     }
2026     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2027     if ( ( zSig & 0x3F ) == 0 ) {
2028         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2029     }
2030     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2031
2032 }
2033
2034 /*----------------------------------------------------------------------------
2035 | Returns the remainder of the single-precision floating-point value `a'
2036 | with respect to the corresponding value `b'.  The operation is performed
2037 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2038 *----------------------------------------------------------------------------*/
2039
2040 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2041 {
2042     flag aSign, zSign;
2043     int_fast16_t aExp, bExp, expDiff;
2044     uint32_t aSig, bSig;
2045     uint32_t q;
2046     uint64_t aSig64, bSig64, q64;
2047     uint32_t alternateASig;
2048     int32_t sigMean;
2049     a = float32_squash_input_denormal(a STATUS_VAR);
2050     b = float32_squash_input_denormal(b STATUS_VAR);
2051
2052     aSig = extractFloat32Frac( a );
2053     aExp = extractFloat32Exp( a );
2054     aSign = extractFloat32Sign( a );
2055     bSig = extractFloat32Frac( b );
2056     bExp = extractFloat32Exp( b );
2057     if ( aExp == 0xFF ) {
2058         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2059             return propagateFloat32NaN( a, b STATUS_VAR );
2060         }
2061         float_raise( float_flag_invalid STATUS_VAR);
2062         return float32_default_nan;
2063     }
2064     if ( bExp == 0xFF ) {
2065         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2066         return a;
2067     }
2068     if ( bExp == 0 ) {
2069         if ( bSig == 0 ) {
2070             float_raise( float_flag_invalid STATUS_VAR);
2071             return float32_default_nan;
2072         }
2073         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2074     }
2075     if ( aExp == 0 ) {
2076         if ( aSig == 0 ) return a;
2077         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2078     }
2079     expDiff = aExp - bExp;
2080     aSig |= 0x00800000;
2081     bSig |= 0x00800000;
2082     if ( expDiff < 32 ) {
2083         aSig <<= 8;
2084         bSig <<= 8;
2085         if ( expDiff < 0 ) {
2086             if ( expDiff < -1 ) return a;
2087             aSig >>= 1;
2088         }
2089         q = ( bSig <= aSig );
2090         if ( q ) aSig -= bSig;
2091         if ( 0 < expDiff ) {
2092             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2093             q >>= 32 - expDiff;
2094             bSig >>= 2;
2095             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2096         }
2097         else {
2098             aSig >>= 2;
2099             bSig >>= 2;
2100         }
2101     }
2102     else {
2103         if ( bSig <= aSig ) aSig -= bSig;
2104         aSig64 = ( (uint64_t) aSig )<<40;
2105         bSig64 = ( (uint64_t) bSig )<<40;
2106         expDiff -= 64;
2107         while ( 0 < expDiff ) {
2108             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2109             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2110             aSig64 = - ( ( bSig * q64 )<<38 );
2111             expDiff -= 62;
2112         }
2113         expDiff += 64;
2114         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2115         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2116         q = q64>>( 64 - expDiff );
2117         bSig <<= 6;
2118         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2119     }
2120     do {
2121         alternateASig = aSig;
2122         ++q;
2123         aSig -= bSig;
2124     } while ( 0 <= (int32_t) aSig );
2125     sigMean = aSig + alternateASig;
2126     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2127         aSig = alternateASig;
2128     }
2129     zSign = ( (int32_t) aSig < 0 );
2130     if ( zSign ) aSig = - aSig;
2131     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2132
2133 }
2134
2135 /*----------------------------------------------------------------------------
2136 | Returns the result of multiplying the single-precision floating-point values
2137 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2138 | multiplication.  The operation is performed according to the IEC/IEEE
2139 | Standard for Binary Floating-Point Arithmetic 754-2008.
2140 | The flags argument allows the caller to select negation of the
2141 | addend, the intermediate product, or the final result. (The difference
2142 | between this and having the caller do a separate negation is that negating
2143 | externally will flip the sign bit on NaNs.)
2144 *----------------------------------------------------------------------------*/
2145
2146 float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2147 {
2148     flag aSign, bSign, cSign, zSign;
2149     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2150     uint32_t aSig, bSig, cSig;
2151     flag pInf, pZero, pSign;
2152     uint64_t pSig64, cSig64, zSig64;
2153     uint32_t pSig;
2154     int shiftcount;
2155     flag signflip, infzero;
2156
2157     a = float32_squash_input_denormal(a STATUS_VAR);
2158     b = float32_squash_input_denormal(b STATUS_VAR);
2159     c = float32_squash_input_denormal(c STATUS_VAR);
2160     aSig = extractFloat32Frac(a);
2161     aExp = extractFloat32Exp(a);
2162     aSign = extractFloat32Sign(a);
2163     bSig = extractFloat32Frac(b);
2164     bExp = extractFloat32Exp(b);
2165     bSign = extractFloat32Sign(b);
2166     cSig = extractFloat32Frac(c);
2167     cExp = extractFloat32Exp(c);
2168     cSign = extractFloat32Sign(c);
2169
2170     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2171                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2172
2173     /* It is implementation-defined whether the cases of (0,inf,qnan)
2174      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2175      * they return if they do), so we have to hand this information
2176      * off to the target-specific pick-a-NaN routine.
2177      */
2178     if (((aExp == 0xff) && aSig) ||
2179         ((bExp == 0xff) && bSig) ||
2180         ((cExp == 0xff) && cSig)) {
2181         return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2182     }
2183
2184     if (infzero) {
2185         float_raise(float_flag_invalid STATUS_VAR);
2186         return float32_default_nan;
2187     }
2188
2189     if (flags & float_muladd_negate_c) {
2190         cSign ^= 1;
2191     }
2192
2193     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2194
2195     /* Work out the sign and type of the product */
2196     pSign = aSign ^ bSign;
2197     if (flags & float_muladd_negate_product) {
2198         pSign ^= 1;
2199     }
2200     pInf = (aExp == 0xff) || (bExp == 0xff);
2201     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2202
2203     if (cExp == 0xff) {
2204         if (pInf && (pSign ^ cSign)) {
2205             /* addition of opposite-signed infinities => InvalidOperation */
2206             float_raise(float_flag_invalid STATUS_VAR);
2207             return float32_default_nan;
2208         }
2209         /* Otherwise generate an infinity of the same sign */
2210         return packFloat32(cSign ^ signflip, 0xff, 0);
2211     }
2212
2213     if (pInf) {
2214         return packFloat32(pSign ^ signflip, 0xff, 0);
2215     }
2216
2217     if (pZero) {
2218         if (cExp == 0) {
2219             if (cSig == 0) {
2220                 /* Adding two exact zeroes */
2221                 if (pSign == cSign) {
2222                     zSign = pSign;
2223                 } else if (STATUS(float_rounding_mode) == float_round_down) {
2224                     zSign = 1;
2225                 } else {
2226                     zSign = 0;
2227                 }
2228                 return packFloat32(zSign ^ signflip, 0, 0);
2229             }
2230             /* Exact zero plus a denorm */
2231             if (STATUS(flush_to_zero)) {
2232                 float_raise(float_flag_output_denormal STATUS_VAR);
2233                 return packFloat32(cSign ^ signflip, 0, 0);
2234             }
2235         }
2236         /* Zero plus something non-zero : just return the something */
2237         return packFloat32(cSign ^ signflip, cExp, cSig);
2238     }
2239
2240     if (aExp == 0) {
2241         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2242     }
2243     if (bExp == 0) {
2244         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2245     }
2246
2247     /* Calculate the actual result a * b + c */
2248
2249     /* Multiply first; this is easy. */
2250     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2251      * because we want the true exponent, not the "one-less-than"
2252      * flavour that roundAndPackFloat32() takes.
2253      */
2254     pExp = aExp + bExp - 0x7e;
2255     aSig = (aSig | 0x00800000) << 7;
2256     bSig = (bSig | 0x00800000) << 8;
2257     pSig64 = (uint64_t)aSig * bSig;
2258     if ((int64_t)(pSig64 << 1) >= 0) {
2259         pSig64 <<= 1;
2260         pExp--;
2261     }
2262
2263     zSign = pSign ^ signflip;
2264
2265     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2266      * position 62.
2267      */
2268     if (cExp == 0) {
2269         if (!cSig) {
2270             /* Throw out the special case of c being an exact zero now */
2271             shift64RightJamming(pSig64, 32, &pSig64);
2272             pSig = pSig64;
2273             return roundAndPackFloat32(zSign, pExp - 1,
2274                                        pSig STATUS_VAR);
2275         }
2276         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2277     }
2278
2279     cSig64 = (uint64_t)cSig << (62 - 23);
2280     cSig64 |= LIT64(0x4000000000000000);
2281     expDiff = pExp - cExp;
2282
2283     if (pSign == cSign) {
2284         /* Addition */
2285         if (expDiff > 0) {
2286             /* scale c to match p */
2287             shift64RightJamming(cSig64, expDiff, &cSig64);
2288             zExp = pExp;
2289         } else if (expDiff < 0) {
2290             /* scale p to match c */
2291             shift64RightJamming(pSig64, -expDiff, &pSig64);
2292             zExp = cExp;
2293         } else {
2294             /* no scaling needed */
2295             zExp = cExp;
2296         }
2297         /* Add significands and make sure explicit bit ends up in posn 62 */
2298         zSig64 = pSig64 + cSig64;
2299         if ((int64_t)zSig64 < 0) {
2300             shift64RightJamming(zSig64, 1, &zSig64);
2301         } else {
2302             zExp--;
2303         }
2304     } else {
2305         /* Subtraction */
2306         if (expDiff > 0) {
2307             shift64RightJamming(cSig64, expDiff, &cSig64);
2308             zSig64 = pSig64 - cSig64;
2309             zExp = pExp;
2310         } else if (expDiff < 0) {
2311             shift64RightJamming(pSig64, -expDiff, &pSig64);
2312             zSig64 = cSig64 - pSig64;
2313             zExp = cExp;
2314             zSign ^= 1;
2315         } else {
2316             zExp = pExp;
2317             if (cSig64 < pSig64) {
2318                 zSig64 = pSig64 - cSig64;
2319             } else if (pSig64 < cSig64) {
2320                 zSig64 = cSig64 - pSig64;
2321                 zSign ^= 1;
2322             } else {
2323                 /* Exact zero */
2324                 zSign = signflip;
2325                 if (STATUS(float_rounding_mode) == float_round_down) {
2326                     zSign ^= 1;
2327                 }
2328                 return packFloat32(zSign, 0, 0);
2329             }
2330         }
2331         --zExp;
2332         /* Normalize to put the explicit bit back into bit 62. */
2333         shiftcount = countLeadingZeros64(zSig64) - 1;
2334         zSig64 <<= shiftcount;
2335         zExp -= shiftcount;
2336     }
2337     shift64RightJamming(zSig64, 32, &zSig64);
2338     return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2339 }
2340
2341
2342 /*----------------------------------------------------------------------------
2343 | Returns the square root of the single-precision floating-point value `a'.
2344 | The operation is performed according to the IEC/IEEE Standard for Binary
2345 | Floating-Point Arithmetic.
2346 *----------------------------------------------------------------------------*/
2347
2348 float32 float32_sqrt( float32 a STATUS_PARAM )
2349 {
2350     flag aSign;
2351     int_fast16_t aExp, zExp;
2352     uint32_t aSig, zSig;
2353     uint64_t rem, term;
2354     a = float32_squash_input_denormal(a STATUS_VAR);
2355
2356     aSig = extractFloat32Frac( a );
2357     aExp = extractFloat32Exp( a );
2358     aSign = extractFloat32Sign( a );
2359     if ( aExp == 0xFF ) {
2360         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2361         if ( ! aSign ) return a;
2362         float_raise( float_flag_invalid STATUS_VAR);
2363         return float32_default_nan;
2364     }
2365     if ( aSign ) {
2366         if ( ( aExp | aSig ) == 0 ) return a;
2367         float_raise( float_flag_invalid STATUS_VAR);
2368         return float32_default_nan;
2369     }
2370     if ( aExp == 0 ) {
2371         if ( aSig == 0 ) return float32_zero;
2372         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2373     }
2374     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2375     aSig = ( aSig | 0x00800000 )<<8;
2376     zSig = estimateSqrt32( aExp, aSig ) + 2;
2377     if ( ( zSig & 0x7F ) <= 5 ) {
2378         if ( zSig < 2 ) {
2379             zSig = 0x7FFFFFFF;
2380             goto roundAndPack;
2381         }
2382         aSig >>= aExp & 1;
2383         term = ( (uint64_t) zSig ) * zSig;
2384         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2385         while ( (int64_t) rem < 0 ) {
2386             --zSig;
2387             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2388         }
2389         zSig |= ( rem != 0 );
2390     }
2391     shift32RightJamming( zSig, 1, &zSig );
2392  roundAndPack:
2393     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2394
2395 }
2396
2397 /*----------------------------------------------------------------------------
2398 | Returns the binary exponential of the single-precision floating-point value
2399 | `a'. The operation is performed according to the IEC/IEEE Standard for
2400 | Binary Floating-Point Arithmetic.
2401 |
2402 | Uses the following identities:
2403 |
2404 | 1. -------------------------------------------------------------------------
2405 |      x    x*ln(2)
2406 |     2  = e
2407 |
2408 | 2. -------------------------------------------------------------------------
2409 |                      2     3     4     5           n
2410 |      x        x     x     x     x     x           x
2411 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2412 |               1!    2!    3!    4!    5!          n!
2413 *----------------------------------------------------------------------------*/
2414
2415 static const float64 float32_exp2_coefficients[15] =
2416 {
2417     const_float64( 0x3ff0000000000000ll ), /*  1 */
2418     const_float64( 0x3fe0000000000000ll ), /*  2 */
2419     const_float64( 0x3fc5555555555555ll ), /*  3 */
2420     const_float64( 0x3fa5555555555555ll ), /*  4 */
2421     const_float64( 0x3f81111111111111ll ), /*  5 */
2422     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2423     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2424     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2425     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2426     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2427     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2428     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2429     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2430     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2431     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2432 };
2433
2434 float32 float32_exp2( float32 a STATUS_PARAM )
2435 {
2436     flag aSign;
2437     int_fast16_t aExp;
2438     uint32_t aSig;
2439     float64 r, x, xn;
2440     int i;
2441     a = float32_squash_input_denormal(a STATUS_VAR);
2442
2443     aSig = extractFloat32Frac( a );
2444     aExp = extractFloat32Exp( a );
2445     aSign = extractFloat32Sign( a );
2446
2447     if ( aExp == 0xFF) {
2448         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2449         return (aSign) ? float32_zero : a;
2450     }
2451     if (aExp == 0) {
2452         if (aSig == 0) return float32_one;
2453     }
2454
2455     float_raise( float_flag_inexact STATUS_VAR);
2456
2457     /* ******************************* */
2458     /* using float64 for approximation */
2459     /* ******************************* */
2460     x = float32_to_float64(a STATUS_VAR);
2461     x = float64_mul(x, float64_ln2 STATUS_VAR);
2462
2463     xn = x;
2464     r = float64_one;
2465     for (i = 0 ; i < 15 ; i++) {
2466         float64 f;
2467
2468         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2469         r = float64_add(r, f STATUS_VAR);
2470
2471         xn = float64_mul(xn, x STATUS_VAR);
2472     }
2473
2474     return float64_to_float32(r, status);
2475 }
2476
2477 /*----------------------------------------------------------------------------
2478 | Returns the binary log of the single-precision floating-point value `a'.
2479 | The operation is performed according to the IEC/IEEE Standard for Binary
2480 | Floating-Point Arithmetic.
2481 *----------------------------------------------------------------------------*/
2482 float32 float32_log2( float32 a STATUS_PARAM )
2483 {
2484     flag aSign, zSign;
2485     int_fast16_t aExp;
2486     uint32_t aSig, zSig, i;
2487
2488     a = float32_squash_input_denormal(a STATUS_VAR);
2489     aSig = extractFloat32Frac( a );
2490     aExp = extractFloat32Exp( a );
2491     aSign = extractFloat32Sign( a );
2492
2493     if ( aExp == 0 ) {
2494         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2495         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2496     }
2497     if ( aSign ) {
2498         float_raise( float_flag_invalid STATUS_VAR);
2499         return float32_default_nan;
2500     }
2501     if ( aExp == 0xFF ) {
2502         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2503         return a;
2504     }
2505
2506     aExp -= 0x7F;
2507     aSig |= 0x00800000;
2508     zSign = aExp < 0;
2509     zSig = aExp << 23;
2510
2511     for (i = 1 << 22; i > 0; i >>= 1) {
2512         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2513         if ( aSig & 0x01000000 ) {
2514             aSig >>= 1;
2515             zSig |= i;
2516         }
2517     }
2518
2519     if ( zSign )
2520         zSig = -zSig;
2521
2522     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2523 }
2524
2525 /*----------------------------------------------------------------------------
2526 | Returns 1 if the single-precision floating-point value `a' is equal to
2527 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2528 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2529 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2530 *----------------------------------------------------------------------------*/
2531
2532 int float32_eq( float32 a, float32 b STATUS_PARAM )
2533 {
2534     uint32_t av, bv;
2535     a = float32_squash_input_denormal(a STATUS_VAR);
2536     b = float32_squash_input_denormal(b STATUS_VAR);
2537
2538     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2539          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2540        ) {
2541         float_raise( float_flag_invalid STATUS_VAR);
2542         return 0;
2543     }
2544     av = float32_val(a);
2545     bv = float32_val(b);
2546     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2547 }
2548
2549 /*----------------------------------------------------------------------------
2550 | Returns 1 if the single-precision floating-point value `a' is less than
2551 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2552 | exception is raised if either operand is a NaN.  The comparison is performed
2553 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2554 *----------------------------------------------------------------------------*/
2555
2556 int float32_le( float32 a, float32 b STATUS_PARAM )
2557 {
2558     flag aSign, bSign;
2559     uint32_t av, bv;
2560     a = float32_squash_input_denormal(a STATUS_VAR);
2561     b = float32_squash_input_denormal(b STATUS_VAR);
2562
2563     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2564          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2565        ) {
2566         float_raise( float_flag_invalid STATUS_VAR);
2567         return 0;
2568     }
2569     aSign = extractFloat32Sign( a );
2570     bSign = extractFloat32Sign( b );
2571     av = float32_val(a);
2572     bv = float32_val(b);
2573     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2574     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2575
2576 }
2577
2578 /*----------------------------------------------------------------------------
2579 | Returns 1 if the single-precision floating-point value `a' is less than
2580 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2581 | raised if either operand is a NaN.  The comparison is performed according
2582 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2583 *----------------------------------------------------------------------------*/
2584
2585 int float32_lt( float32 a, float32 b STATUS_PARAM )
2586 {
2587     flag aSign, bSign;
2588     uint32_t av, bv;
2589     a = float32_squash_input_denormal(a STATUS_VAR);
2590     b = float32_squash_input_denormal(b STATUS_VAR);
2591
2592     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2593          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2594        ) {
2595         float_raise( float_flag_invalid STATUS_VAR);
2596         return 0;
2597     }
2598     aSign = extractFloat32Sign( a );
2599     bSign = extractFloat32Sign( b );
2600     av = float32_val(a);
2601     bv = float32_val(b);
2602     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2603     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2604
2605 }
2606
2607 /*----------------------------------------------------------------------------
2608 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2609 | be compared, and 0 otherwise.  The invalid exception is raised if either
2610 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2611 | Standard for Binary Floating-Point Arithmetic.
2612 *----------------------------------------------------------------------------*/
2613
2614 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2615 {
2616     a = float32_squash_input_denormal(a STATUS_VAR);
2617     b = float32_squash_input_denormal(b STATUS_VAR);
2618
2619     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2620          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2621        ) {
2622         float_raise( float_flag_invalid STATUS_VAR);
2623         return 1;
2624     }
2625     return 0;
2626 }
2627
2628 /*----------------------------------------------------------------------------
2629 | Returns 1 if the single-precision floating-point value `a' is equal to
2630 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2631 | exception.  The comparison is performed according to the IEC/IEEE Standard
2632 | for Binary Floating-Point Arithmetic.
2633 *----------------------------------------------------------------------------*/
2634
2635 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2636 {
2637     a = float32_squash_input_denormal(a STATUS_VAR);
2638     b = float32_squash_input_denormal(b STATUS_VAR);
2639
2640     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2641          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2642        ) {
2643         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2644             float_raise( float_flag_invalid STATUS_VAR);
2645         }
2646         return 0;
2647     }
2648     return ( float32_val(a) == float32_val(b) ) ||
2649             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2650 }
2651
2652 /*----------------------------------------------------------------------------
2653 | Returns 1 if the single-precision floating-point value `a' is less than or
2654 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2655 | cause an exception.  Otherwise, the comparison is performed according to the
2656 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2657 *----------------------------------------------------------------------------*/
2658
2659 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2660 {
2661     flag aSign, bSign;
2662     uint32_t av, bv;
2663     a = float32_squash_input_denormal(a STATUS_VAR);
2664     b = float32_squash_input_denormal(b STATUS_VAR);
2665
2666     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2667          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2668        ) {
2669         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2670             float_raise( float_flag_invalid STATUS_VAR);
2671         }
2672         return 0;
2673     }
2674     aSign = extractFloat32Sign( a );
2675     bSign = extractFloat32Sign( b );
2676     av = float32_val(a);
2677     bv = float32_val(b);
2678     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2679     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2680
2681 }
2682
2683 /*----------------------------------------------------------------------------
2684 | Returns 1 if the single-precision floating-point value `a' is less than
2685 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2686 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2687 | Standard for Binary Floating-Point Arithmetic.
2688 *----------------------------------------------------------------------------*/
2689
2690 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2691 {
2692     flag aSign, bSign;
2693     uint32_t av, bv;
2694     a = float32_squash_input_denormal(a STATUS_VAR);
2695     b = float32_squash_input_denormal(b STATUS_VAR);
2696
2697     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2698          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2699        ) {
2700         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2701             float_raise( float_flag_invalid STATUS_VAR);
2702         }
2703         return 0;
2704     }
2705     aSign = extractFloat32Sign( a );
2706     bSign = extractFloat32Sign( b );
2707     av = float32_val(a);
2708     bv = float32_val(b);
2709     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2710     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2711
2712 }
2713
2714 /*----------------------------------------------------------------------------
2715 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2716 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2717 | comparison is performed according to the IEC/IEEE Standard for Binary
2718 | Floating-Point Arithmetic.
2719 *----------------------------------------------------------------------------*/
2720
2721 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2722 {
2723     a = float32_squash_input_denormal(a STATUS_VAR);
2724     b = float32_squash_input_denormal(b STATUS_VAR);
2725
2726     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2727          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2728        ) {
2729         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2730             float_raise( float_flag_invalid STATUS_VAR);
2731         }
2732         return 1;
2733     }
2734     return 0;
2735 }
2736
2737 /*----------------------------------------------------------------------------
2738 | Returns the result of converting the double-precision floating-point value
2739 | `a' to the 32-bit two's complement integer format.  The conversion is
2740 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2741 | Arithmetic---which means in particular that the conversion is rounded
2742 | according to the current rounding mode.  If `a' is a NaN, the largest
2743 | positive integer is returned.  Otherwise, if the conversion overflows, the
2744 | largest integer with the same sign as `a' is returned.
2745 *----------------------------------------------------------------------------*/
2746
2747 int32 float64_to_int32( float64 a STATUS_PARAM )
2748 {
2749     flag aSign;
2750     int_fast16_t aExp, shiftCount;
2751     uint64_t aSig;
2752     a = float64_squash_input_denormal(a STATUS_VAR);
2753
2754     aSig = extractFloat64Frac( a );
2755     aExp = extractFloat64Exp( a );
2756     aSign = extractFloat64Sign( a );
2757     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2758     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2759     shiftCount = 0x42C - aExp;
2760     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2761     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2762
2763 }
2764
2765 /*----------------------------------------------------------------------------
2766 | Returns the result of converting the double-precision floating-point value
2767 | `a' to the 32-bit two's complement integer format.  The conversion is
2768 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2769 | Arithmetic, except that the conversion is always rounded toward zero.
2770 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2771 | the conversion overflows, the largest integer with the same sign as `a' is
2772 | returned.
2773 *----------------------------------------------------------------------------*/
2774
2775 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2776 {
2777     flag aSign;
2778     int_fast16_t aExp, shiftCount;
2779     uint64_t aSig, savedASig;
2780     int32_t z;
2781     a = float64_squash_input_denormal(a STATUS_VAR);
2782
2783     aSig = extractFloat64Frac( a );
2784     aExp = extractFloat64Exp( a );
2785     aSign = extractFloat64Sign( a );
2786     if ( 0x41E < aExp ) {
2787         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2788         goto invalid;
2789     }
2790     else if ( aExp < 0x3FF ) {
2791         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2792         return 0;
2793     }
2794     aSig |= LIT64( 0x0010000000000000 );
2795     shiftCount = 0x433 - aExp;
2796     savedASig = aSig;
2797     aSig >>= shiftCount;
2798     z = aSig;
2799     if ( aSign ) z = - z;
2800     if ( ( z < 0 ) ^ aSign ) {
2801  invalid:
2802         float_raise( float_flag_invalid STATUS_VAR);
2803         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2804     }
2805     if ( ( aSig<<shiftCount ) != savedASig ) {
2806         STATUS(float_exception_flags) |= float_flag_inexact;
2807     }
2808     return z;
2809
2810 }
2811
2812 /*----------------------------------------------------------------------------
2813 | Returns the result of converting the double-precision floating-point value
2814 | `a' to the 16-bit two's complement integer format.  The conversion is
2815 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2816 | Arithmetic, except that the conversion is always rounded toward zero.
2817 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2818 | the conversion overflows, the largest integer with the same sign as `a' is
2819 | returned.
2820 *----------------------------------------------------------------------------*/
2821
2822 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2823 {
2824     flag aSign;
2825     int_fast16_t aExp, shiftCount;
2826     uint64_t aSig, savedASig;
2827     int32 z;
2828
2829     aSig = extractFloat64Frac( a );
2830     aExp = extractFloat64Exp( a );
2831     aSign = extractFloat64Sign( a );
2832     if ( 0x40E < aExp ) {
2833         if ( ( aExp == 0x7FF ) && aSig ) {
2834             aSign = 0;
2835         }
2836         goto invalid;
2837     }
2838     else if ( aExp < 0x3FF ) {
2839         if ( aExp || aSig ) {
2840             STATUS(float_exception_flags) |= float_flag_inexact;
2841         }
2842         return 0;
2843     }
2844     aSig |= LIT64( 0x0010000000000000 );
2845     shiftCount = 0x433 - aExp;
2846     savedASig = aSig;
2847     aSig >>= shiftCount;
2848     z = aSig;
2849     if ( aSign ) {
2850         z = - z;
2851     }
2852     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2853  invalid:
2854         float_raise( float_flag_invalid STATUS_VAR);
2855         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2856     }
2857     if ( ( aSig<<shiftCount ) != savedASig ) {
2858         STATUS(float_exception_flags) |= float_flag_inexact;
2859     }
2860     return z;
2861 }
2862
2863 /*----------------------------------------------------------------------------
2864 | Returns the result of converting the double-precision floating-point value
2865 | `a' to the 64-bit two's complement integer format.  The conversion is
2866 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2867 | Arithmetic---which means in particular that the conversion is rounded
2868 | according to the current rounding mode.  If `a' is a NaN, the largest
2869 | positive integer is returned.  Otherwise, if the conversion overflows, the
2870 | largest integer with the same sign as `a' is returned.
2871 *----------------------------------------------------------------------------*/
2872
2873 int64 float64_to_int64( float64 a STATUS_PARAM )
2874 {
2875     flag aSign;
2876     int_fast16_t aExp, shiftCount;
2877     uint64_t aSig, aSigExtra;
2878     a = float64_squash_input_denormal(a STATUS_VAR);
2879
2880     aSig = extractFloat64Frac( a );
2881     aExp = extractFloat64Exp( a );
2882     aSign = extractFloat64Sign( a );
2883     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2884     shiftCount = 0x433 - aExp;
2885     if ( shiftCount <= 0 ) {
2886         if ( 0x43E < aExp ) {
2887             float_raise( float_flag_invalid STATUS_VAR);
2888             if (    ! aSign
2889                  || (    ( aExp == 0x7FF )
2890                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2891                ) {
2892                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2893             }
2894             return (int64_t) LIT64( 0x8000000000000000 );
2895         }
2896         aSigExtra = 0;
2897         aSig <<= - shiftCount;
2898     }
2899     else {
2900         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2901     }
2902     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2903
2904 }
2905
2906 /*----------------------------------------------------------------------------
2907 | Returns the result of converting the double-precision floating-point value
2908 | `a' to the 64-bit two's complement integer format.  The conversion is
2909 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2910 | Arithmetic, except that the conversion is always rounded toward zero.
2911 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2912 | the conversion overflows, the largest integer with the same sign as `a' is
2913 | returned.
2914 *----------------------------------------------------------------------------*/
2915
2916 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2917 {
2918     flag aSign;
2919     int_fast16_t aExp, shiftCount;
2920     uint64_t aSig;
2921     int64 z;
2922     a = float64_squash_input_denormal(a STATUS_VAR);
2923
2924     aSig = extractFloat64Frac( a );
2925     aExp = extractFloat64Exp( a );
2926     aSign = extractFloat64Sign( a );
2927     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2928     shiftCount = aExp - 0x433;
2929     if ( 0 <= shiftCount ) {
2930         if ( 0x43E <= aExp ) {
2931             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2932                 float_raise( float_flag_invalid STATUS_VAR);
2933                 if (    ! aSign
2934                      || (    ( aExp == 0x7FF )
2935                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2936                    ) {
2937                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2938                 }
2939             }
2940             return (int64_t) LIT64( 0x8000000000000000 );
2941         }
2942         z = aSig<<shiftCount;
2943     }
2944     else {
2945         if ( aExp < 0x3FE ) {
2946             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2947             return 0;
2948         }
2949         z = aSig>>( - shiftCount );
2950         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2951             STATUS(float_exception_flags) |= float_flag_inexact;
2952         }
2953     }
2954     if ( aSign ) z = - z;
2955     return z;
2956
2957 }
2958
2959 /*----------------------------------------------------------------------------
2960 | Returns the result of converting the double-precision floating-point value
2961 | `a' to the single-precision floating-point format.  The conversion is
2962 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2963 | Arithmetic.
2964 *----------------------------------------------------------------------------*/
2965
2966 float32 float64_to_float32( float64 a STATUS_PARAM )
2967 {
2968     flag aSign;
2969     int_fast16_t aExp;
2970     uint64_t aSig;
2971     uint32_t zSig;
2972     a = float64_squash_input_denormal(a STATUS_VAR);
2973
2974     aSig = extractFloat64Frac( a );
2975     aExp = extractFloat64Exp( a );
2976     aSign = extractFloat64Sign( a );
2977     if ( aExp == 0x7FF ) {
2978         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2979         return packFloat32( aSign, 0xFF, 0 );
2980     }
2981     shift64RightJamming( aSig, 22, &aSig );
2982     zSig = aSig;
2983     if ( aExp || zSig ) {
2984         zSig |= 0x40000000;
2985         aExp -= 0x381;
2986     }
2987     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2988
2989 }
2990
2991
2992 /*----------------------------------------------------------------------------
2993 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2994 | half-precision floating-point value, returning the result.  After being
2995 | shifted into the proper positions, the three fields are simply added
2996 | together to form the result.  This means that any integer portion of `zSig'
2997 | will be added into the exponent.  Since a properly normalized significand
2998 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2999 | than the desired result exponent whenever `zSig' is a complete, normalized
3000 | significand.
3001 *----------------------------------------------------------------------------*/
3002 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3003 {
3004     return make_float16(
3005         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3006 }
3007
3008 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3009    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3010
3011 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3012 {
3013     flag aSign;
3014     int_fast16_t aExp;
3015     uint32_t aSig;
3016
3017     aSign = extractFloat16Sign(a);
3018     aExp = extractFloat16Exp(a);
3019     aSig = extractFloat16Frac(a);
3020
3021     if (aExp == 0x1f && ieee) {
3022         if (aSig) {
3023             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3024         }
3025         return packFloat32(aSign, 0xff, 0);
3026     }
3027     if (aExp == 0) {
3028         int8 shiftCount;
3029
3030         if (aSig == 0) {
3031             return packFloat32(aSign, 0, 0);
3032         }
3033
3034         shiftCount = countLeadingZeros32( aSig ) - 21;
3035         aSig = aSig << shiftCount;
3036         aExp = -shiftCount;
3037     }
3038     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3039 }
3040
3041 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3042 {
3043     flag aSign;
3044     int_fast16_t aExp;
3045     uint32_t aSig;
3046     uint32_t mask;
3047     uint32_t increment;
3048     int8 roundingMode;
3049     a = float32_squash_input_denormal(a STATUS_VAR);
3050
3051     aSig = extractFloat32Frac( a );
3052     aExp = extractFloat32Exp( a );
3053     aSign = extractFloat32Sign( a );
3054     if ( aExp == 0xFF ) {
3055         if (aSig) {
3056             /* Input is a NaN */
3057             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3058             if (!ieee) {
3059                 return packFloat16(aSign, 0, 0);
3060             }
3061             return r;
3062         }
3063         /* Infinity */
3064         if (!ieee) {
3065             float_raise(float_flag_invalid STATUS_VAR);
3066             return packFloat16(aSign, 0x1f, 0x3ff);
3067         }
3068         return packFloat16(aSign, 0x1f, 0);
3069     }
3070     if (aExp == 0 && aSig == 0) {
3071         return packFloat16(aSign, 0, 0);
3072     }
3073     /* Decimal point between bits 22 and 23.  */
3074     aSig |= 0x00800000;
3075     aExp -= 0x7f;
3076     if (aExp < -14) {
3077         mask = 0x00ffffff;
3078         if (aExp >= -24) {
3079             mask >>= 25 + aExp;
3080         }
3081     } else {
3082         mask = 0x00001fff;
3083     }
3084     if (aSig & mask) {
3085         float_raise( float_flag_underflow STATUS_VAR );
3086         roundingMode = STATUS(float_rounding_mode);
3087         switch (roundingMode) {
3088         case float_round_nearest_even:
3089             increment = (mask + 1) >> 1;
3090             if ((aSig & mask) == increment) {
3091                 increment = aSig & (increment << 1);
3092             }
3093             break;
3094         case float_round_up:
3095             increment = aSign ? 0 : mask;
3096             break;
3097         case float_round_down:
3098             increment = aSign ? mask : 0;
3099             break;
3100         default: /* round_to_zero */
3101             increment = 0;
3102             break;
3103         }
3104         aSig += increment;
3105         if (aSig >= 0x01000000) {
3106             aSig >>= 1;
3107             aExp++;
3108         }
3109     } else if (aExp < -14
3110           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
3111         float_raise( float_flag_underflow STATUS_VAR);
3112     }
3113
3114     if (ieee) {
3115         if (aExp > 15) {
3116             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
3117             return packFloat16(aSign, 0x1f, 0);
3118         }
3119     } else {
3120         if (aExp > 16) {
3121             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
3122             return packFloat16(aSign, 0x1f, 0x3ff);
3123         }
3124     }
3125     if (aExp < -24) {
3126         return packFloat16(aSign, 0, 0);
3127     }
3128     if (aExp < -14) {
3129         aSig >>= -14 - aExp;
3130         aExp = -14;
3131     }
3132     return packFloat16(aSign, aExp + 14, aSig >> 13);
3133 }
3134
3135 /*----------------------------------------------------------------------------
3136 | Returns the result of converting the double-precision floating-point value
3137 | `a' to the extended double-precision floating-point format.  The conversion
3138 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3139 | Arithmetic.
3140 *----------------------------------------------------------------------------*/
3141
3142 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3143 {
3144     flag aSign;
3145     int_fast16_t aExp;
3146     uint64_t aSig;
3147
3148     a = float64_squash_input_denormal(a STATUS_VAR);
3149     aSig = extractFloat64Frac( a );
3150     aExp = extractFloat64Exp( a );
3151     aSign = extractFloat64Sign( a );
3152     if ( aExp == 0x7FF ) {
3153         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3154         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3155     }
3156     if ( aExp == 0 ) {
3157         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3158         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3159     }
3160     return
3161         packFloatx80(
3162             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3163
3164 }
3165
3166 /*----------------------------------------------------------------------------
3167 | Returns the result of converting the double-precision floating-point value
3168 | `a' to the quadruple-precision floating-point format.  The conversion is
3169 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3170 | Arithmetic.
3171 *----------------------------------------------------------------------------*/
3172
3173 float128 float64_to_float128( float64 a STATUS_PARAM )
3174 {
3175     flag aSign;
3176     int_fast16_t aExp;
3177     uint64_t aSig, zSig0, zSig1;
3178
3179     a = float64_squash_input_denormal(a STATUS_VAR);
3180     aSig = extractFloat64Frac( a );
3181     aExp = extractFloat64Exp( a );
3182     aSign = extractFloat64Sign( a );
3183     if ( aExp == 0x7FF ) {
3184         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3185         return packFloat128( aSign, 0x7FFF, 0, 0 );
3186     }
3187     if ( aExp == 0 ) {
3188         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3189         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3190         --aExp;
3191     }
3192     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3193     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3194
3195 }
3196
3197 /*----------------------------------------------------------------------------
3198 | Rounds the double-precision floating-point value `a' to an integer, and
3199 | returns the result as a double-precision floating-point value.  The
3200 | operation is performed according to the IEC/IEEE Standard for Binary
3201 | Floating-Point Arithmetic.
3202 *----------------------------------------------------------------------------*/
3203
3204 float64 float64_round_to_int( float64 a STATUS_PARAM )
3205 {
3206     flag aSign;
3207     int_fast16_t aExp;
3208     uint64_t lastBitMask, roundBitsMask;
3209     int8 roundingMode;
3210     uint64_t z;
3211     a = float64_squash_input_denormal(a STATUS_VAR);
3212
3213     aExp = extractFloat64Exp( a );
3214     if ( 0x433 <= aExp ) {
3215         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3216             return propagateFloat64NaN( a, a STATUS_VAR );
3217         }
3218         return a;
3219     }
3220     if ( aExp < 0x3FF ) {
3221         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3222         STATUS(float_exception_flags) |= float_flag_inexact;
3223         aSign = extractFloat64Sign( a );
3224         switch ( STATUS(float_rounding_mode) ) {
3225          case float_round_nearest_even:
3226             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3227                 return packFloat64( aSign, 0x3FF, 0 );
3228             }
3229             break;
3230          case float_round_down:
3231             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3232          case float_round_up:
3233             return make_float64(
3234             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3235         }
3236         return packFloat64( aSign, 0, 0 );
3237     }
3238     lastBitMask = 1;
3239     lastBitMask <<= 0x433 - aExp;
3240     roundBitsMask = lastBitMask - 1;
3241     z = float64_val(a);
3242     roundingMode = STATUS(float_rounding_mode);
3243     if ( roundingMode == float_round_nearest_even ) {
3244         z += lastBitMask>>1;
3245         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3246     }
3247     else if ( roundingMode != float_round_to_zero ) {
3248         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3249             z += roundBitsMask;
3250         }
3251     }
3252     z &= ~ roundBitsMask;
3253     if ( z != float64_val(a) )
3254         STATUS(float_exception_flags) |= float_flag_inexact;
3255     return make_float64(z);
3256
3257 }
3258
3259 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3260 {
3261     int oldmode;
3262     float64 res;
3263     oldmode = STATUS(float_rounding_mode);
3264     STATUS(float_rounding_mode) = float_round_to_zero;
3265     res = float64_round_to_int(a STATUS_VAR);
3266     STATUS(float_rounding_mode) = oldmode;
3267     return res;
3268 }
3269
3270 /*----------------------------------------------------------------------------
3271 | Returns the result of adding the absolute values of the double-precision
3272 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3273 | before being returned.  `zSign' is ignored if the result is a NaN.
3274 | The addition is performed according to the IEC/IEEE Standard for Binary
3275 | Floating-Point Arithmetic.
3276 *----------------------------------------------------------------------------*/
3277
3278 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3279 {
3280     int_fast16_t aExp, bExp, zExp;
3281     uint64_t aSig, bSig, zSig;
3282     int_fast16_t expDiff;
3283
3284     aSig = extractFloat64Frac( a );
3285     aExp = extractFloat64Exp( a );
3286     bSig = extractFloat64Frac( b );
3287     bExp = extractFloat64Exp( b );
3288     expDiff = aExp - bExp;
3289     aSig <<= 9;
3290     bSig <<= 9;
3291     if ( 0 < expDiff ) {
3292         if ( aExp == 0x7FF ) {
3293             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3294             return a;
3295         }
3296         if ( bExp == 0 ) {
3297             --expDiff;
3298         }
3299         else {
3300             bSig |= LIT64( 0x2000000000000000 );
3301         }
3302         shift64RightJamming( bSig, expDiff, &bSig );
3303         zExp = aExp;
3304     }
3305     else if ( expDiff < 0 ) {
3306         if ( bExp == 0x7FF ) {
3307             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3308             return packFloat64( zSign, 0x7FF, 0 );
3309         }
3310         if ( aExp == 0 ) {
3311             ++expDiff;
3312         }
3313         else {
3314             aSig |= LIT64( 0x2000000000000000 );
3315         }
3316         shift64RightJamming( aSig, - expDiff, &aSig );
3317         zExp = bExp;
3318     }
3319     else {
3320         if ( aExp == 0x7FF ) {
3321             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3322             return a;
3323         }
3324         if ( aExp == 0 ) {
3325             if (STATUS(flush_to_zero)) {
3326                 if (aSig | bSig) {
3327                     float_raise(float_flag_output_denormal STATUS_VAR);
3328                 }
3329                 return packFloat64(zSign, 0, 0);
3330             }
3331             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3332         }
3333         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3334         zExp = aExp;
3335         goto roundAndPack;
3336     }
3337     aSig |= LIT64( 0x2000000000000000 );
3338     zSig = ( aSig + bSig )<<1;
3339     --zExp;
3340     if ( (int64_t) zSig < 0 ) {
3341         zSig = aSig + bSig;
3342         ++zExp;
3343     }
3344  roundAndPack:
3345     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3346
3347 }
3348
3349 /*----------------------------------------------------------------------------
3350 | Returns the result of subtracting the absolute values of the double-
3351 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3352 | difference is negated before being returned.  `zSign' is ignored if the
3353 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3354 | Standard for Binary Floating-Point Arithmetic.
3355 *----------------------------------------------------------------------------*/
3356
3357 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3358 {
3359     int_fast16_t aExp, bExp, zExp;
3360     uint64_t aSig, bSig, zSig;
3361     int_fast16_t expDiff;
3362
3363     aSig = extractFloat64Frac( a );
3364     aExp = extractFloat64Exp( a );
3365     bSig = extractFloat64Frac( b );
3366     bExp = extractFloat64Exp( b );
3367     expDiff = aExp - bExp;
3368     aSig <<= 10;
3369     bSig <<= 10;
3370     if ( 0 < expDiff ) goto aExpBigger;
3371     if ( expDiff < 0 ) goto bExpBigger;
3372     if ( aExp == 0x7FF ) {
3373         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3374         float_raise( float_flag_invalid STATUS_VAR);
3375         return float64_default_nan;
3376     }
3377     if ( aExp == 0 ) {
3378         aExp = 1;
3379         bExp = 1;
3380     }
3381     if ( bSig < aSig ) goto aBigger;
3382     if ( aSig < bSig ) goto bBigger;
3383     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3384  bExpBigger:
3385     if ( bExp == 0x7FF ) {
3386         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3387         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3388     }
3389     if ( aExp == 0 ) {
3390         ++expDiff;
3391     }
3392     else {
3393         aSig |= LIT64( 0x4000000000000000 );
3394     }
3395     shift64RightJamming( aSig, - expDiff, &aSig );
3396     bSig |= LIT64( 0x4000000000000000 );
3397  bBigger:
3398     zSig = bSig - aSig;
3399     zExp = bExp;
3400     zSign ^= 1;
3401     goto normalizeRoundAndPack;
3402  aExpBigger:
3403     if ( aExp == 0x7FF ) {
3404         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3405         return a;
3406     }
3407     if ( bExp == 0 ) {
3408         --expDiff;
3409     }
3410     else {
3411         bSig |= LIT64( 0x4000000000000000 );
3412     }
3413     shift64RightJamming( bSig, expDiff, &bSig );
3414     aSig |= LIT64( 0x4000000000000000 );
3415  aBigger:
3416     zSig = aSig - bSig;
3417     zExp = aExp;
3418  normalizeRoundAndPack:
3419     --zExp;
3420     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3421
3422 }
3423
3424 /*----------------------------------------------------------------------------
3425 | Returns the result of adding the double-precision floating-point values `a'
3426 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3427 | Binary Floating-Point Arithmetic.
3428 *----------------------------------------------------------------------------*/
3429
3430 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3431 {
3432     flag aSign, bSign;
3433     a = float64_squash_input_denormal(a STATUS_VAR);
3434     b = float64_squash_input_denormal(b STATUS_VAR);
3435
3436     aSign = extractFloat64Sign( a );
3437     bSign = extractFloat64Sign( b );
3438     if ( aSign == bSign ) {
3439         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3440     }
3441     else {
3442         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3443     }
3444
3445 }
3446
3447 /*----------------------------------------------------------------------------
3448 | Returns the result of subtracting the double-precision floating-point values
3449 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3450 | for Binary Floating-Point Arithmetic.
3451 *----------------------------------------------------------------------------*/
3452
3453 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3454 {
3455     flag aSign, bSign;
3456     a = float64_squash_input_denormal(a STATUS_VAR);
3457     b = float64_squash_input_denormal(b STATUS_VAR);
3458
3459     aSign = extractFloat64Sign( a );
3460     bSign = extractFloat64Sign( b );
3461     if ( aSign == bSign ) {
3462         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3463     }
3464     else {
3465         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3466     }
3467
3468 }
3469
3470 /*----------------------------------------------------------------------------
3471 | Returns the result of multiplying the double-precision floating-point values
3472 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3473 | for Binary Floating-Point Arithmetic.
3474 *----------------------------------------------------------------------------*/
3475
3476 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3477 {
3478     flag aSign, bSign, zSign;
3479     int_fast16_t aExp, bExp, zExp;
3480     uint64_t aSig, bSig, zSig0, zSig1;
3481
3482     a = float64_squash_input_denormal(a STATUS_VAR);
3483     b = float64_squash_input_denormal(b STATUS_VAR);
3484
3485     aSig = extractFloat64Frac( a );
3486     aExp = extractFloat64Exp( a );
3487     aSign = extractFloat64Sign( a );
3488     bSig = extractFloat64Frac( b );
3489     bExp = extractFloat64Exp( b );
3490     bSign = extractFloat64Sign( b );
3491     zSign = aSign ^ bSign;
3492     if ( aExp == 0x7FF ) {
3493         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3494             return propagateFloat64NaN( a, b STATUS_VAR );
3495         }
3496         if ( ( bExp | bSig ) == 0 ) {
3497             float_raise( float_flag_invalid STATUS_VAR);
3498             return float64_default_nan;
3499         }
3500         return packFloat64( zSign, 0x7FF, 0 );
3501     }
3502     if ( bExp == 0x7FF ) {
3503         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3504         if ( ( aExp | aSig ) == 0 ) {
3505             float_raise( float_flag_invalid STATUS_VAR);
3506             return float64_default_nan;
3507         }
3508         return packFloat64( zSign, 0x7FF, 0 );
3509     }
3510     if ( aExp == 0 ) {
3511         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3512         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3513     }
3514     if ( bExp == 0 ) {
3515         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3516         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3517     }
3518     zExp = aExp + bExp - 0x3FF;
3519     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3520     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3521     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3522     zSig0 |= ( zSig1 != 0 );
3523     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3524         zSig0 <<= 1;
3525         --zExp;
3526     }
3527     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3528
3529 }
3530
3531 /*----------------------------------------------------------------------------
3532 | Returns the result of dividing the double-precision floating-point value `a'
3533 | by the corresponding value `b'.  The operation is performed according to
3534 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3535 *----------------------------------------------------------------------------*/
3536
3537 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3538 {
3539     flag aSign, bSign, zSign;
3540     int_fast16_t aExp, bExp, zExp;
3541     uint64_t aSig, bSig, zSig;
3542     uint64_t rem0, rem1;
3543     uint64_t term0, term1;
3544     a = float64_squash_input_denormal(a STATUS_VAR);
3545     b = float64_squash_input_denormal(b STATUS_VAR);
3546
3547     aSig = extractFloat64Frac( a );
3548     aExp = extractFloat64Exp( a );
3549     aSign = extractFloat64Sign( a );
3550     bSig = extractFloat64Frac( b );
3551     bExp = extractFloat64Exp( b );
3552     bSign = extractFloat64Sign( b );
3553     zSign = aSign ^ bSign;
3554     if ( aExp == 0x7FF ) {
3555         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3556         if ( bExp == 0x7FF ) {
3557             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3558             float_raise( float_flag_invalid STATUS_VAR);
3559             return float64_default_nan;
3560         }
3561         return packFloat64( zSign, 0x7FF, 0 );
3562     }
3563     if ( bExp == 0x7FF ) {
3564         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3565         return packFloat64( zSign, 0, 0 );
3566     }
3567     if ( bExp == 0 ) {
3568         if ( bSig == 0 ) {
3569             if ( ( aExp | aSig ) == 0 ) {
3570                 float_raise( float_flag_invalid STATUS_VAR);
3571                 return float64_default_nan;
3572             }
3573             float_raise( float_flag_divbyzero STATUS_VAR);
3574             return packFloat64( zSign, 0x7FF, 0 );
3575         }
3576         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3577     }
3578     if ( aExp == 0 ) {
3579         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3580         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3581     }
3582     zExp = aExp - bExp + 0x3FD;
3583     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3584     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3585     if ( bSig <= ( aSig + aSig ) ) {
3586         aSig >>= 1;
3587         ++zExp;
3588     }
3589     zSig = estimateDiv128To64( aSig, 0, bSig );
3590     if ( ( zSig & 0x1FF ) <= 2 ) {
3591         mul64To128( bSig, zSig, &term0, &term1 );
3592         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3593         while ( (int64_t) rem0 < 0 ) {
3594             --zSig;
3595             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3596         }
3597         zSig |= ( rem1 != 0 );
3598     }
3599     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3600
3601 }
3602
3603 /*----------------------------------------------------------------------------
3604 | Returns the remainder of the double-precision floating-point value `a'
3605 | with respect to the corresponding value `b'.  The operation is performed
3606 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3607 *----------------------------------------------------------------------------*/
3608
3609 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3610 {
3611     flag aSign, zSign;
3612     int_fast16_t aExp, bExp, expDiff;
3613     uint64_t aSig, bSig;
3614     uint64_t q, alternateASig;
3615     int64_t sigMean;
3616
3617     a = float64_squash_input_denormal(a STATUS_VAR);
3618     b = float64_squash_input_denormal(b STATUS_VAR);
3619     aSig = extractFloat64Frac( a );
3620     aExp = extractFloat64Exp( a );
3621     aSign = extractFloat64Sign( a );
3622     bSig = extractFloat64Frac( b );
3623     bExp = extractFloat64Exp( b );
3624     if ( aExp == 0x7FF ) {
3625         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3626             return propagateFloat64NaN( a, b STATUS_VAR );
3627         }
3628         float_raise( float_flag_invalid STATUS_VAR);
3629         return float64_default_nan;
3630     }
3631     if ( bExp == 0x7FF ) {
3632         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3633         return a;
3634     }
3635     if ( bExp == 0 ) {
3636         if ( bSig == 0 ) {
3637             float_raise( float_flag_invalid STATUS_VAR);
3638             return float64_default_nan;
3639         }
3640         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3641     }
3642     if ( aExp == 0 ) {
3643         if ( aSig == 0 ) return a;
3644         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3645     }
3646     expDiff = aExp - bExp;
3647     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3648     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3649     if ( expDiff < 0 ) {
3650         if ( expDiff < -1 ) return a;
3651         aSig >>= 1;
3652     }
3653     q = ( bSig <= aSig );
3654     if ( q ) aSig -= bSig;
3655     expDiff -= 64;
3656     while ( 0 < expDiff ) {
3657         q = estimateDiv128To64( aSig, 0, bSig );
3658         q = ( 2 < q ) ? q - 2 : 0;
3659         aSig = - ( ( bSig>>2 ) * q );
3660         expDiff -= 62;
3661     }
3662     expDiff += 64;
3663     if ( 0 < expDiff ) {
3664         q = estimateDiv128To64( aSig, 0, bSig );
3665         q = ( 2 < q ) ? q - 2 : 0;
3666         q >>= 64 - expDiff;
3667         bSig >>= 2;
3668         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3669     }
3670     else {
3671         aSig >>= 2;
3672         bSig >>= 2;
3673     }
3674     do {
3675         alternateASig = aSig;
3676         ++q;
3677         aSig -= bSig;
3678     } while ( 0 <= (int64_t) aSig );
3679     sigMean = aSig + alternateASig;
3680     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3681         aSig = alternateASig;
3682     }
3683     zSign = ( (int64_t) aSig < 0 );
3684     if ( zSign ) aSig = - aSig;
3685     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3686
3687 }
3688
3689 /*----------------------------------------------------------------------------
3690 | Returns the result of multiplying the double-precision floating-point values
3691 | `a' and `b' then adding 'c', with no intermediate rounding step after the
3692 | multiplication.  The operation is performed according to the IEC/IEEE
3693 | Standard for Binary Floating-Point Arithmetic 754-2008.
3694 | The flags argument allows the caller to select negation of the
3695 | addend, the intermediate product, or the final result. (The difference
3696 | between this and having the caller do a separate negation is that negating
3697 | externally will flip the sign bit on NaNs.)
3698 *----------------------------------------------------------------------------*/
3699
3700 float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3701 {
3702     flag aSign, bSign, cSign, zSign;
3703     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
3704     uint64_t aSig, bSig, cSig;
3705     flag pInf, pZero, pSign;
3706     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3707     int shiftcount;
3708     flag signflip, infzero;
3709
3710     a = float64_squash_input_denormal(a STATUS_VAR);
3711     b = float64_squash_input_denormal(b STATUS_VAR);
3712     c = float64_squash_input_denormal(c STATUS_VAR);
3713     aSig = extractFloat64Frac(a);
3714     aExp = extractFloat64Exp(a);
3715     aSign = extractFloat64Sign(a);
3716     bSig = extractFloat64Frac(b);
3717     bExp = extractFloat64Exp(b);
3718     bSign = extractFloat64Sign(b);
3719     cSig = extractFloat64Frac(c);
3720     cExp = extractFloat64Exp(c);
3721     cSign = extractFloat64Sign(c);
3722
3723     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3724                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3725
3726     /* It is implementation-defined whether the cases of (0,inf,qnan)
3727      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3728      * they return if they do), so we have to hand this information
3729      * off to the target-specific pick-a-NaN routine.
3730      */
3731     if (((aExp == 0x7ff) && aSig) ||
3732         ((bExp == 0x7ff) && bSig) ||
3733         ((cExp == 0x7ff) && cSig)) {
3734         return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3735     }
3736
3737     if (infzero) {
3738         float_raise(float_flag_invalid STATUS_VAR);
3739         return float64_default_nan;
3740     }
3741
3742     if (flags & float_muladd_negate_c) {
3743         cSign ^= 1;
3744     }
3745
3746     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3747
3748     /* Work out the sign and type of the product */
3749     pSign = aSign ^ bSign;
3750     if (flags & float_muladd_negate_product) {
3751         pSign ^= 1;
3752     }
3753     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3754     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3755
3756     if (cExp == 0x7ff) {
3757         if (pInf && (pSign ^ cSign)) {
3758             /* addition of opposite-signed infinities => InvalidOperation */
3759             float_raise(float_flag_invalid STATUS_VAR);
3760             return float64_default_nan;
3761         }
3762         /* Otherwise generate an infinity of the same sign */
3763         return packFloat64(cSign ^ signflip, 0x7ff, 0);
3764     }
3765
3766     if (pInf) {
3767         return packFloat64(pSign ^ signflip, 0x7ff, 0);
3768     }
3769
3770     if (pZero) {
3771         if (cExp == 0) {
3772             if (cSig == 0) {
3773                 /* Adding two exact zeroes */
3774                 if (pSign == cSign) {
3775                     zSign = pSign;
3776                 } else if (STATUS(float_rounding_mode) == float_round_down) {
3777                     zSign = 1;
3778                 } else {
3779                     zSign = 0;
3780                 }
3781                 return packFloat64(zSign ^ signflip, 0, 0);
3782             }
3783             /* Exact zero plus a denorm */
3784             if (STATUS(flush_to_zero)) {
3785                 float_raise(float_flag_output_denormal STATUS_VAR);
3786                 return packFloat64(cSign ^ signflip, 0, 0);
3787             }
3788         }
3789         /* Zero plus something non-zero : just return the something */
3790         return packFloat64(cSign ^ signflip, cExp, cSig);
3791     }
3792
3793     if (aExp == 0) {
3794         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3795     }
3796     if (bExp == 0) {
3797         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3798     }
3799
3800     /* Calculate the actual result a * b + c */
3801
3802     /* Multiply first; this is easy. */
3803     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3804      * because we want the true exponent, not the "one-less-than"
3805      * flavour that roundAndPackFloat64() takes.
3806      */
3807     pExp = aExp + bExp - 0x3fe;
3808     aSig = (aSig | LIT64(0x0010000000000000))<<10;
3809     bSig = (bSig | LIT64(0x0010000000000000))<<11;
3810     mul64To128(aSig, bSig, &pSig0, &pSig1);
3811     if ((int64_t)(pSig0 << 1) >= 0) {
3812         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3813         pExp--;
3814     }
3815
3816     zSign = pSign ^ signflip;
3817
3818     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3819      * bit in position 126.
3820      */
3821     if (cExp == 0) {
3822         if (!cSig) {
3823             /* Throw out the special case of c being an exact zero now */
3824             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3825             return roundAndPackFloat64(zSign, pExp - 1,
3826                                        pSig1 STATUS_VAR);
3827         }
3828         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3829     }
3830
3831     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3832      * significand of the addend, with the explicit bit in position 126.
3833      */
3834     cSig0 = cSig << (126 - 64 - 52);
3835     cSig1 = 0;
3836     cSig0 |= LIT64(0x4000000000000000);
3837     expDiff = pExp - cExp;
3838
3839     if (pSign == cSign) {
3840         /* Addition */
3841         if (expDiff > 0) {
3842             /* scale c to match p */
3843             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3844             zExp = pExp;
3845         } else if (expDiff < 0) {
3846             /* scale p to match c */
3847             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3848             zExp = cExp;
3849         } else {
3850             /* no scaling needed */
3851             zExp = cExp;
3852         }
3853         /* Add significands and make sure explicit bit ends up in posn 126 */
3854         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3855         if ((int64_t)zSig0 < 0) {
3856             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3857         } else {
3858             zExp--;
3859         }
3860         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3861         return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3862     } else {
3863         /* Subtraction */
3864         if (expDiff > 0) {
3865             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3866             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3867             zExp = pExp;
3868         } else if (expDiff < 0) {
3869             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3870             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3871             zExp = cExp;
3872             zSign ^= 1;
3873         } else {
3874             zExp = pExp;
3875             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3876                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3877             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
3878                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3879                 zSign ^= 1;
3880             } else {
3881                 /* Exact zero */
3882                 zSign = signflip;
3883                 if (STATUS(float_rounding_mode) == float_round_down) {
3884                     zSign ^= 1;
3885                 }
3886                 return packFloat64(zSign, 0, 0);
3887             }
3888         }
3889         --zExp;
3890         /* Do the equivalent of normalizeRoundAndPackFloat64() but
3891          * starting with the significand in a pair of uint64_t.
3892          */
3893         if (zSig0) {
3894             shiftcount = countLeadingZeros64(zSig0) - 1;
3895             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
3896             if (zSig1) {
3897                 zSig0 |= 1;
3898             }
3899             zExp -= shiftcount;
3900         } else {
3901             shiftcount = countLeadingZeros64(zSig1);
3902             if (shiftcount == 0) {
3903                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
3904                 zExp -= 63;
3905             } else {
3906                 shiftcount--;
3907                 zSig0 = zSig1 << shiftcount;
3908                 zExp -= (shiftcount + 64);
3909             }
3910         }
3911         return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
3912     }
3913 }
3914
/*----------------------------------------------------------------------------
| Returns the square root of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Special cases handled explicitly below: a NaN operand is propagated,
| +infinity and both zeros are returned as-is, and any other negative operand
| (including -infinity) raises the invalid exception and yields the default
| NaN.
*----------------------------------------------------------------------------*/

float64 float64_sqrt( float64 a STATUS_PARAM )
{
    flag aSign;
    int_fast16_t aExp, zExp;
    uint64_t aSig, zSig, doubleZSig;
    uint64_t rem0, rem1, term0, term1;
    a = float64_squash_input_denormal(a STATUS_VAR);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        /* Maximum exponent: NaN if the fraction is non-zero, else infinity. */
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
        if ( ! aSign ) return a;
        /* sqrt(-infinity) is invalid. */
        float_raise( float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }
    if ( aSign ) {
        /* Negative zero is returned unchanged; other negatives are invalid. */
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise( float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return float64_zero;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Halve the unbiased exponent and re-bias it for the result. */
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
    /* Make the implicit leading one explicit. */
    aSig |= LIT64( 0x0010000000000000 );
    /* Initial approximation from the top bits of the significand. */
    zSig = estimateSqrt32( aExp, aSig>>21 );
    /* Pre-scale the significand; the shift depends on the exponent parity. */
    aSig <<= 9 - ( aExp & 1 );
    /* Refine the approximation using a division estimate. */
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
    /* Only when the low nine bits are this small is the estimate corrected
     * exactly against the remainder aSig * 2^64 - zSig^2.
     */
    if ( ( zSig & 0x1FF ) <= 5 ) {
        doubleZSig = zSig<<1;
        mul64To128( zSig, zSig, &term0, &term1 );
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
        /* While the remainder is negative the estimate is too large: step
         * zSig down by one and add back 2 * zSig + 1 (for the new zSig),
         * which is the amount by which zSig^2 just shrank.
         */
        while ( (int64_t) rem0 < 0 ) {
            --zSig;
            doubleZSig -= 2;
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
        }
        /* A non-zero remainder is recorded as a sticky bit for rounding. */
        zSig |= ( ( rem0 | rem1 ) != 0 );
    }
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );

}
3966
3967 /*----------------------------------------------------------------------------
3968 | Returns the binary log of the double-precision floating-point value `a'.
3969 | The operation is performed according to the IEC/IEEE Standard for Binary
3970 | Floating-Point Arithmetic.
3971 *----------------------------------------------------------------------------*/
3972 float64 float64_log2( float64 a STATUS_PARAM )
3973 {
3974     flag aSign, zSign;
3975     int_fast16_t aExp;
3976     uint64_t aSig, aSig0, aSig1, zSig, i;
3977     a = float64_squash_input_denormal(a STATUS_VAR);
3978
3979     aSig = extractFloat64Frac( a );
3980     aExp = extractFloat64Exp( a );
3981     aSign = extractFloat64Sign( a );
3982
3983     if ( aExp == 0 ) {
3984         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3985         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3986     }
3987     if ( aSign ) {
3988         float_raise( float_flag_invalid STATUS_VAR);
3989         return float64_default_nan;
3990     }
3991     if ( aExp == 0x7FF ) {
3992         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3993         return a;
3994     }
3995
3996     aExp -= 0x3FF;
3997     aSig |= LIT64( 0x0010000000000000 );
3998     zSign = aExp < 0;
3999     zSig = (uint64_t)aExp << 52;
4000     for (i = 1LL << 51; i > 0; i >>= 1) {
4001         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4002         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4003         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4004             aSig >>= 1;
4005             zSig |= i;
4006         }
4007     }
4008
4009     if ( zSign )
4010         zSig = -zSig;
4011     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4012 }
4013
4014 /*----------------------------------------------------------------------------
4015 | Returns 1 if the double-precision floating-point value `a' is equal to the
4016 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4017 | if either operand is a NaN.  Otherwise, the comparison is performed
4018 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4019 *----------------------------------------------------------------------------*/
4020
4021 int float64_eq( float64 a, float64 b STATUS_PARAM )
4022 {
4023     uint64_t av, bv;
4024     a = float64_squash_input_denormal(a STATUS_VAR);
4025     b = float64_squash_input_denormal(b STATUS_VAR);
4026
4027     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4028          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4029        ) {
4030         float_raise( float_flag_invalid STATUS_VAR);
4031         return 0;
4032     }
4033     av = float64_val(a);
4034     bv = float64_val(b);
4035     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4036
4037 }
4038
4039 /*----------------------------------------------------------------------------
4040 | Returns 1 if the double-precision floating-point value `a' is less than or
4041 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4042 | exception is raised if either operand is a NaN.  The comparison is performed
4043 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4044 *----------------------------------------------------------------------------*/
4045
4046 int float64_le( float64 a, float64 b STATUS_PARAM )
4047 {
4048     flag aSign, bSign;
4049     uint64_t av, bv;
4050     a = float64_squash_input_denormal(a STATUS_VAR);
4051     b = float64_squash_input_denormal(b STATUS_VAR);
4052
4053     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4054          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4055        ) {
4056         float_raise( float_flag_invalid STATUS_VAR);
4057         return 0;
4058     }
4059     aSign = extractFloat64Sign( a );
4060     bSign = extractFloat64Sign( b );
4061     av = float64_val(a);
4062     bv = float64_val(b);
4063     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4064     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4065
4066 }
4067
4068 /*----------------------------------------------------------------------------
4069 | Returns 1 if the double-precision floating-point value `a' is less than
4070 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4071 | raised if either operand is a NaN.  The comparison is performed according
4072 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4073 *----------------------------------------------------------------------------*/
4074
4075 int float64_lt( float64 a, float64 b STATUS_PARAM )
4076 {
4077     flag aSign, bSign;
4078     uint64_t av, bv;
4079
4080     a = float64_squash_input_denormal(a STATUS_VAR);
4081     b = float64_squash_input_denormal(b STATUS_VAR);
4082     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4083          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4084        ) {
4085         float_raise( float_flag_invalid STATUS_VAR);
4086         return 0;
4087     }
4088     aSign = extractFloat64Sign( a );
4089     bSign = extractFloat64Sign( b );
4090     av = float64_val(a);
4091     bv = float64_val(b);
4092     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4093     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4094
4095 }
4096
4097 /*----------------------------------------------------------------------------
4098 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4099 | be compared, and 0 otherwise.  The invalid exception is raised if either
4100 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4101 | Standard for Binary Floating-Point Arithmetic.
4102 *----------------------------------------------------------------------------*/
4103
4104 int float64_unordered( float64 a, float64 b STATUS_PARAM )
4105 {
4106     a = float64_squash_input_denormal(a STATUS_VAR);
4107     b = float64_squash_input_denormal(b STATUS_VAR);
4108
4109     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4110          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4111        ) {
4112         float_raise( float_flag_invalid STATUS_VAR);
4113         return 1;
4114     }
4115     return 0;
4116 }
4117
4118 /*----------------------------------------------------------------------------
4119 | Returns 1 if the double-precision floating-point value `a' is equal to the
4120 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4121 | exception.The comparison is performed according to the IEC/IEEE Standard
4122 | for Binary Floating-Point Arithmetic.
4123 *----------------------------------------------------------------------------*/
4124
4125 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4126 {
4127     uint64_t av, bv;
4128     a = float64_squash_input_denormal(a STATUS_VAR);
4129     b = float64_squash_input_denormal(b STATUS_VAR);
4130
4131     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4132          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4133        ) {
4134         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4135             float_raise( float_flag_invalid STATUS_VAR);
4136         }
4137         return 0;
4138     }
4139     av = float64_val(a);
4140     bv = float64_val(b);
4141     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4142
4143 }
4144
4145 /*----------------------------------------------------------------------------
4146 | Returns 1 if the double-precision floating-point value `a' is less than or
4147 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4148 | cause an exception.  Otherwise, the comparison is performed according to the
4149 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4150 *----------------------------------------------------------------------------*/
4151
4152 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4153 {
4154     flag aSign, bSign;
4155     uint64_t av, bv;
4156     a = float64_squash_input_denormal(a STATUS_VAR);
4157     b = float64_squash_input_denormal(b STATUS_VAR);
4158
4159     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4160          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4161        ) {
4162         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4163             float_raise( float_flag_invalid STATUS_VAR);
4164         }
4165         return 0;
4166     }
4167     aSign = extractFloat64Sign( a );
4168     bSign = extractFloat64Sign( b );
4169     av = float64_val(a);
4170     bv = float64_val(b);
4171     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4172     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4173
4174 }
4175
4176 /*----------------------------------------------------------------------------
4177 | Returns 1 if the double-precision floating-point value `a' is less than
4178 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4179 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4180 | Standard for Binary Floating-Point Arithmetic.
4181 *----------------------------------------------------------------------------*/
4182
4183 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4184 {
4185     flag aSign, bSign;
4186     uint64_t av, bv;
4187     a = float64_squash_input_denormal(a STATUS_VAR);
4188     b = float64_squash_input_denormal(b STATUS_VAR);
4189
4190     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4191          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4192        ) {
4193         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4194             float_raise( float_flag_invalid STATUS_VAR);
4195         }
4196         return 0;
4197     }
4198     aSign = extractFloat64Sign( a );
4199     bSign = extractFloat64Sign( b );
4200     av = float64_val(a);
4201     bv = float64_val(b);
4202     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4203     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4204
4205 }
4206
4207 /*----------------------------------------------------------------------------
4208 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4209 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4210 | comparison is performed according to the IEC/IEEE Standard for Binary
4211 | Floating-Point Arithmetic.
4212 *----------------------------------------------------------------------------*/
4213
4214 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4215 {
4216     a = float64_squash_input_denormal(a STATUS_VAR);
4217     b = float64_squash_input_denormal(b STATUS_VAR);
4218
4219     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4220          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4221        ) {
4222         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4223             float_raise( float_flag_invalid STATUS_VAR);
4224         }
4225         return 1;
4226     }
4227     return 0;
4228 }
4229
4230 /*----------------------------------------------------------------------------
4231 | Returns the result of converting the extended double-precision floating-
4232 | point value `a' to the 32-bit two's complement integer format.  The
4233 | conversion is performed according to the IEC/IEEE Standard for Binary
4234 | Floating-Point Arithmetic---which means in particular that the conversion
4235 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4236 | largest positive integer is returned.  Otherwise, if the conversion
4237 | overflows, the largest integer with the same sign as `a' is returned.
4238 *----------------------------------------------------------------------------*/
4239
4240 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4241 {
4242     flag aSign;
4243     int32 aExp, shiftCount;
4244     uint64_t aSig;
4245
4246     aSig = extractFloatx80Frac( a );
4247     aExp = extractFloatx80Exp( a );
4248     aSign = extractFloatx80Sign( a );
4249     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4250     shiftCount = 0x4037 - aExp;
4251     if ( shiftCount <= 0 ) shiftCount = 1;
4252     shift64RightJamming( aSig, shiftCount, &aSig );
4253     return roundAndPackInt32( aSign, aSig STATUS_VAR );
4254
4255 }
4256
4257 /*----------------------------------------------------------------------------
4258 | Returns the result of converting the extended double-precision floating-
4259 | point value `a' to the 32-bit two's complement integer format.  The
4260 | conversion is performed according to the IEC/IEEE Standard for Binary
4261 | Floating-Point Arithmetic, except that the conversion is always rounded
4262 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4263 | Otherwise, if the conversion overflows, the largest integer with the same
4264 | sign as `a' is returned.
4265 *----------------------------------------------------------------------------*/
4266
4267 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4268 {
4269     flag aSign;
4270     int32 aExp, shiftCount;
4271     uint64_t aSig, savedASig;
4272     int32_t z;
4273
4274     aSig = extractFloatx80Frac( a );
4275     aExp = extractFloatx80Exp( a );
4276     aSign = extractFloatx80Sign( a );
4277     if ( 0x401E < aExp ) {
4278         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4279         goto invalid;
4280     }
4281     else if ( aExp < 0x3FFF ) {
4282         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4283         return 0;
4284     }
4285     shiftCount = 0x403E - aExp;
4286     savedASig = aSig;
4287     aSig >>= shiftCount;
4288     z = aSig;
4289     if ( aSign ) z = - z;
4290     if ( ( z < 0 ) ^ aSign ) {
4291  invalid:
4292         float_raise( float_flag_invalid STATUS_VAR);
4293         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4294     }
4295     if ( ( aSig<<shiftCount ) != savedASig ) {
4296         STATUS(float_exception_flags) |= float_flag_inexact;
4297     }
4298     return z;
4299
4300 }
4301
4302 /*----------------------------------------------------------------------------
4303 | Returns the result of converting the extended double-precision floating-
4304 | point value `a' to the 64-bit two's complement integer format.  The
4305 | conversion is performed according to the IEC/IEEE Standard for Binary
4306 | Floating-Point Arithmetic---which means in particular that the conversion
4307 | is rounded according to the current rounding mode.  If `a' is a NaN,
4308 | the largest positive integer is returned.  Otherwise, if the conversion
4309 | overflows, the largest integer with the same sign as `a' is returned.
4310 *----------------------------------------------------------------------------*/
4311
4312 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4313 {
4314     flag aSign;
4315     int32 aExp, shiftCount;
4316     uint64_t aSig, aSigExtra;
4317
4318     aSig = extractFloatx80Frac( a );
4319     aExp = extractFloatx80Exp( a );
4320     aSign = extractFloatx80Sign( a );
4321     shiftCount = 0x403E - aExp;
4322     if ( shiftCount <= 0 ) {
4323         if ( shiftCount ) {
4324             float_raise( float_flag_invalid STATUS_VAR);
4325             if (    ! aSign
4326                  || (    ( aExp == 0x7FFF )
4327                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4328                ) {
4329                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4330             }
4331             return (int64_t) LIT64( 0x8000000000000000 );
4332         }
4333         aSigExtra = 0;
4334     }
4335     else {
4336         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4337     }
4338     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4339
4340 }
4341
4342 /*----------------------------------------------------------------------------
4343 | Returns the result of converting the extended double-precision floating-
4344 | point value `a' to the 64-bit two's complement integer format.  The
4345 | conversion is performed according to the IEC/IEEE Standard for Binary
4346 | Floating-Point Arithmetic, except that the conversion is always rounded
4347 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4348 | Otherwise, if the conversion overflows, the largest integer with the same
4349 | sign as `a' is returned.
4350 *----------------------------------------------------------------------------*/
4351
4352 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4353 {
4354     flag aSign;
4355     int32 aExp, shiftCount;
4356     uint64_t aSig;
4357     int64 z;
4358
4359     aSig = extractFloatx80Frac( a );
4360     aExp = extractFloatx80Exp( a );
4361     aSign = extractFloatx80Sign( a );
4362     shiftCount = aExp - 0x403E;
4363     if ( 0 <= shiftCount ) {
4364         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4365         if ( ( a.high != 0xC03E ) || aSig ) {
4366             float_raise( float_flag_invalid STATUS_VAR);
4367             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4368                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4369             }
4370         }
4371         return (int64_t) LIT64( 0x8000000000000000 );
4372     }
4373     else if ( aExp < 0x3FFF ) {
4374         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4375         return 0;
4376     }
4377     z = aSig>>( - shiftCount );
4378     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4379         STATUS(float_exception_flags) |= float_flag_inexact;
4380     }
4381     if ( aSign ) z = - z;
4382     return z;
4383
4384 }
4385
4386 /*----------------------------------------------------------------------------
4387 | Returns the result of converting the extended double-precision floating-
4388 | point value `a' to the single-precision floating-point format.  The
4389 | conversion is performed according to the IEC/IEEE Standard for Binary
4390 | Floating-Point Arithmetic.
4391 *----------------------------------------------------------------------------*/
4392
4393 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4394 {
4395     flag aSign;
4396     int32 aExp;
4397     uint64_t aSig;
4398
4399     aSig = extractFloatx80Frac( a );
4400     aExp = extractFloatx80Exp( a );
4401     aSign = extractFloatx80Sign( a );
4402     if ( aExp == 0x7FFF ) {
4403         if ( (uint64_t) ( aSig<<1 ) ) {
4404             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4405         }
4406         return packFloat32( aSign, 0xFF, 0 );
4407     }
4408     shift64RightJamming( aSig, 33, &aSig );
4409     if ( aExp || aSig ) aExp -= 0x3F81;
4410     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4411
4412 }
4413
4414 /*----------------------------------------------------------------------------
4415 | Returns the result of converting the extended double-precision floating-
4416 | point value `a' to the double-precision floating-point format.  The
4417 | conversion is performed according to the IEC/IEEE Standard for Binary
4418 | Floating-Point Arithmetic.
4419 *----------------------------------------------------------------------------*/
4420
4421 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4422 {
4423     flag aSign;
4424     int32 aExp;
4425     uint64_t aSig, zSig;
4426
4427     aSig = extractFloatx80Frac( a );
4428     aExp = extractFloatx80Exp( a );
4429     aSign = extractFloatx80Sign( a );
4430     if ( aExp == 0x7FFF ) {
4431         if ( (uint64_t) ( aSig<<1 ) ) {
4432             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4433         }
4434         return packFloat64( aSign, 0x7FF, 0 );
4435     }
4436     shift64RightJamming( aSig, 1, &zSig );
4437     if ( aExp || aSig ) aExp -= 0x3C01;
4438     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4439
4440 }
4441
4442 /*----------------------------------------------------------------------------
4443 | Returns the result of converting the extended double-precision floating-
4444 | point value `a' to the quadruple-precision floating-point format.  The
4445 | conversion is performed according to the IEC/IEEE Standard for Binary
4446 | Floating-Point Arithmetic.
4447 *----------------------------------------------------------------------------*/
4448
4449 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4450 {
4451     flag aSign;
4452     int_fast16_t aExp;
4453     uint64_t aSig, zSig0, zSig1;
4454
4455     aSig = extractFloatx80Frac( a );
4456     aExp = extractFloatx80Exp( a );
4457     aSign = extractFloatx80Sign( a );
4458     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4459         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4460     }
4461     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4462     return packFloat128( aSign, aExp, zSig0, zSig1 );
4463
4464 }
4465
4466 /*----------------------------------------------------------------------------
4467 | Rounds the extended double-precision floating-point value `a' to an integer,
4468 | and returns the result as an extended quadruple-precision floating-point
4469 | value.  The operation is performed according to the IEC/IEEE Standard for
4470 | Binary Floating-Point Arithmetic.
4471 *----------------------------------------------------------------------------*/
4472
4473 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4474 {
4475     flag aSign;
4476     int32 aExp;
4477     uint64_t lastBitMask, roundBitsMask;
4478     int8 roundingMode;
4479     floatx80 z;
4480
4481     aExp = extractFloatx80Exp( a );
4482     if ( 0x403E <= aExp ) {
4483         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4484             return propagateFloatx80NaN( a, a STATUS_VAR );
4485         }
4486         return a;
4487     }
4488     if ( aExp < 0x3FFF ) {
4489         if (    ( aExp == 0 )
4490              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4491             return a;
4492         }
4493         STATUS(float_exception_flags) |= float_flag_inexact;
4494         aSign = extractFloatx80Sign( a );
4495         switch ( STATUS(float_rounding_mode) ) {
4496          case float_round_nearest_even:
4497             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4498                ) {
4499                 return
4500                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4501             }
4502             break;
4503          case float_round_down:
4504             return
4505                   aSign ?
4506                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4507                 : packFloatx80( 0, 0, 0 );
4508          case float_round_up:
4509             return
4510                   aSign ? packFloatx80( 1, 0, 0 )
4511                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4512         }
4513         return packFloatx80( aSign, 0, 0 );
4514     }
4515     lastBitMask = 1;
4516     lastBitMask <<= 0x403E - aExp;
4517     roundBitsMask = lastBitMask - 1;
4518     z = a;
4519     roundingMode = STATUS(float_rounding_mode);
4520     if ( roundingMode == float_round_nearest_even ) {
4521         z.low += lastBitMask>>1;
4522         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4523     }
4524     else if ( roundingMode != float_round_to_zero ) {
4525         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4526             z.low += roundBitsMask;
4527         }
4528     }
4529     z.low &= ~ roundBitsMask;
4530     if ( z.low == 0 ) {
4531         ++z.high;
4532         z.low = LIT64( 0x8000000000000000 );
4533     }
4534     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4535     return z;
4536
4537 }
4538
4539 /*----------------------------------------------------------------------------
4540 | Returns the result of adding the absolute values of the extended double-
4541 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4542 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4543 | The addition is performed according to the IEC/IEEE Standard for Binary
4544 | Floating-Point Arithmetic.
4545 *----------------------------------------------------------------------------*/
4546
4547 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4548 {
4549     int32 aExp, bExp, zExp;
4550     uint64_t aSig, bSig, zSig0, zSig1;
4551     int32 expDiff;
4552
4553     aSig = extractFloatx80Frac( a );
4554     aExp = extractFloatx80Exp( a );
4555     bSig = extractFloatx80Frac( b );
4556     bExp = extractFloatx80Exp( b );
4557     expDiff = aExp - bExp;
4558     if ( 0 < expDiff ) {
4559         if ( aExp == 0x7FFF ) {
4560             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4561             return a;
4562         }
4563         if ( bExp == 0 ) --expDiff;
4564         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4565         zExp = aExp;
4566     }
4567     else if ( expDiff < 0 ) {
4568         if ( bExp == 0x7FFF ) {
4569             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4570             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4571         }
4572         if ( aExp == 0 ) ++expDiff;
4573         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4574         zExp = bExp;
4575     }
4576     else {
4577         if ( aExp == 0x7FFF ) {
4578             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4579                 return propagateFloatx80NaN( a, b STATUS_VAR );
4580             }
4581             return a;
4582         }
4583         zSig1 = 0;
4584         zSig0 = aSig + bSig;
4585         if ( aExp == 0 ) {
4586             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4587             goto roundAndPack;
4588         }
4589         zExp = aExp;
4590         goto shiftRight1;
4591     }
4592     zSig0 = aSig + bSig;
4593     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4594  shiftRight1:
4595     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4596     zSig0 |= LIT64( 0x8000000000000000 );
4597     ++zExp;
4598  roundAndPack:
4599     return
4600         roundAndPackFloatx80(
4601             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4602
4603 }
4604
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| Subtracting two infinities raises the invalid exception and returns the
| default NaN.  An exactly zero difference is negative only when rounding
| down.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
{
    int32 aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32 expDiff;
    floatx80 z;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN( a, b STATUS_VAR );
        }
        /* Infinity minus infinity is invalid. */
        float_raise( float_flag_invalid STATUS_VAR);
        z.low = floatx80_default_nan_low;
        z.high = floatx80_default_nan_high;
        return z;
    }
    if ( aExp == 0 ) {
        /* An exponent field of zero is treated as an exponent of 1. */
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact zero: the sign depends only on the rounding mode. */
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
        /* Finite minus infinity: infinity with the opposite sign. */
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    if ( aExp == 0 ) ++expDiff;
    /* Align `a' to the exponent of `b'. */
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    /* Align `b' to the exponent of `a'. */
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    /* |a| > |b|: compute a - b, keeping the requested sign. */
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return
        normalizeRoundAndPackFloatx80(
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );

}
4672
4673 /*----------------------------------------------------------------------------
4674 | Returns the result of adding the extended double-precision floating-point
4675 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4676 | Standard for Binary Floating-Point Arithmetic.
4677 *----------------------------------------------------------------------------*/
4678
4679 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4680 {
4681     flag aSign, bSign;
4682
4683     aSign = extractFloatx80Sign( a );
4684     bSign = extractFloatx80Sign( b );
4685     if ( aSign == bSign ) {
4686         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4687     }
4688     else {
4689         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4690     }
4691
4692 }
4693
4694 /*----------------------------------------------------------------------------
4695 | Returns the result of subtracting the extended double-precision floating-
4696 | point values `a' and `b'.  The operation is performed according to the
4697 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4698 *----------------------------------------------------------------------------*/
4699
4700 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4701 {
4702     flag aSign, bSign;
4703
4704     aSign = extractFloatx80Sign( a );
4705     bSign = extractFloatx80Sign( b );
4706     if ( aSign == bSign ) {
4707         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4708     }
4709     else {
4710         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4711     }
4712
4713 }
4714
4715 /*----------------------------------------------------------------------------
4716 | Returns the result of multiplying the extended double-precision floating-
4717 | point values `a' and `b'.  The operation is performed according to the
4718 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4719 *----------------------------------------------------------------------------*/
4720
4721 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4722 {
4723     flag aSign, bSign, zSign;
4724     int32 aExp, bExp, zExp;
4725     uint64_t aSig, bSig, zSig0, zSig1;
4726     floatx80 z;
4727
4728     aSig = extractFloatx80Frac( a );
4729     aExp = extractFloatx80Exp( a );
4730     aSign = extractFloatx80Sign( a );
4731     bSig = extractFloatx80Frac( b );
4732     bExp = extractFloatx80Exp( b );
4733     bSign = extractFloatx80Sign( b );
4734     zSign = aSign ^ bSign;
4735     if ( aExp == 0x7FFF ) {
4736         if (    (uint64_t) ( aSig<<1 )
4737              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4738             return propagateFloatx80NaN( a, b STATUS_VAR );
4739         }
4740         if ( ( bExp | bSig ) == 0 ) goto invalid;
4741         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4742     }
4743     if ( bExp == 0x7FFF ) {
4744         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4745         if ( ( aExp | aSig ) == 0 ) {
4746  invalid:
4747             float_raise( float_flag_invalid STATUS_VAR);
4748             z.low = floatx80_default_nan_low;
4749             z.high = floatx80_default_nan_high;
4750             return z;
4751         }
4752         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4753     }
4754     if ( aExp == 0 ) {
4755         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4756         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4757     }
4758     if ( bExp == 0 ) {
4759         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4760         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4761     }
4762     zExp = aExp + bExp - 0x3FFE;
4763     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4764     if ( 0 < (int64_t) zSig0 ) {
4765         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4766         --zExp;
4767     }
4768     return
4769         roundAndPackFloatx80(
4770             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4771
4772 }
4773
4774 /*----------------------------------------------------------------------------
4775 | Returns the result of dividing the extended double-precision floating-point
4776 | value `a' by the corresponding value `b'.  The operation is performed
4777 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4778 *----------------------------------------------------------------------------*/
4779
4780 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4781 {
4782     flag aSign, bSign, zSign;
4783     int32 aExp, bExp, zExp;
4784     uint64_t aSig, bSig, zSig0, zSig1;
4785     uint64_t rem0, rem1, rem2, term0, term1, term2;
4786     floatx80 z;
4787
4788     aSig = extractFloatx80Frac( a );
4789     aExp = extractFloatx80Exp( a );
4790     aSign = extractFloatx80Sign( a );
4791     bSig = extractFloatx80Frac( b );
4792     bExp = extractFloatx80Exp( b );
4793     bSign = extractFloatx80Sign( b );
4794     zSign = aSign ^ bSign;
4795     if ( aExp == 0x7FFF ) {
4796         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4797         if ( bExp == 0x7FFF ) {
4798             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4799             goto invalid;
4800         }
4801         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4802     }
4803     if ( bExp == 0x7FFF ) {
4804         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4805         return packFloatx80( zSign, 0, 0 );
4806     }
4807     if ( bExp == 0 ) {
4808         if ( bSig == 0 ) {
4809             if ( ( aExp | aSig ) == 0 ) {
4810  invalid:
4811                 float_raise( float_flag_invalid STATUS_VAR);
4812                 z.low = floatx80_default_nan_low;
4813                 z.high = floatx80_default_nan_high;
4814                 return z;
4815             }
4816             float_raise( float_flag_divbyzero STATUS_VAR);
4817             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4818         }
4819         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4820     }
4821     if ( aExp == 0 ) {
4822         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4823         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4824     }
4825     zExp = aExp - bExp + 0x3FFE;
4826     rem1 = 0;
4827     if ( bSig <= aSig ) {
4828         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4829         ++zExp;
4830     }
4831     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4832     mul64To128( bSig, zSig0, &term0, &term1 );
4833     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4834     while ( (int64_t) rem0 < 0 ) {
4835         --zSig0;
4836         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4837     }
4838     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4839     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4840         mul64To128( bSig, zSig1, &term1, &term2 );
4841         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4842         while ( (int64_t) rem1 < 0 ) {
4843             --zSig1;
4844             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4845         }
4846         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4847     }
4848     return
4849         roundAndPackFloatx80(
4850             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4851
4852 }
4853
4854 /*----------------------------------------------------------------------------
4855 | Returns the remainder of the extended double-precision floating-point value
4856 | `a' with respect to the corresponding value `b'.  The operation is performed
4857 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4858 *----------------------------------------------------------------------------*/
4859
4860 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4861 {
4862     flag aSign, zSign;
4863     int32 aExp, bExp, expDiff;
4864     uint64_t aSig0, aSig1, bSig;
4865     uint64_t q, term0, term1, alternateASig0, alternateASig1;
4866     floatx80 z;
4867
4868     aSig0 = extractFloatx80Frac( a );
4869     aExp = extractFloatx80Exp( a );
4870     aSign = extractFloatx80Sign( a );
4871     bSig = extractFloatx80Frac( b );
4872     bExp = extractFloatx80Exp( b );
4873     if ( aExp == 0x7FFF ) {
4874         if (    (uint64_t) ( aSig0<<1 )
4875              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4876             return propagateFloatx80NaN( a, b STATUS_VAR );
4877         }
4878         goto invalid;
4879     }
4880     if ( bExp == 0x7FFF ) {
4881         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4882         return a;
4883     }
4884     if ( bExp == 0 ) {
4885         if ( bSig == 0 ) {
4886  invalid:
4887             float_raise( float_flag_invalid STATUS_VAR);
4888             z.low = floatx80_default_nan_low;
4889             z.high = floatx80_default_nan_high;
4890             return z;
4891         }
4892         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4893     }
4894     if ( aExp == 0 ) {
4895         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4896         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4897     }
4898     bSig |= LIT64( 0x8000000000000000 );
4899     zSign = aSign;
4900     expDiff = aExp - bExp;
4901     aSig1 = 0;
4902     if ( expDiff < 0 ) {
4903         if ( expDiff < -1 ) return a;
4904         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4905         expDiff = 0;
4906     }
4907     q = ( bSig <= aSig0 );
4908     if ( q ) aSig0 -= bSig;
4909     expDiff -= 64;
4910     while ( 0 < expDiff ) {
4911         q = estimateDiv128To64( aSig0, aSig1, bSig );
4912         q = ( 2 < q ) ? q - 2 : 0;
4913         mul64To128( bSig, q, &term0, &term1 );
4914         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4915         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4916         expDiff -= 62;
4917     }
4918     expDiff += 64;
4919     if ( 0 < expDiff ) {
4920         q = estimateDiv128To64( aSig0, aSig1, bSig );
4921         q = ( 2 < q ) ? q - 2 : 0;
4922         q >>= 64 - expDiff;
4923         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4924         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4925         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4926         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4927             ++q;
4928             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4929         }
4930     }
4931     else {
4932         term1 = 0;
4933         term0 = bSig;
4934     }
4935     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4936     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4937          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4938               && ( q & 1 ) )
4939        ) {
4940         aSig0 = alternateASig0;
4941         aSig1 = alternateASig1;
4942         zSign = ! zSign;
4943     }
4944     return
4945         normalizeRoundAndPackFloatx80(
4946             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4947
4948 }
4949
4950 /*----------------------------------------------------------------------------
4951 | Returns the square root of the extended double-precision floating-point
4952 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4953 | for Binary Floating-Point Arithmetic.
4954 *----------------------------------------------------------------------------*/
4955
4956 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4957 {
4958     flag aSign;
4959     int32 aExp, zExp;
4960     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4961     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4962     floatx80 z;
4963
4964     aSig0 = extractFloatx80Frac( a );
4965     aExp = extractFloatx80Exp( a );
4966     aSign = extractFloatx80Sign( a );
4967     if ( aExp == 0x7FFF ) {
4968         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4969         if ( ! aSign ) return a;
4970         goto invalid;
4971     }
4972     if ( aSign ) {
4973         if ( ( aExp | aSig0 ) == 0 ) return a;
4974  invalid:
4975         float_raise( float_flag_invalid STATUS_VAR);
4976         z.low = floatx80_default_nan_low;
4977         z.high = floatx80_default_nan_high;
4978         return z;
4979     }
4980     if ( aExp == 0 ) {
4981         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4982         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4983     }
4984     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4985     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4986     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4987     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4988     doubleZSig0 = zSig0<<1;
4989     mul64To128( zSig0, zSig0, &term0, &term1 );
4990     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4991     while ( (int64_t) rem0 < 0 ) {
4992         --zSig0;
4993         doubleZSig0 -= 2;
4994         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4995     }
4996     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4997     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4998         if ( zSig1 == 0 ) zSig1 = 1;
4999         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5000         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5001         mul64To128( zSig1, zSig1, &term2, &term3 );
5002         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5003         while ( (int64_t) rem1 < 0 ) {
5004             --zSig1;
5005             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5006             term3 |= 1;
5007             term2 |= doubleZSig0;
5008             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5009         }
5010         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5011     }
5012     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5013     zSig0 |= doubleZSig0;
5014     return
5015         roundAndPackFloatx80(
5016             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5017
5018 }
5019
5020 /*----------------------------------------------------------------------------
5021 | Returns 1 if the extended double-precision floating-point value `a' is equal
5022 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5023 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5024 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5025 *----------------------------------------------------------------------------*/
5026
5027 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5028 {
5029
5030     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5031               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5032          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5033               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5034        ) {
5035         float_raise( float_flag_invalid STATUS_VAR);
5036         return 0;
5037     }
5038     return
5039            ( a.low == b.low )
5040         && (    ( a.high == b.high )
5041              || (    ( a.low == 0 )
5042                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5043            );
5044
5045 }
5046
5047 /*----------------------------------------------------------------------------
5048 | Returns 1 if the extended double-precision floating-point value `a' is
5049 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5050 | invalid exception is raised if either operand is a NaN.  The comparison is
5051 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5052 | Arithmetic.
5053 *----------------------------------------------------------------------------*/
5054
5055 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5056 {
5057     flag aSign, bSign;
5058
5059     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5060               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5061          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5062               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5063        ) {
5064         float_raise( float_flag_invalid STATUS_VAR);
5065         return 0;
5066     }
5067     aSign = extractFloatx80Sign( a );
5068     bSign = extractFloatx80Sign( b );
5069     if ( aSign != bSign ) {
5070         return
5071                aSign
5072             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5073                  == 0 );
5074     }
5075     return
5076           aSign ? le128( b.high, b.low, a.high, a.low )
5077         : le128( a.high, a.low, b.high, b.low );
5078
5079 }
5080
5081 /*----------------------------------------------------------------------------
5082 | Returns 1 if the extended double-precision floating-point value `a' is
5083 | less than the corresponding value `b', and 0 otherwise.  The invalid
5084 | exception is raised if either operand is a NaN.  The comparison is performed
5085 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5086 *----------------------------------------------------------------------------*/
5087
5088 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5089 {
5090     flag aSign, bSign;
5091
5092     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5093               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5094          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5095               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5096        ) {
5097         float_raise( float_flag_invalid STATUS_VAR);
5098         return 0;
5099     }
5100     aSign = extractFloatx80Sign( a );
5101     bSign = extractFloatx80Sign( b );
5102     if ( aSign != bSign ) {
5103         return
5104                aSign
5105             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5106                  != 0 );
5107     }
5108     return
5109           aSign ? lt128( b.high, b.low, a.high, a.low )
5110         : lt128( a.high, a.low, b.high, b.low );
5111
5112 }
5113
5114 /*----------------------------------------------------------------------------
5115 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5116 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5117 | either operand is a NaN.   The comparison is performed according to the
5118 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5119 *----------------------------------------------------------------------------*/
5120 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5121 {
5122     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5123               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5124          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5125               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5126        ) {
5127         float_raise( float_flag_invalid STATUS_VAR);
5128         return 1;
5129     }
5130     return 0;
5131 }
5132
5133 /*----------------------------------------------------------------------------
5134 | Returns 1 if the extended double-precision floating-point value `a' is
5135 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5136 | cause an exception.  The comparison is performed according to the IEC/IEEE
5137 | Standard for Binary Floating-Point Arithmetic.
5138 *----------------------------------------------------------------------------*/
5139
5140 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5141 {
5142
5143     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5144               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5145          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5146               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5147        ) {
5148         if (    floatx80_is_signaling_nan( a )
5149              || floatx80_is_signaling_nan( b ) ) {
5150             float_raise( float_flag_invalid STATUS_VAR);
5151         }
5152         return 0;
5153     }
5154     return
5155            ( a.low == b.low )
5156         && (    ( a.high == b.high )
5157              || (    ( a.low == 0 )
5158                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5159            );
5160
5161 }
5162
5163 /*----------------------------------------------------------------------------
5164 | Returns 1 if the extended double-precision floating-point value `a' is less
5165 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5166 | do not cause an exception.  Otherwise, the comparison is performed according
5167 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5168 *----------------------------------------------------------------------------*/
5169
5170 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5171 {
5172     flag aSign, bSign;
5173
5174     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5175               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5176          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5177               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5178        ) {
5179         if (    floatx80_is_signaling_nan( a )
5180              || floatx80_is_signaling_nan( b ) ) {
5181             float_raise( float_flag_invalid STATUS_VAR);
5182         }
5183         return 0;
5184     }
5185     aSign = extractFloatx80Sign( a );
5186     bSign = extractFloatx80Sign( b );
5187     if ( aSign != bSign ) {
5188         return
5189                aSign
5190             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5191                  == 0 );
5192     }
5193     return
5194           aSign ? le128( b.high, b.low, a.high, a.low )
5195         : le128( a.high, a.low, b.high, b.low );
5196
5197 }
5198
5199 /*----------------------------------------------------------------------------
5200 | Returns 1 if the extended double-precision floating-point value `a' is less
5201 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5202 | an exception.  Otherwise, the comparison is performed according to the
5203 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5204 *----------------------------------------------------------------------------*/
5205
5206 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5207 {
5208     flag aSign, bSign;
5209
5210     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5211               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5212          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5213               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5214        ) {
5215         if (    floatx80_is_signaling_nan( a )
5216              || floatx80_is_signaling_nan( b ) ) {
5217             float_raise( float_flag_invalid STATUS_VAR);
5218         }
5219         return 0;
5220     }
5221     aSign = extractFloatx80Sign( a );
5222     bSign = extractFloatx80Sign( b );
5223     if ( aSign != bSign ) {
5224         return
5225                aSign
5226             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5227                  != 0 );
5228     }
5229     return
5230           aSign ? lt128( b.high, b.low, a.high, a.low )
5231         : lt128( a.high, a.low, b.high, b.low );
5232
5233 }
5234
5235 /*----------------------------------------------------------------------------
5236 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5237 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5238 | The comparison is performed according to the IEC/IEEE Standard for Binary
5239 | Floating-Point Arithmetic.
5240 *----------------------------------------------------------------------------*/
5241 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5242 {
5243     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5244               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5245          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5246               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5247        ) {
5248         if (    floatx80_is_signaling_nan( a )
5249              || floatx80_is_signaling_nan( b ) ) {
5250             float_raise( float_flag_invalid STATUS_VAR);
5251         }
5252         return 1;
5253     }
5254     return 0;
5255 }
5256
5257 /*----------------------------------------------------------------------------
5258 | Returns the result of converting the quadruple-precision floating-point
5259 | value `a' to the 32-bit two's complement integer format.  The conversion
5260 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5261 | Arithmetic---which means in particular that the conversion is rounded
5262 | according to the current rounding mode.  If `a' is a NaN, the largest
5263 | positive integer is returned.  Otherwise, if the conversion overflows, the
5264 | largest integer with the same sign as `a' is returned.
5265 *----------------------------------------------------------------------------*/
5266
5267 int32 float128_to_int32( float128 a STATUS_PARAM )
5268 {
5269     flag aSign;
5270     int32 aExp, shiftCount;
5271     uint64_t aSig0, aSig1;
5272
5273     aSig1 = extractFloat128Frac1( a );
5274     aSig0 = extractFloat128Frac0( a );
5275     aExp = extractFloat128Exp( a );
5276     aSign = extractFloat128Sign( a );
5277     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5278     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5279     aSig0 |= ( aSig1 != 0 );
5280     shiftCount = 0x4028 - aExp;
5281     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5282     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5283
5284 }
5285
5286 /*----------------------------------------------------------------------------
5287 | Returns the result of converting the quadruple-precision floating-point
5288 | value `a' to the 32-bit two's complement integer format.  The conversion
5289 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5290 | Arithmetic, except that the conversion is always rounded toward zero.  If
5291 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5292 | conversion overflows, the largest integer with the same sign as `a' is
5293 | returned.
5294 *----------------------------------------------------------------------------*/
5295
5296 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5297 {
5298     flag aSign;
5299     int32 aExp, shiftCount;
5300     uint64_t aSig0, aSig1, savedASig;
5301     int32_t z;
5302
5303     aSig1 = extractFloat128Frac1( a );
5304     aSig0 = extractFloat128Frac0( a );
5305     aExp = extractFloat128Exp( a );
5306     aSign = extractFloat128Sign( a );
5307     aSig0 |= ( aSig1 != 0 );
5308     if ( 0x401E < aExp ) {
5309         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5310         goto invalid;
5311     }
5312     else if ( aExp < 0x3FFF ) {
5313         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5314         return 0;
5315     }
5316     aSig0 |= LIT64( 0x0001000000000000 );
5317     shiftCount = 0x402F - aExp;
5318     savedASig = aSig0;
5319     aSig0 >>= shiftCount;
5320     z = aSig0;
5321     if ( aSign ) z = - z;
5322     if ( ( z < 0 ) ^ aSign ) {
5323  invalid:
5324         float_raise( float_flag_invalid STATUS_VAR);
5325         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5326     }
5327     if ( ( aSig0<<shiftCount ) != savedASig ) {
5328         STATUS(float_exception_flags) |= float_flag_inexact;
5329     }
5330     return z;
5331
5332 }
5333
5334 /*----------------------------------------------------------------------------
5335 | Returns the result of converting the quadruple-precision floating-point
5336 | value `a' to the 64-bit two's complement integer format.  The conversion
5337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5338 | Arithmetic---which means in particular that the conversion is rounded
5339 | according to the current rounding mode.  If `a' is a NaN, the largest
5340 | positive integer is returned.  Otherwise, if the conversion overflows, the
5341 | largest integer with the same sign as `a' is returned.
5342 *----------------------------------------------------------------------------*/
5343
5344 int64 float128_to_int64( float128 a STATUS_PARAM )
5345 {
5346     flag aSign;
5347     int32 aExp, shiftCount;
5348     uint64_t aSig0, aSig1;
5349
5350     aSig1 = extractFloat128Frac1( a );
5351     aSig0 = extractFloat128Frac0( a );
5352     aExp = extractFloat128Exp( a );
5353     aSign = extractFloat128Sign( a );
5354     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5355     shiftCount = 0x402F - aExp;
5356     if ( shiftCount <= 0 ) {
5357         if ( 0x403E < aExp ) {
5358             float_raise( float_flag_invalid STATUS_VAR);
5359             if (    ! aSign
5360                  || (    ( aExp == 0x7FFF )
5361                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5362                     )
5363                ) {
5364                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5365             }
5366             return (int64_t) LIT64( 0x8000000000000000 );
5367         }
5368         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5369     }
5370     else {
5371         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5372     }
5373     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5374
5375 }
5376
5377 /*----------------------------------------------------------------------------
5378 | Returns the result of converting the quadruple-precision floating-point
5379 | value `a' to the 64-bit two's complement integer format.  The conversion
5380 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5381 | Arithmetic, except that the conversion is always rounded toward zero.
5382 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5383 | the conversion overflows, the largest integer with the same sign as `a' is
5384 | returned.
5385 *----------------------------------------------------------------------------*/
5386
5387 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5388 {
5389     flag aSign;
5390     int32 aExp, shiftCount;
5391     uint64_t aSig0, aSig1;
5392     int64 z;
5393
5394     aSig1 = extractFloat128Frac1( a );
5395     aSig0 = extractFloat128Frac0( a );
5396     aExp = extractFloat128Exp( a );
5397     aSign = extractFloat128Sign( a );
5398     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5399     shiftCount = aExp - 0x402F;
5400     if ( 0 < shiftCount ) {
5401         if ( 0x403E <= aExp ) {
5402             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5403             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5404                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5405                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5406             }
5407             else {
5408                 float_raise( float_flag_invalid STATUS_VAR);
5409                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5410                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5411                 }
5412             }
5413             return (int64_t) LIT64( 0x8000000000000000 );
5414         }
5415         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5416         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5417             STATUS(float_exception_flags) |= float_flag_inexact;
5418         }
5419     }
5420     else {
5421         if ( aExp < 0x3FFF ) {
5422             if ( aExp | aSig0 | aSig1 ) {
5423                 STATUS(float_exception_flags) |= float_flag_inexact;
5424             }
5425             return 0;
5426         }
5427         z = aSig0>>( - shiftCount );
5428         if (    aSig1
5429              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5430             STATUS(float_exception_flags) |= float_flag_inexact;
5431         }
5432     }
5433     if ( aSign ) z = - z;
5434     return z;
5435
5436 }
5437
5438 /*----------------------------------------------------------------------------
5439 | Returns the result of converting the quadruple-precision floating-point
5440 | value `a' to the single-precision floating-point format.  The conversion
5441 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5442 | Arithmetic.
5443 *----------------------------------------------------------------------------*/
5444
5445 float32 float128_to_float32( float128 a STATUS_PARAM )
5446 {
5447     flag aSign;
5448     int32 aExp;
5449     uint64_t aSig0, aSig1;
5450     uint32_t zSig;
5451
5452     aSig1 = extractFloat128Frac1( a );
5453     aSig0 = extractFloat128Frac0( a );
5454     aExp = extractFloat128Exp( a );
5455     aSign = extractFloat128Sign( a );
5456     if ( aExp == 0x7FFF ) {
5457         if ( aSig0 | aSig1 ) {
5458             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5459         }
5460         return packFloat32( aSign, 0xFF, 0 );
5461     }
5462     aSig0 |= ( aSig1 != 0 );
5463     shift64RightJamming( aSig0, 18, &aSig0 );
5464     zSig = aSig0;
5465     if ( aExp || zSig ) {
5466         zSig |= 0x40000000;
5467         aExp -= 0x3F81;
5468     }
5469     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5470
5471 }
5472
5473 /*----------------------------------------------------------------------------
5474 | Returns the result of converting the quadruple-precision floating-point
5475 | value `a' to the double-precision floating-point format.  The conversion
5476 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5477 | Arithmetic.
5478 *----------------------------------------------------------------------------*/
5479
5480 float64 float128_to_float64( float128 a STATUS_PARAM )
5481 {
5482     flag aSign;
5483     int32 aExp;
5484     uint64_t aSig0, aSig1;
5485
5486     aSig1 = extractFloat128Frac1( a );
5487     aSig0 = extractFloat128Frac0( a );
5488     aExp = extractFloat128Exp( a );
5489     aSign = extractFloat128Sign( a );
5490     if ( aExp == 0x7FFF ) {
5491         if ( aSig0 | aSig1 ) {
5492             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5493         }
5494         return packFloat64( aSign, 0x7FF, 0 );
5495     }
5496     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5497     aSig0 |= ( aSig1 != 0 );
5498     if ( aExp || aSig0 ) {
5499         aSig0 |= LIT64( 0x4000000000000000 );
5500         aExp -= 0x3C01;
5501     }
5502     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5503
5504 }
5505
5506 /*----------------------------------------------------------------------------
5507 | Returns the result of converting the quadruple-precision floating-point
5508 | value `a' to the extended double-precision floating-point format.  The
5509 | conversion is performed according to the IEC/IEEE Standard for Binary
5510 | Floating-Point Arithmetic.
5511 *----------------------------------------------------------------------------*/
5512
5513 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5514 {
5515     flag aSign;
5516     int32 aExp;
5517     uint64_t aSig0, aSig1;
5518
5519     aSig1 = extractFloat128Frac1( a );
5520     aSig0 = extractFloat128Frac0( a );
5521     aExp = extractFloat128Exp( a );
5522     aSign = extractFloat128Sign( a );
5523     if ( aExp == 0x7FFF ) {
5524         if ( aSig0 | aSig1 ) {
5525             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5526         }
5527         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5528     }
5529     if ( aExp == 0 ) {
5530         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5531         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5532     }
5533     else {
5534         aSig0 |= LIT64( 0x0001000000000000 );
5535     }
5536     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5537     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5538
5539 }
5540
/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| The rounding direction is read from STATUS(float_rounding_mode).  NaN
| inputs are handed to propagateFloat128NaN(); any other input whose biased
| exponent is 0x406F or more is returned unchanged.  For the remaining
| inputs the inexact flag is set whenever the result differs from `a'.
*----------------------------------------------------------------------------*/

float128 float128_round_to_int( float128 a STATUS_PARAM )
{
    flag aSign;
    int32 aExp;
    uint64_t lastBitMask, roundBitsMask;
    int8 roundingMode;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* The bits to be discarded all live in the low word; the high word
           is only changed by a carry out of the low word. */
        if ( 0x406F <= aExp ) {
            /* No rounding is performed for exponents this large. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN( a, a STATUS_VAR );
            }
            return a;
        }
        /* The shift is split in two so that aExp == 0x402F, which needs a
           total shift of 64, leaves lastBitMask == 0 instead of shifting a
           64-bit value by its full width. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        roundingMode = STATUS(float_rounding_mode);
        if ( roundingMode == float_round_nearest_even ) {
            if ( lastBitMask ) {
                /* Add half of the last retained place; if the discarded
                   bits are then all zero it was a tie, so clear the last
                   retained bit to make the result even. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* lastBitMask == 0: the last retained bit is bit 0 of
                   z.high and the top bit of z.low is the half bit. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    /* Exact tie (no bits below the half bit): force even. */
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
        }
        else if ( roundingMode != float_round_to_zero ) {
            /* Increase the magnitude when exactly one of "value is
               negative" and "mode is float_round_up" holds. */
            if (   extractFloat128Sign( z )
                 ^ ( roundingMode == float_round_up ) ) {
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
            }
        }
        /* Drop the discarded bits. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Exponent below the bias: the result is a signed zero or a
               value packed with exponent 0x3FFF and zero fraction. */
            /* All bits other than the sign are zero: return `a' as is. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            STATUS(float_exception_flags) |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            switch ( STATUS(float_rounding_mode) ) {
             case float_round_nearest_even:
                /* Exponent 0x3FFE with a non-zero fraction rounds away from
                   zero; exponent 0x3FFE with a zero fraction (the tie) and
                   all smaller exponents fall out to the signed zero. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            /* Reached for float_round_to_zero, for the nearest-even cases
               that did not return above, and for any mode not listed. */
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* The last retained bit lies in the high word; the whole low word
           is discarded. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        roundingMode = STATUS(float_rounding_mode);
        if ( roundingMode == float_round_nearest_even ) {
            z.high += lastBitMask>>1;
            /* Tie only if the discarded high-word bits and all of a.low
               are zero after adding the half; then force even. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
        }
        else if ( roundingMode != float_round_to_zero ) {
            if (   extractFloat128Sign( z )
                 ^ ( roundingMode == float_round_up ) ) {
                /* Fold a non-zero low word into bit 0 so that it takes part
                   in the increment below. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change to the bit pattern means bits were discarded. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return z;

}
5643
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| The operand with the smaller exponent is shifted right (with jamming into
| an extra word, zSig2) to align it with the other; the sum is then rounded
| and packed by roundAndPackFloat128().
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
{
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32 expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
            return a;
        }
        if ( bExp == 0 ) {
            /* `b' has no implicit bit; align it one place less. */
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent. */
        if ( bExp == 0x7FFF ) {
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            /* `a' has no implicit bit; align it one place less. */
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN( a, b STATUS_VAR );
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both exponent fields are zero.  With flush_to_zero set the
               result is a signed zero (raising output_denormal if the sum
               is non-zero); otherwise the raw sum is packed with exponent
               field 0. */
            if (STATUS(flush_to_zero)) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal STATUS_VAR);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* The two implicit bits (bit 48 each) add up to bit 49. */
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /* Only the larger operand's implicit bit is still missing.  It is OR-ed
       into aSig0 in both cases: after the alignment above bit 48 of aSig0
       is clear, and the sum does not depend on which addend carries it. */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    /* If the sum did not carry into bit 49 it is rounded as is with exponent
       zExp - 1; otherwise zExp is restored and the sum is shifted right by
       one bit first. */
    --zExp;
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

}
5729
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| Both significands are first shifted left by 14 bits, which puts the
| implicit bit at 0x4000000000000000 in the high word.  The smaller operand
| is aligned with a jamming right shift, the smaller magnitude is subtracted
| from the larger, and the sign is flipped when `b' is the larger one.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
{
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32 expDiff;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here to the bExpBigger label. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        /* Both operands are infinite: invalid, default NaN. */
        float_raise( float_flag_invalid STATUS_VAR);
        z.low = float128_default_nan_low;
        z.high = float128_default_nan_high;
        return z;
    }
    if ( aExp == 0 ) {
        /* Both exponent fields are zero; use 1 as the working exponent. */
        aExp = 1;
        bExp = 1;
    }
    /* Compare the fractions word by word to find the larger magnitude.  The
       implicit bits are not set on this path; with equal exponents they
       would cancel in the difference. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: zero, negative only when rounding down. */
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        /* `b' is infinite: the result is an infinity of the opposite sign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* `a' has no implicit bit; align it one place less. */
        ++expDiff;
    }
    else {
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        return a;
    }
    if ( bExp == 0 ) {
        /* `b' has no implicit bit; align it one place less. */
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    /* The exponent passed on is reduced by one and by a further 14, the
       latter matching the 14-bit pre-shift of the significands. */
    --zExp;
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );

}
5813
5814 /*----------------------------------------------------------------------------
5815 | Returns the result of adding the quadruple-precision floating-point values
5816 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5817 | for Binary Floating-Point Arithmetic.
5818 *----------------------------------------------------------------------------*/
5819
5820 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5821 {
5822     flag aSign, bSign;
5823
5824     aSign = extractFloat128Sign( a );
5825     bSign = extractFloat128Sign( b );
5826     if ( aSign == bSign ) {
5827         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5828     }
5829     else {
5830         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5831     }
5832
5833 }
5834
5835 /*----------------------------------------------------------------------------
5836 | Returns the result of subtracting the quadruple-precision floating-point
5837 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5838 | Standard for Binary Floating-Point Arithmetic.
5839 *----------------------------------------------------------------------------*/
5840
5841 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5842 {
5843     flag aSign, bSign;
5844
5845     aSign = extractFloat128Sign( a );
5846     bSign = extractFloat128Sign( b );
5847     if ( aSign == bSign ) {
5848         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5849     }
5850     else {
5851         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5852     }
5853
5854 }
5855
5856 /*----------------------------------------------------------------------------
5857 | Returns the result of multiplying the quadruple-precision floating-point
5858 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5859 | Standard for Binary Floating-Point Arithmetic.
5860 *----------------------------------------------------------------------------*/
5861
5862 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5863 {
5864     flag aSign, bSign, zSign;
5865     int32 aExp, bExp, zExp;
5866     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5867     float128 z;
5868
5869     aSig1 = extractFloat128Frac1( a );
5870     aSig0 = extractFloat128Frac0( a );
5871     aExp = extractFloat128Exp( a );
5872     aSign = extractFloat128Sign( a );
5873     bSig1 = extractFloat128Frac1( b );
5874     bSig0 = extractFloat128Frac0( b );
5875     bExp = extractFloat128Exp( b );
5876     bSign = extractFloat128Sign( b );
5877     zSign = aSign ^ bSign;
5878     if ( aExp == 0x7FFF ) {
5879         if (    ( aSig0 | aSig1 )
5880              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5881             return propagateFloat128NaN( a, b STATUS_VAR );
5882         }
5883         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5884         return packFloat128( zSign, 0x7FFF, 0, 0 );
5885     }
5886     if ( bExp == 0x7FFF ) {
5887         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5888         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5889  invalid:
5890             float_raise( float_flag_invalid STATUS_VAR);
5891             z.low = float128_default_nan_low;
5892             z.high = float128_default_nan_high;
5893             return z;
5894         }
5895         return packFloat128( zSign, 0x7FFF, 0, 0 );
5896     }
5897     if ( aExp == 0 ) {
5898         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5899         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5900     }
5901     if ( bExp == 0 ) {
5902         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5903         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5904     }
5905     zExp = aExp + bExp - 0x4000;
5906     aSig0 |= LIT64( 0x0001000000000000 );
5907     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5908     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5909     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5910     zSig2 |= ( zSig3 != 0 );
5911     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5912         shift128ExtraRightJamming(
5913             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5914         ++zExp;
5915     }
5916     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5917
5918 }
5919
/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the division proper: NaN operands are
| propagated; infinity / infinity and zero / zero raise the invalid exception
| and return the default NaN; a non-zero finite value divided by zero raises
| divbyzero and returns a signed infinity; infinity / finite is a signed
| infinity; finite / infinity and zero / non-zero are a signed zero.
*----------------------------------------------------------------------------*/

float128 float128_div( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, bSign, zSign;
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        if ( bExp == 0x7FFF ) {
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
            /* infinity / infinity */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Divisor is zero. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                /* zero / zero; also the target for infinity / infinity. */
 invalid:
                float_raise( float_flag_invalid STATUS_VAR);
                z.low = float128_default_nan_low;
                z.high = float128_default_nan_high;
                return z;
            }
            float_raise( float_flag_divbyzero STATUS_VAR);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Set the implicit bits and move both significands up by 15 bits, which
       puts the implicit bit in the top bit of the high word. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* Make the dividend significand strictly smaller than the divisor's,
       compensating in the exponent. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* Upper 64 quotient bits: estimate, form the remainder, and step the
       estimate down until the remainder is no longer negative.
       NOTE(review): the loop only ever decrements, so the estimate is
       presumably never too small -- confirm against estimateDiv128To64(). */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Lower 64 quotient bits.  The exact remainder is only computed when
       the low 14 bits of the estimate are 4 or less.
       NOTE(review): presumably the estimate's error bound makes the other
       cases safe for rounding -- confirm. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Record a non-zero remainder as a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Undo the 15-bit pre-shift; bits shifted out are collected in zSig2. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

}
6003
/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled up front: NaN operands are propagated; an infinite
| `a' or a zero `b' raises the invalid exception and returns the default NaN;
| an infinite `b' or a zero `a' returns `a' unchanged, as does any `a' whose
| exponent is more than one below that of `b'.
*----------------------------------------------------------------------------*/

float128 float128_rem( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, zSign;
    int32 aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        /* `a' is infinite and `b' is not a NaN. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* `b' is zero; also the target for an infinite `a'. */
 invalid:
            float_raise( float_flag_invalid STATUS_VAR);
            z.low = float128_default_nan_low;
            z.high = float128_default_nan_high;
            return z;
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    if ( expDiff < -1 ) return a;
    /* Set the implicit bits and pre-shift both significands by 15 bits; `a'
       is shifted one bit less when expDiff is -1. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit: subtract `b' once if it fits. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Each pass lowers expDiff by 61.  The quotient estimate is reduced by
       4 (clamped at zero) before use.
       NOTE(review): presumably so that it never exceeds the true quotient
       -- confirm against estimateDiv128To64(). */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: only the top ( 64 + expDiff ) bits of the
           estimate are used. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Keep subtracting `b' (counting in q) until the remainder turns
       negative; alternateASig holds the last value before that
       subtraction. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* Choose between the two candidates by the sign of their sum.  The
       earlier one is taken when the sum is negative, or when the sum is
       zero (a tie) and q is odd. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* A negative remainder is negated and flips the sign of the result. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );

}
6112
/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Special cases handled up front: NaNs are propagated; positive infinity and
| negative zero are returned unchanged; positive zero returns positive zero;
| any other negative input, including negative infinity, raises the invalid
| exception and returns the default NaN.
*----------------------------------------------------------------------------*/

float128 float128_sqrt( float128 a STATUS_PARAM )
{
    flag aSign;
    int32 aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
        if ( ! aSign ) return a;
        /* Negative infinity. */
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned as is; other negatives are invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise( float_flag_invalid STATUS_VAR);
        z.low = float128_default_nan_low;
        z.high = float128_default_nan_high;
        return z;
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the unbiased exponent and re-bias for the result. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* Initial 32-bit estimate from the top bits of the significand. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* The pre-shift is one bit smaller when the exponent field is odd. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    /* Refine the estimate to 64 bits. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = a - zSig0^2; step zSig0 down until the remainder is no longer
       negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Lower 64 result bits.  The exact remainder is only computed when the
       low 13 bits of the estimate are 5 or less.
       NOTE(review): presumably the estimate's error bound makes the other
       cases safe for rounding -- confirm. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Record a non-zero remainder as a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the result into place; bits shifted out are collected in
       zSig2. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

}
6181
6182 /*----------------------------------------------------------------------------
6183 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6184 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6185 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6186 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6187 *----------------------------------------------------------------------------*/
6188
6189 int float128_eq( float128 a, float128 b STATUS_PARAM )
6190 {
6191
6192     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6193               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6194          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6195               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6196        ) {
6197         float_raise( float_flag_invalid STATUS_VAR);
6198         return 0;
6199     }
6200     return
6201            ( a.low == b.low )
6202         && (    ( a.high == b.high )
6203              || (    ( a.low == 0 )
6204                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6205            );
6206
6207 }
6208
6209 /*----------------------------------------------------------------------------
6210 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6211 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6212 | exception is raised if either operand is a NaN.  The comparison is performed
6213 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6214 *----------------------------------------------------------------------------*/
6215
6216 int float128_le( float128 a, float128 b STATUS_PARAM )
6217 {
6218     flag aSign, bSign;
6219
6220     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6221               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6222          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6223               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6224        ) {
6225         float_raise( float_flag_invalid STATUS_VAR);
6226         return 0;
6227     }
6228     aSign = extractFloat128Sign( a );
6229     bSign = extractFloat128Sign( b );
6230     if ( aSign != bSign ) {
6231         return
6232                aSign
6233             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6234                  == 0 );
6235     }
6236     return
6237           aSign ? le128( b.high, b.low, a.high, a.low )
6238         : le128( a.high, a.low, b.high, b.low );
6239
6240 }
6241
6242 /*----------------------------------------------------------------------------
6243 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6244 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6245 | raised if either operand is a NaN.  The comparison is performed according
6246 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6247 *----------------------------------------------------------------------------*/
6248
6249 int float128_lt( float128 a, float128 b STATUS_PARAM )
6250 {
6251     flag aSign, bSign;
6252
6253     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6254               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6255          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6256               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6257        ) {
6258         float_raise( float_flag_invalid STATUS_VAR);
6259         return 0;
6260     }
6261     aSign = extractFloat128Sign( a );
6262     bSign = extractFloat128Sign( b );
6263     if ( aSign != bSign ) {
6264         return
6265                aSign
6266             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6267                  != 0 );
6268     }
6269     return
6270           aSign ? lt128( b.high, b.low, a.high, a.low )
6271         : lt128( a.high, a.low, b.high, b.low );
6272
6273 }
6274
6275 /*----------------------------------------------------------------------------
6276 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6277 | be compared, and 0 otherwise.  The invalid exception is raised if either
6278 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6279 | Standard for Binary Floating-Point Arithmetic.
6280 *----------------------------------------------------------------------------*/
6281
6282 int float128_unordered( float128 a, float128 b STATUS_PARAM )
6283 {
6284     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6285               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6286          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6287               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6288        ) {
6289         float_raise( float_flag_invalid STATUS_VAR);
6290         return 1;
6291     }
6292     return 0;
6293 }
6294
6295 /*----------------------------------------------------------------------------
6296 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6297 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6298 | exception.  The comparison is performed according to the IEC/IEEE Standard
6299 | for Binary Floating-Point Arithmetic.
6300 *----------------------------------------------------------------------------*/
6301
6302 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6303 {
6304
6305     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6306               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6307          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6308               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6309        ) {
6310         if (    float128_is_signaling_nan( a )
6311              || float128_is_signaling_nan( b ) ) {
6312             float_raise( float_flag_invalid STATUS_VAR);
6313         }
6314         return 0;
6315     }
6316     return
6317            ( a.low == b.low )
6318         && (    ( a.high == b.high )
6319              || (    ( a.low == 0 )
6320                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6321            );
6322
6323 }
6324
6325 /*----------------------------------------------------------------------------
6326 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6327 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6328 | cause an exception.  Otherwise, the comparison is performed according to the
6329 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6330 *----------------------------------------------------------------------------*/
6331
6332 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6333 {
6334     flag aSign, bSign;
6335
6336     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6337               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6338          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6339               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6340        ) {
6341         if (    float128_is_signaling_nan( a )
6342              || float128_is_signaling_nan( b ) ) {
6343             float_raise( float_flag_invalid STATUS_VAR);
6344         }
6345         return 0;
6346     }
6347     aSign = extractFloat128Sign( a );
6348     bSign = extractFloat128Sign( b );
6349     if ( aSign != bSign ) {
6350         return
6351                aSign
6352             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6353                  == 0 );
6354     }
6355     return
6356           aSign ? le128( b.high, b.low, a.high, a.low )
6357         : le128( a.high, a.low, b.high, b.low );
6358
6359 }
6360
6361 /*----------------------------------------------------------------------------
6362 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6363 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6364 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6365 | Standard for Binary Floating-Point Arithmetic.
6366 *----------------------------------------------------------------------------*/
6367
6368 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6369 {
6370     flag aSign, bSign;
6371
6372     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6373               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6374          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6375               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6376        ) {
6377         if (    float128_is_signaling_nan( a )
6378              || float128_is_signaling_nan( b ) ) {
6379             float_raise( float_flag_invalid STATUS_VAR);
6380         }
6381         return 0;
6382     }
6383     aSign = extractFloat128Sign( a );
6384     bSign = extractFloat128Sign( b );
6385     if ( aSign != bSign ) {
6386         return
6387                aSign
6388             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6389                  != 0 );
6390     }
6391     return
6392           aSign ? lt128( b.high, b.low, a.high, a.low )
6393         : lt128( a.high, a.low, b.high, b.low );
6394
6395 }
6396
6397 /*----------------------------------------------------------------------------
6398 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6399 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6400 | comparison is performed according to the IEC/IEEE Standard for Binary
6401 | Floating-Point Arithmetic.
6402 *----------------------------------------------------------------------------*/
6403
6404 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6405 {
6406     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6407               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6408          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6409               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6410        ) {
6411         if (    float128_is_signaling_nan( a )
6412              || float128_is_signaling_nan( b ) ) {
6413             float_raise( float_flag_invalid STATUS_VAR);
6414         }
6415         return 1;
6416     }
6417     return 0;
6418 }
6419
6420 /* misc functions */
6421 float32 uint32_to_float32( uint32 a STATUS_PARAM )
6422 {
6423     return int64_to_float32(a STATUS_VAR);
6424 }
6425
6426 float64 uint32_to_float64( uint32 a STATUS_PARAM )
6427 {
6428     return int64_to_float64(a STATUS_VAR);
6429 }
6430
6431 uint32 float32_to_uint32( float32 a STATUS_PARAM )
6432 {
6433     int64_t v;
6434     uint32 res;
6435
6436     v = float32_to_int64(a STATUS_VAR);
6437     if (v < 0) {
6438         res = 0;
6439         float_raise( float_flag_invalid STATUS_VAR);
6440     } else if (v > 0xffffffff) {
6441         res = 0xffffffff;
6442         float_raise( float_flag_invalid STATUS_VAR);
6443     } else {
6444         res = v;
6445     }
6446     return res;
6447 }
6448
6449 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6450 {
6451     int64_t v;
6452     uint32 res;
6453
6454     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6455     if (v < 0) {
6456         res = 0;
6457         float_raise( float_flag_invalid STATUS_VAR);
6458     } else if (v > 0xffffffff) {
6459         res = 0xffffffff;
6460         float_raise( float_flag_invalid STATUS_VAR);
6461     } else {
6462         res = v;
6463     }
6464     return res;
6465 }
6466
6467 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6468 {
6469     int64_t v;
6470     uint_fast16_t res;
6471
6472     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6473     if (v < 0) {
6474         res = 0;
6475         float_raise( float_flag_invalid STATUS_VAR);
6476     } else if (v > 0xffff) {
6477         res = 0xffff;
6478         float_raise( float_flag_invalid STATUS_VAR);
6479     } else {
6480         res = v;
6481     }
6482     return res;
6483 }
6484
6485 uint32 float64_to_uint32( float64 a STATUS_PARAM )
6486 {
6487     int64_t v;
6488     uint32 res;
6489
6490     v = float64_to_int64(a STATUS_VAR);
6491     if (v < 0) {
6492         res = 0;
6493         float_raise( float_flag_invalid STATUS_VAR);
6494     } else if (v > 0xffffffff) {
6495         res = 0xffffffff;
6496         float_raise( float_flag_invalid STATUS_VAR);
6497     } else {
6498         res = v;
6499     }
6500     return res;
6501 }
6502
6503 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6504 {
6505     int64_t v;
6506     uint32 res;
6507
6508     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6509     if (v < 0) {
6510         res = 0;
6511         float_raise( float_flag_invalid STATUS_VAR);
6512     } else if (v > 0xffffffff) {
6513         res = 0xffffffff;
6514         float_raise( float_flag_invalid STATUS_VAR);
6515     } else {
6516         res = v;
6517     }
6518     return res;
6519 }
6520
6521 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6522 {
6523     int64_t v;
6524     uint_fast16_t res;
6525
6526     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6527     if (v < 0) {
6528         res = 0;
6529         float_raise( float_flag_invalid STATUS_VAR);
6530     } else if (v > 0xffff) {
6531         res = 0xffff;
6532         float_raise( float_flag_invalid STATUS_VAR);
6533     } else {
6534         res = v;
6535     }
6536     return res;
6537 }
6538
6539 /* FIXME: This looks broken.  */
6540 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6541 {
6542     int64_t v;
6543
6544     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6545     v += float64_val(a);
6546     v = float64_to_int64(make_float64(v) STATUS_VAR);
6547
6548     return v - INT64_MIN;
6549 }
6550
6551 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6552 {
6553     int64_t v;
6554
6555     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6556     v += float64_val(a);
6557     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6558
6559     return v - INT64_MIN;
6560 }
6561
6562 #define COMPARE(s, nan_exp)                                                  \
6563 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6564                                       int is_quiet STATUS_PARAM )            \
6565 {                                                                            \
6566     flag aSign, bSign;                                                       \
6567     uint ## s ## _t av, bv;                                                  \
6568     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6569     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6570                                                                              \
6571     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6572          extractFloat ## s ## Frac( a ) ) ||                                 \
6573         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6574           extractFloat ## s ## Frac( b ) )) {                                \
6575         if (!is_quiet ||                                                     \
6576             float ## s ## _is_signaling_nan( a ) ||                          \
6577             float ## s ## _is_signaling_nan( b ) ) {                         \
6578             float_raise( float_flag_invalid STATUS_VAR);                     \
6579         }                                                                    \
6580         return float_relation_unordered;                                     \
6581     }                                                                        \
6582     aSign = extractFloat ## s ## Sign( a );                                  \
6583     bSign = extractFloat ## s ## Sign( b );                                  \
6584     av = float ## s ## _val(a);                                              \
6585     bv = float ## s ## _val(b);                                              \
6586     if ( aSign != bSign ) {                                                  \
6587         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6588             /* zero case */                                                  \
6589             return float_relation_equal;                                     \
6590         } else {                                                             \
6591             return 1 - (2 * aSign);                                          \
6592         }                                                                    \
6593     } else {                                                                 \
6594         if (av == bv) {                                                      \
6595             return float_relation_equal;                                     \
6596         } else {                                                             \
6597             return 1 - 2 * (aSign ^ ( av < bv ));                            \
6598         }                                                                    \
6599     }                                                                        \
6600 }                                                                            \
6601                                                                              \
6602 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6603 {                                                                            \
6604     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6605 }                                                                            \
6606                                                                              \
6607 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6608 {                                                                            \
6609     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6610 }
6611
6612 COMPARE(32, 0xff)
6613 COMPARE(64, 0x7ff)
6614
6615 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6616                                       int is_quiet STATUS_PARAM )
6617 {
6618     flag aSign, bSign;
6619
6620     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6621           ( extractFloatx80Frac( a )<<1 ) ) ||
6622         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6623           ( extractFloatx80Frac( b )<<1 ) )) {
6624         if (!is_quiet ||
6625             floatx80_is_signaling_nan( a ) ||
6626             floatx80_is_signaling_nan( b ) ) {
6627             float_raise( float_flag_invalid STATUS_VAR);
6628         }
6629         return float_relation_unordered;
6630     }
6631     aSign = extractFloatx80Sign( a );
6632     bSign = extractFloatx80Sign( b );
6633     if ( aSign != bSign ) {
6634
6635         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6636              ( ( a.low | b.low ) == 0 ) ) {
6637             /* zero case */
6638             return float_relation_equal;
6639         } else {
6640             return 1 - (2 * aSign);
6641         }
6642     } else {
6643         if (a.low == b.low && a.high == b.high) {
6644             return float_relation_equal;
6645         } else {
6646             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6647         }
6648     }
6649 }
6650
6651 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6652 {
6653     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6654 }
6655
6656 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6657 {
6658     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6659 }
6660
6661 INLINE int float128_compare_internal( float128 a, float128 b,
6662                                       int is_quiet STATUS_PARAM )
6663 {
6664     flag aSign, bSign;
6665
6666     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6667           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6668         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6669           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6670         if (!is_quiet ||
6671             float128_is_signaling_nan( a ) ||
6672             float128_is_signaling_nan( b ) ) {
6673             float_raise( float_flag_invalid STATUS_VAR);
6674         }
6675         return float_relation_unordered;
6676     }
6677     aSign = extractFloat128Sign( a );
6678     bSign = extractFloat128Sign( b );
6679     if ( aSign != bSign ) {
6680         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6681             /* zero case */
6682             return float_relation_equal;
6683         } else {
6684             return 1 - (2 * aSign);
6685         }
6686     } else {
6687         if (a.low == b.low && a.high == b.high) {
6688             return float_relation_equal;
6689         } else {
6690             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6691         }
6692     }
6693 }
6694
6695 int float128_compare( float128 a, float128 b STATUS_PARAM )
6696 {
6697     return float128_compare_internal(a, b, 0 STATUS_VAR);
6698 }
6699
6700 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6701 {
6702     return float128_compare_internal(a, b, 1 STATUS_VAR);
6703 }
6704
6705 /* min() and max() functions. These can't be implemented as
6706  * 'compare and pick one input' because that would mishandle
6707  * NaNs and +0 vs -0.
6708  */
6709 #define MINMAX(s, nan_exp)                                              \
6710 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6711                                         int ismin STATUS_PARAM )        \
6712 {                                                                       \
6713     flag aSign, bSign;                                                  \
6714     uint ## s ## _t av, bv;                                             \
6715     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6716     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6717     if (float ## s ## _is_any_nan(a) ||                                 \
6718         float ## s ## _is_any_nan(b)) {                                 \
6719         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6720     }                                                                   \
6721     aSign = extractFloat ## s ## Sign(a);                               \
6722     bSign = extractFloat ## s ## Sign(b);                               \
6723     av = float ## s ## _val(a);                                         \
6724     bv = float ## s ## _val(b);                                         \
6725     if (aSign != bSign) {                                               \
6726         if (ismin) {                                                    \
6727             return aSign ? a : b;                                       \
6728         } else {                                                        \
6729             return aSign ? b : a;                                       \
6730         }                                                               \
6731     } else {                                                            \
6732         if (ismin) {                                                    \
6733             return (aSign ^ (av < bv)) ? a : b;                         \
6734         } else {                                                        \
6735             return (aSign ^ (av < bv)) ? b : a;                         \
6736         }                                                               \
6737     }                                                                   \
6738 }                                                                       \
6739                                                                         \
6740 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6741 {                                                                       \
6742     return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6743 }                                                                       \
6744                                                                         \
6745 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6746 {                                                                       \
6747     return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6748 }
6749
6750 MINMAX(32, 0xff)
6751 MINMAX(64, 0x7ff)
6752
6753
6754 /* Multiply A by 2 raised to the power N.  */
6755 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6756 {
6757     flag aSign;
6758     int16_t aExp;
6759     uint32_t aSig;
6760
6761     a = float32_squash_input_denormal(a STATUS_VAR);
6762     aSig = extractFloat32Frac( a );
6763     aExp = extractFloat32Exp( a );
6764     aSign = extractFloat32Sign( a );
6765
6766     if ( aExp == 0xFF ) {
6767         if ( aSig ) {
6768             return propagateFloat32NaN( a, a STATUS_VAR );
6769         }
6770         return a;
6771     }
6772     if ( aExp != 0 )
6773         aSig |= 0x00800000;
6774     else if ( aSig == 0 )
6775         return a;
6776
6777     if (n > 0x200) {
6778         n = 0x200;
6779     } else if (n < -0x200) {
6780         n = -0x200;
6781     }
6782
6783     aExp += n - 1;
6784     aSig <<= 7;
6785     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6786 }
6787
6788 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6789 {
6790     flag aSign;
6791     int16_t aExp;
6792     uint64_t aSig;
6793
6794     a = float64_squash_input_denormal(a STATUS_VAR);
6795     aSig = extractFloat64Frac( a );
6796     aExp = extractFloat64Exp( a );
6797     aSign = extractFloat64Sign( a );
6798
6799     if ( aExp == 0x7FF ) {
6800         if ( aSig ) {
6801             return propagateFloat64NaN( a, a STATUS_VAR );
6802         }
6803         return a;
6804     }
6805     if ( aExp != 0 )
6806         aSig |= LIT64( 0x0010000000000000 );
6807     else if ( aSig == 0 )
6808         return a;
6809
6810     if (n > 0x1000) {
6811         n = 0x1000;
6812     } else if (n < -0x1000) {
6813         n = -0x1000;
6814     }
6815
6816     aExp += n - 1;
6817     aSig <<= 10;
6818     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6819 }
6820
6821 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6822 {
6823     flag aSign;
6824     int32_t aExp;
6825     uint64_t aSig;
6826
6827     aSig = extractFloatx80Frac( a );
6828     aExp = extractFloatx80Exp( a );
6829     aSign = extractFloatx80Sign( a );
6830
6831     if ( aExp == 0x7FFF ) {
6832         if ( aSig<<1 ) {
6833             return propagateFloatx80NaN( a, a STATUS_VAR );
6834         }
6835         return a;
6836     }
6837
6838     if (aExp == 0 && aSig == 0)
6839         return a;
6840
6841     if (n > 0x10000) {
6842         n = 0x10000;
6843     } else if (n < -0x10000) {
6844         n = -0x10000;
6845     }
6846
6847     aExp += n;
6848     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6849                                           aSign, aExp, aSig, 0 STATUS_VAR );
6850 }
6851
6852 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6853 {
6854     flag aSign;
6855     int32_t aExp;
6856     uint64_t aSig0, aSig1;
6857
6858     aSig1 = extractFloat128Frac1( a );
6859     aSig0 = extractFloat128Frac0( a );
6860     aExp = extractFloat128Exp( a );
6861     aSign = extractFloat128Sign( a );
6862     if ( aExp == 0x7FFF ) {
6863         if ( aSig0 | aSig1 ) {
6864             return propagateFloat128NaN( a, a STATUS_VAR );
6865         }
6866         return a;
6867     }
6868     if ( aExp != 0 )
6869         aSig0 |= LIT64( 0x0001000000000000 );
6870     else if ( aSig0 == 0 && aSig1 == 0 )
6871         return a;
6872
6873     if (n > 0x10000) {
6874         n = 0x10000;
6875     } else if (n < -0x10000) {
6876         n = -0x10000;
6877     }
6878
6879     aExp += n - 1;
6880     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6881                                           STATUS_VAR );
6882
6883 }