fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * Derived from SoftFloat.
   5  */
   6
   7 /*============================================================================
   8
   9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  10 Package, Release 2b.
  11
  12 Written by John R. Hauser.  This work was made possible in part by the
  13 International Computer Science Institute, located at Suite 600, 1947 Center
  14 Street, Berkeley, California 94704.  Funding was partially provided by the
  15 National Science Foundation under grant MIP-9311980.  The original version
  16 of this code was written as part of a project to build a fixed-point vector
  17 processor in collaboration with the University of California at Berkeley,
  18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  20 arithmetic/SoftFloat.html'.
  21
  22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  30
  31 Derivative works are acceptable, even for commercial purposes, so long as
  32 (1) the source code for the derivative work includes prominent notice that
  33 the work is derivative, and (2) the source code includes prominent notice with
  34 these four paragraphs for those parts of this code that are retained.
  35
  36 =============================================================================*/
  37
  38 /* softfloat (and in particular the code in softfloat-specialize.h) is
  39  * target-dependent and needs the TARGET_* macros.
  40  */
  41 #include "config.h"
  42
  43 #include "fpu/softfloat.h"
  44
  45 /*----------------------------------------------------------------------------
  46 | Primitive arithmetic functions, including multi-word arithmetic, and
  47 | division and square root approximations.  (Can be specialized to target if
  48 | desired.)
  49 *----------------------------------------------------------------------------*/
  50 #include "softfloat-macros.h"
  51
  52 /*----------------------------------------------------------------------------
  53 | Functions and definitions to determine:  (1) whether tininess for underflow
  54 | is detected before or after rounding by default, (2) what (if anything)
  55 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  56 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  57 | are propagated from function inputs to output.  These details are target-
  58 | specific.
  59 *----------------------------------------------------------------------------*/
  60 #include "softfloat-specialize.h"
  61
  62 void set_float_rounding_mode(int val STATUS_PARAM)
  63 {
  64     STATUS(float_rounding_mode) = val;
  65 }
  66
  67 void set_float_exception_flags(int val STATUS_PARAM)
  68 {
  69     STATUS(float_exception_flags) = val;
  70 }
  71
  72 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  73 {
  74     STATUS(floatx80_rounding_precision) = val;
  75 }
  76
  77 /*----------------------------------------------------------------------------
  78 | Returns the fraction bits of the half-precision floating-point value `a'.
  79 *----------------------------------------------------------------------------*/
  80
  81 INLINE uint32_t extractFloat16Frac(float16 a)
  82 {
  83     return float16_val(a) & 0x3ff;
  84 }
  85
  86 /*----------------------------------------------------------------------------
  87 | Returns the exponent bits of the half-precision floating-point value `a'.
  88 *----------------------------------------------------------------------------*/
  89
  90 INLINE int_fast16_t extractFloat16Exp(float16 a)
  91 {
  92     return (float16_val(a) >> 10) & 0x1f;
  93 }
  94
  95 /*----------------------------------------------------------------------------
  96 | Returns the sign bit of the single-precision floating-point value `a'.
  97 *----------------------------------------------------------------------------*/
  98
  99 INLINE flag extractFloat16Sign(float16 a)
 100 {
 101     return float16_val(a)>>15;
 102 }
 103
 104 /*----------------------------------------------------------------------------
 105 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 106 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 107 | input.  If `zSign' is 1, the input is negated before being converted to an
 108 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 109 | is simply rounded to an integer, with the inexact exception raised if the
 110 | input cannot be represented exactly as an integer.  However, if the fixed-
 111 | point input is too large, the invalid exception is raised and the largest
 112 | positive or negative integer is returned.
 113 *----------------------------------------------------------------------------*/
 114
 115 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
 116 {
 117     int8 roundingMode;
 118     flag roundNearestEven;
 119     int8 roundIncrement, roundBits;
 120     int32_t z;
 121
 122     roundingMode = STATUS(float_rounding_mode);
 123     roundNearestEven = ( roundingMode == float_round_nearest_even );
 124     roundIncrement = 0x40;
 125     if ( ! roundNearestEven ) {
 126         if ( roundingMode == float_round_to_zero ) {
 127             roundIncrement = 0;
 128         }
 129         else {
 130             roundIncrement = 0x7F;
 131             if ( zSign ) {
 132                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 133             }
 134             else {
 135                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 136             }
 137         }
 138     }
 139     roundBits = absZ & 0x7F;
 140     absZ = ( absZ + roundIncrement )>>7;
 141     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 142     z = absZ;
 143     if ( zSign ) z = - z;
 144     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 145         float_raise( float_flag_invalid STATUS_VAR);
 146         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 147     }
 148     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 149     return z;
 150
 151 }
 152
 153 /*----------------------------------------------------------------------------
 154 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 155 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 156 | and returns the properly rounded 64-bit integer corresponding to the input.
 157 | If `zSign' is 1, the input is negated before being converted to an integer.
 158 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 159 | the inexact exception raised if the input cannot be represented exactly as
 160 | an integer.  However, if the fixed-point input is too large, the invalid
 161 | exception is raised and the largest positive or negative integer is
 162 | returned.
 163 *----------------------------------------------------------------------------*/
 164
 165 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
 166 {
 167     int8 roundingMode;
 168     flag roundNearestEven, increment;
 169     int64_t z;
 170
 171     roundingMode = STATUS(float_rounding_mode);
 172     roundNearestEven = ( roundingMode == float_round_nearest_even );
 173     increment = ( (int64_t) absZ1 < 0 );
 174     if ( ! roundNearestEven ) {
 175         if ( roundingMode == float_round_to_zero ) {
 176             increment = 0;
 177         }
 178         else {
 179             if ( zSign ) {
 180                 increment = ( roundingMode == float_round_down ) && absZ1;
 181             }
 182             else {
 183                 increment = ( roundingMode == float_round_up ) && absZ1;
 184             }
 185         }
 186     }
 187     if ( increment ) {
 188         ++absZ0;
 189         if ( absZ0 == 0 ) goto overflow;
 190         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 191     }
 192     z = absZ0;
 193     if ( zSign ) z = - z;
 194     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 195  overflow:
 196         float_raise( float_flag_invalid STATUS_VAR);
 197         return
 198               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 199             : LIT64( 0x7FFFFFFFFFFFFFFF );
 200     }
 201     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 202     return z;
 203
 204 }
 205
 206 /*----------------------------------------------------------------------------
 207 | Returns the fraction bits of the single-precision floating-point value `a'.
 208 *----------------------------------------------------------------------------*/
 209
 210 INLINE uint32_t extractFloat32Frac( float32 a )
 211 {
 212
 213     return float32_val(a) & 0x007FFFFF;
 214
 215 }
 216
 217 /*----------------------------------------------------------------------------
 218 | Returns the exponent bits of the single-precision floating-point value `a'.
 219 *----------------------------------------------------------------------------*/
 220
 221 INLINE int_fast16_t extractFloat32Exp(float32 a)
 222 {
 223
 224     return ( float32_val(a)>>23 ) & 0xFF;
 225
 226 }
 227
 228 /*----------------------------------------------------------------------------
 229 | Returns the sign bit of the single-precision floating-point value `a'.
 230 *----------------------------------------------------------------------------*/
 231
 232 INLINE flag extractFloat32Sign( float32 a )
 233 {
 234
 235     return float32_val(a)>>31;
 236
 237 }
 238
 239 /*----------------------------------------------------------------------------
 240 | If `a' is denormal and we are in flush-to-zero mode then set the
 241 | input-denormal exception and return zero. Otherwise just return the value.
 242 *----------------------------------------------------------------------------*/
 243 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 244 {
 245     if (STATUS(flush_inputs_to_zero)) {
 246         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 247             float_raise(float_flag_input_denormal STATUS_VAR);
 248             return make_float32(float32_val(a) & 0x80000000);
 249         }
 250     }
 251     return a;
 252 }
 253
 254 /*----------------------------------------------------------------------------
 255 | Normalizes the subnormal single-precision floating-point value represented
 256 | by the denormalized significand `aSig'.  The normalized exponent and
 257 | significand are stored at the locations pointed to by `zExpPtr' and
 258 | `zSigPtr', respectively.
 259 *----------------------------------------------------------------------------*/
 260
 261 static void
 262  normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
 263 {
 264     int8 shiftCount;
 265
 266     shiftCount = countLeadingZeros32( aSig ) - 8;
 267     *zSigPtr = aSig<<shiftCount;
 268     *zExpPtr = 1 - shiftCount;
 269
 270 }
 271
 272 /*----------------------------------------------------------------------------
 273 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 274 | single-precision floating-point value, returning the result.  After being
 275 | shifted into the proper positions, the three fields are simply added
 276 | together to form the result.  This means that any integer portion of `zSig'
 277 | will be added into the exponent.  Since a properly normalized significand
 278 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 279 | than the desired result exponent whenever `zSig' is a complete, normalized
 280 | significand.
 281 *----------------------------------------------------------------------------*/
 282
 283 INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
 284 {
 285
 286     return make_float32(
 287           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 288
 289 }
 290
 291 /*----------------------------------------------------------------------------
 292 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 293 | and significand `zSig', and returns the proper single-precision floating-
 294 | point value corresponding to the abstract input.  Ordinarily, the abstract
 295 | value is simply rounded and packed into the single-precision format, with
 296 | the inexact exception raised if the abstract input cannot be represented
 297 | exactly.  However, if the abstract value is too large, the overflow and
 298 | inexact exceptions are raised and an infinity or maximal finite value is
 299 | returned.  If the abstract value is too small, the input value is rounded to
 300 | a subnormal number, and the underflow and inexact exceptions are raised if
 301 | the abstract input cannot be represented exactly as a subnormal single-
 302 | precision floating-point number.
 303 |     The input significand `zSig' has its binary point between bits 30
 304 | and 29, which is 7 bits to the left of the usual location.  This shifted
 305 | significand must be normalized or smaller.  If `zSig' is not normalized,
 306 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 307 | and it must not require rounding.  In the usual case that `zSig' is
 308 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 309 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 310 | Binary Floating-Point Arithmetic.
 311 *----------------------------------------------------------------------------*/
 312
 313 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
 314 {
 315     int8 roundingMode;
 316     flag roundNearestEven;
 317     int8 roundIncrement, roundBits;
 318     flag isTiny;
 319
 320     roundingMode = STATUS(float_rounding_mode);
 321     roundNearestEven = ( roundingMode == float_round_nearest_even );
 322     roundIncrement = 0x40;
 323     if ( ! roundNearestEven ) {
 324         if ( roundingMode == float_round_to_zero ) {
 325             roundIncrement = 0;
 326         }
 327         else {
 328             roundIncrement = 0x7F;
 329             if ( zSign ) {
 330                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 331             }
 332             else {
 333                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 334             }
 335         }
 336     }
 337     roundBits = zSig & 0x7F;
 338     if ( 0xFD <= (uint16_t) zExp ) {
 339         if (    ( 0xFD < zExp )
 340              || (    ( zExp == 0xFD )
 341                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 342            ) {
 343             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 344             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 345         }
 346         if ( zExp < 0 ) {
 347             if (STATUS(flush_to_zero)) {
 348                 float_raise(float_flag_output_denormal STATUS_VAR);
 349                 return packFloat32(zSign, 0, 0);
 350             }
 351             isTiny =
 352                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 353                 || ( zExp < -1 )
 354                 || ( zSig + roundIncrement < 0x80000000 );
 355             shift32RightJamming( zSig, - zExp, &zSig );
 356             zExp = 0;
 357             roundBits = zSig & 0x7F;
 358             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 359         }
 360     }
 361     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 362     zSig = ( zSig + roundIncrement )>>7;
 363     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 364     if ( zSig == 0 ) zExp = 0;
 365     return packFloat32( zSign, zExp, zSig );
 366
 367 }
 368
 369 /*----------------------------------------------------------------------------
 370 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 371 | and significand `zSig', and returns the proper single-precision floating-
 372 | point value corresponding to the abstract input.  This routine is just like
 373 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 374 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 375 | floating-point exponent.
 376 *----------------------------------------------------------------------------*/
 377
 378 static float32
 379  normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
 380 {
 381     int8 shiftCount;
 382
 383     shiftCount = countLeadingZeros32( zSig ) - 1;
 384     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 385
 386 }
 387
 388 /*----------------------------------------------------------------------------
 389 | Returns the fraction bits of the double-precision floating-point value `a'.
 390 *----------------------------------------------------------------------------*/
 391
 392 INLINE uint64_t extractFloat64Frac( float64 a )
 393 {
 394
 395     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 396
 397 }
 398
 399 /*----------------------------------------------------------------------------
 400 | Returns the exponent bits of the double-precision floating-point value `a'.
 401 *----------------------------------------------------------------------------*/
 402
 403 INLINE int_fast16_t extractFloat64Exp(float64 a)
 404 {
 405
 406     return ( float64_val(a)>>52 ) & 0x7FF;
 407
 408 }
 409
 410 /*----------------------------------------------------------------------------
 411 | Returns the sign bit of the double-precision floating-point value `a'.
 412 *----------------------------------------------------------------------------*/
 413
 414 INLINE flag extractFloat64Sign( float64 a )
 415 {
 416
 417     return float64_val(a)>>63;
 418
 419 }
 420
 421 /*----------------------------------------------------------------------------
 422 | If `a' is denormal and we are in flush-to-zero mode then set the
 423 | input-denormal exception and return zero. Otherwise just return the value.
 424 *----------------------------------------------------------------------------*/
 425 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 426 {
 427     if (STATUS(flush_inputs_to_zero)) {
 428         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 429             float_raise(float_flag_input_denormal STATUS_VAR);
 430             return make_float64(float64_val(a) & (1ULL << 63));
 431         }
 432     }
 433     return a;
 434 }
 435
 436 /*----------------------------------------------------------------------------
 437 | Normalizes the subnormal double-precision floating-point value represented
 438 | by the denormalized significand `aSig'.  The normalized exponent and
 439 | significand are stored at the locations pointed to by `zExpPtr' and
 440 | `zSigPtr', respectively.
 441 *----------------------------------------------------------------------------*/
 442
 443 static void
 444  normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
 445 {
 446     int8 shiftCount;
 447
 448     shiftCount = countLeadingZeros64( aSig ) - 11;
 449     *zSigPtr = aSig<<shiftCount;
 450     *zExpPtr = 1 - shiftCount;
 451
 452 }
 453
 454 /*----------------------------------------------------------------------------
 455 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 456 | double-precision floating-point value, returning the result.  After being
 457 | shifted into the proper positions, the three fields are simply added
 458 | together to form the result.  This means that any integer portion of `zSig'
 459 | will be added into the exponent.  Since a properly normalized significand
 460 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 461 | than the desired result exponent whenever `zSig' is a complete, normalized
 462 | significand.
 463 *----------------------------------------------------------------------------*/
 464
 465 INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
 466 {
 467
 468     return make_float64(
 469         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 470
 471 }
 472
 473 /*----------------------------------------------------------------------------
 474 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 475 | and significand `zSig', and returns the proper double-precision floating-
 476 | point value corresponding to the abstract input.  Ordinarily, the abstract
 477 | value is simply rounded and packed into the double-precision format, with
 478 | the inexact exception raised if the abstract input cannot be represented
 479 | exactly.  However, if the abstract value is too large, the overflow and
 480 | inexact exceptions are raised and an infinity or maximal finite value is
 481 | returned.  If the abstract value is too small, the input value is rounded
 482 | to a subnormal number, and the underflow and inexact exceptions are raised
 483 | if the abstract input cannot be represented exactly as a subnormal double-
 484 | precision floating-point number.
 485 |     The input significand `zSig' has its binary point between bits 62
 486 | and 61, which is 10 bits to the left of the usual location.  This shifted
 487 | significand must be normalized or smaller.  If `zSig' is not normalized,
 488 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 489 | and it must not require rounding.  In the usual case that `zSig' is
 490 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 491 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 492 | Binary Floating-Point Arithmetic.
 493 *----------------------------------------------------------------------------*/
 494
 495 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
 496 {
 497     int8 roundingMode;
 498     flag roundNearestEven;
 499     int_fast16_t roundIncrement, roundBits;
 500     flag isTiny;
 501
 502     roundingMode = STATUS(float_rounding_mode);
 503     roundNearestEven = ( roundingMode == float_round_nearest_even );
 504     roundIncrement = 0x200;
 505     if ( ! roundNearestEven ) {
 506         if ( roundingMode == float_round_to_zero ) {
 507             roundIncrement = 0;
 508         }
 509         else {
 510             roundIncrement = 0x3FF;
 511             if ( zSign ) {
 512                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 513             }
 514             else {
 515                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 516             }
 517         }
 518     }
 519     roundBits = zSig & 0x3FF;
 520     if ( 0x7FD <= (uint16_t) zExp ) {
 521         if (    ( 0x7FD < zExp )
 522              || (    ( zExp == 0x7FD )
 523                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 524            ) {
 525             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 526             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 527         }
 528         if ( zExp < 0 ) {
 529             if (STATUS(flush_to_zero)) {
 530                 float_raise(float_flag_output_denormal STATUS_VAR);
 531                 return packFloat64(zSign, 0, 0);
 532             }
 533             isTiny =
 534                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 535                 || ( zExp < -1 )
 536                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 537             shift64RightJamming( zSig, - zExp, &zSig );
 538             zExp = 0;
 539             roundBits = zSig & 0x3FF;
 540             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 541         }
 542     }
 543     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 544     zSig = ( zSig + roundIncrement )>>10;
 545     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 546     if ( zSig == 0 ) zExp = 0;
 547     return packFloat64( zSign, zExp, zSig );
 548
 549 }
 550
 551 /*----------------------------------------------------------------------------
 552 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 553 | and significand `zSig', and returns the proper double-precision floating-
 554 | point value corresponding to the abstract input.  This routine is just like
 555 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 556 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 557 | floating-point exponent.
 558 *----------------------------------------------------------------------------*/
 559
 560 static float64
 561  normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
 562 {
 563     int8 shiftCount;
 564
 565     shiftCount = countLeadingZeros64( zSig ) - 1;
 566     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 567
 568 }
 569
 570 /*----------------------------------------------------------------------------
 571 | Returns the fraction bits of the extended double-precision floating-point
 572 | value `a'.
 573 *----------------------------------------------------------------------------*/
 574
 575 INLINE uint64_t extractFloatx80Frac( floatx80 a )
 576 {
 577
 578     return a.low;
 579
 580 }
 581
 582 /*----------------------------------------------------------------------------
 583 | Returns the exponent bits of the extended double-precision floating-point
 584 | value `a'.
 585 *----------------------------------------------------------------------------*/
 586
 587 INLINE int32 extractFloatx80Exp( floatx80 a )
 588 {
 589
 590     return a.high & 0x7FFF;
 591
 592 }
 593
 594 /*----------------------------------------------------------------------------
 595 | Returns the sign bit of the extended double-precision floating-point value
 596 | `a'.
 597 *----------------------------------------------------------------------------*/
 598
 599 INLINE flag extractFloatx80Sign( floatx80 a )
 600 {
 601
 602     return a.high>>15;
 603
 604 }
 605
 606 /*----------------------------------------------------------------------------
 607 | Normalizes the subnormal extended double-precision floating-point value
 608 | represented by the denormalized significand `aSig'.  The normalized exponent
 609 | and significand are stored at the locations pointed to by `zExpPtr' and
 610 | `zSigPtr', respectively.
 611 *----------------------------------------------------------------------------*/
 612
 613 static void
 614  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 615 {
 616     int8 shiftCount;
 617
 618     shiftCount = countLeadingZeros64( aSig );
 619     *zSigPtr = aSig<<shiftCount;
 620     *zExpPtr = 1 - shiftCount;
 621
 622 }
 623
 624 /*----------------------------------------------------------------------------
 625 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 626 | extended double-precision floating-point value, returning the result.
 627 *----------------------------------------------------------------------------*/
 628
 629 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 630 {
 631     floatx80 z;
 632
 633     z.low = zSig;
 634     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 635     return z;
 636
 637 }
 638
 639 /*----------------------------------------------------------------------------
 640 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 641 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 642 | and returns the proper extended double-precision floating-point value
 643 | corresponding to the abstract input.  Ordinarily, the abstract value is
 644 | rounded and packed into the extended double-precision format, with the
 645 | inexact exception raised if the abstract input cannot be represented
 646 | exactly.  However, if the abstract value is too large, the overflow and
 647 | inexact exceptions are raised and an infinity or maximal finite value is
 648 | returned.  If the abstract value is too small, the input value is rounded to
 649 | a subnormal number, and the underflow and inexact exceptions are raised if
 650 | the abstract input cannot be represented exactly as a subnormal extended
 651 | double-precision floating-point number.
 652 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 653 | number of bits as single or double precision, respectively.  Otherwise, the
 654 | result is rounded to the full precision of the extended double-precision
 655 | format.
 656 |     The input significand must be normalized or smaller.  If the input
 657 | significand is not normalized, `zExp' must be 0; in that case, the result
 658 | returned is a subnormal number, and it must not require rounding.  The
 659 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 660 | Floating-Point Arithmetic.
 661 *----------------------------------------------------------------------------*/
 662
 663 static floatx80
 664  roundAndPackFloatx80(
 665      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 666  STATUS_PARAM)
 667 {
 668     int8 roundingMode;
 669     flag roundNearestEven, increment, isTiny;
 670     int64 roundIncrement, roundMask, roundBits;
 671
 672     roundingMode = STATUS(float_rounding_mode);
 673     roundNearestEven = ( roundingMode == float_round_nearest_even );
 674     if ( roundingPrecision == 80 ) goto precision80;
 675     if ( roundingPrecision == 64 ) {
 676         roundIncrement = LIT64( 0x0000000000000400 );
 677         roundMask = LIT64( 0x00000000000007FF );
 678     }
 679     else if ( roundingPrecision == 32 ) {
 680         roundIncrement = LIT64( 0x0000008000000000 );
 681         roundMask = LIT64( 0x000000FFFFFFFFFF );
 682     }
 683     else {
 684         goto precision80;
 685     }
 686     zSig0 |= ( zSig1 != 0 );
 687     if ( ! roundNearestEven ) {
 688         if ( roundingMode == float_round_to_zero ) {
 689             roundIncrement = 0;
 690         }
 691         else {
 692             roundIncrement = roundMask;
 693             if ( zSign ) {
 694                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 695             }
 696             else {
 697                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 698             }
 699         }
 700     }
 701     roundBits = zSig0 & roundMask;
 702     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 703         if (    ( 0x7FFE < zExp )
 704              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 705            ) {
 706             goto overflow;
 707         }
 708         if ( zExp <= 0 ) {
 709             if (STATUS(flush_to_zero)) {
 710                 float_raise(float_flag_output_denormal STATUS_VAR);
 711                 return packFloatx80(zSign, 0, 0);
 712             }
 713             isTiny =
 714                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 715                 || ( zExp < 0 )
 716                 || ( zSig0 <= zSig0 + roundIncrement );
 717             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 718             zExp = 0;
 719             roundBits = zSig0 & roundMask;
 720             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 721             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 722             zSig0 += roundIncrement;
 723             if ( (int64_t) zSig0 < 0 ) zExp = 1;
 724             roundIncrement = roundMask + 1;
 725             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 726                 roundMask |= roundIncrement;
 727             }
 728             zSig0 &= ~ roundMask;
 729             return packFloatx80( zSign, zExp, zSig0 );
 730         }
 731     }
 732     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 733     zSig0 += roundIncrement;
 734     if ( zSig0 < roundIncrement ) {
 735         ++zExp;
 736         zSig0 = LIT64( 0x8000000000000000 );
 737     }
 738     roundIncrement = roundMask + 1;
 739     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 740         roundMask |= roundIncrement;
 741     }
 742     zSig0 &= ~ roundMask;
 743     if ( zSig0 == 0 ) zExp = 0;
 744     return packFloatx80( zSign, zExp, zSig0 );
 745  precision80:
 746     increment = ( (int64_t) zSig1 < 0 );
 747     if ( ! roundNearestEven ) {
 748         if ( roundingMode == float_round_to_zero ) {
 749             increment = 0;
 750         }
 751         else {
 752             if ( zSign ) {
 753                 increment = ( roundingMode == float_round_down ) && zSig1;
 754             }
 755             else {
 756                 increment = ( roundingMode == float_round_up ) && zSig1;
 757             }
 758         }
 759     }
 760     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 761         if (    ( 0x7FFE < zExp )
 762              || (    ( zExp == 0x7FFE )
 763                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 764                   && increment
 765                 )
 766            ) {
 767             roundMask = 0;
 768  overflow:
 769             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 770             if (    ( roundingMode == float_round_to_zero )
 771                  || ( zSign && ( roundingMode == float_round_up ) )
 772                  || ( ! zSign && ( roundingMode == float_round_down ) )
 773                ) {
 774                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 775             }
 776             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 777         }
 778         if ( zExp <= 0 ) {
 779             isTiny =
 780                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 781                 || ( zExp < 0 )
 782                 || ! increment
 783                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 784             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 785             zExp = 0;
 786             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 787             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 788             if ( roundNearestEven ) {
 789                 increment = ( (int64_t) zSig1 < 0 );
 790             }
 791             else {
 792                 if ( zSign ) {
 793                     increment = ( roundingMode == float_round_down ) && zSig1;
 794                 }
 795                 else {
 796                     increment = ( roundingMode == float_round_up ) && zSig1;
 797                 }
 798             }
 799             if ( increment ) {
 800                 ++zSig0;
 801                 zSig0 &=
 802                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 803                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
 804             }
 805             return packFloatx80( zSign, zExp, zSig0 );
 806         }
 807     }
 808     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 809     if ( increment ) {
 810         ++zSig0;
 811         if ( zSig0 == 0 ) {
 812             ++zExp;
 813             zSig0 = LIT64( 0x8000000000000000 );
 814         }
 815         else {
 816             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 817         }
 818     }
 819     else {
 820         if ( zSig0 == 0 ) zExp = 0;
 821     }
 822     return packFloatx80( zSign, zExp, zSig0 );
 823
 824 }
 825
 826 /*----------------------------------------------------------------------------
 827 | Takes an abstract floating-point value having sign `zSign', exponent
 828 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 829 | and returns the proper extended double-precision floating-point value
 830 | corresponding to the abstract input.  This routine is just like
 831 | `roundAndPackFloatx80' except that the input significand does not have to be
 832 | normalized.
 833 *----------------------------------------------------------------------------*/
 834
 835 static floatx80
 836  normalizeRoundAndPackFloatx80(
 837      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 838  STATUS_PARAM)
 839 {
 840     int8 shiftCount;
 841
 842     if ( zSig0 == 0 ) {
 843         zSig0 = zSig1;
 844         zSig1 = 0;
 845         zExp -= 64;
 846     }
 847     shiftCount = countLeadingZeros64( zSig0 );
 848     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 849     zExp -= shiftCount;
 850     return
 851         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 852
 853 }
 854
 855 /*----------------------------------------------------------------------------
 856 | Returns the least-significant 64 fraction bits of the quadruple-precision
 857 | floating-point value `a'.
 858 *----------------------------------------------------------------------------*/
 859
 860 INLINE uint64_t extractFloat128Frac1( float128 a )
 861 {
 862
 863     return a.low;
 864
 865 }
 866
 867 /*----------------------------------------------------------------------------
 868 | Returns the most-significant 48 fraction bits of the quadruple-precision
 869 | floating-point value `a'.
 870 *----------------------------------------------------------------------------*/
 871
 872 INLINE uint64_t extractFloat128Frac0( float128 a )
 873 {
 874
 875     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 876
 877 }
 878
 879 /*----------------------------------------------------------------------------
 880 | Returns the exponent bits of the quadruple-precision floating-point value
 881 | `a'.
 882 *----------------------------------------------------------------------------*/
 883
 884 INLINE int32 extractFloat128Exp( float128 a )
 885 {
 886
 887     return ( a.high>>48 ) & 0x7FFF;
 888
 889 }
 890
 891 /*----------------------------------------------------------------------------
 892 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 893 *----------------------------------------------------------------------------*/
 894
 895 INLINE flag extractFloat128Sign( float128 a )
 896 {
 897
 898     return a.high>>63;
 899
 900 }
 901
 902 /*----------------------------------------------------------------------------
 903 | Normalizes the subnormal quadruple-precision floating-point value
 904 | represented by the denormalized significand formed by the concatenation of
 905 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 906 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 907 | significand are stored at the location pointed to by `zSig0Ptr', and the
 908 | least significant 64 bits of the normalized significand are stored at the
 909 | location pointed to by `zSig1Ptr'.
 910 *----------------------------------------------------------------------------*/
 911
 912 static void
 913  normalizeFloat128Subnormal(
 914      uint64_t aSig0,
 915      uint64_t aSig1,
 916      int32 *zExpPtr,
 917      uint64_t *zSig0Ptr,
 918      uint64_t *zSig1Ptr
 919  )
 920 {
 921     int8 shiftCount;
 922
 923     if ( aSig0 == 0 ) {
 924         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 925         if ( shiftCount < 0 ) {
 926             *zSig0Ptr = aSig1>>( - shiftCount );
 927             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 928         }
 929         else {
 930             *zSig0Ptr = aSig1<<shiftCount;
 931             *zSig1Ptr = 0;
 932         }
 933         *zExpPtr = - shiftCount - 63;
 934     }
 935     else {
 936         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 937         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 938         *zExpPtr = 1 - shiftCount;
 939     }
 940
 941 }
 942
 943 /*----------------------------------------------------------------------------
 944 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 945 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 946 | floating-point value, returning the result.  After being shifted into the
 947 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 948 | added together to form the most significant 32 bits of the result.  This
 949 | means that any integer portion of `zSig0' will be added into the exponent.
 950 | Since a properly normalized significand will have an integer portion equal
 951 | to 1, the `zExp' input should be 1 less than the desired result exponent
 952 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 953 | significand.
 954 *----------------------------------------------------------------------------*/
 955
 956 INLINE float128
 957  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
 958 {
 959     float128 z;
 960
 961     z.low = zSig1;
 962     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
 963     return z;
 964
 965 }
 966
 967 /*----------------------------------------------------------------------------
 968 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 969 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 970 | and `zSig2', and returns the proper quadruple-precision floating-point value
 971 | corresponding to the abstract input.  Ordinarily, the abstract value is
 972 | simply rounded and packed into the quadruple-precision format, with the
 973 | inexact exception raised if the abstract input cannot be represented
 974 | exactly.  However, if the abstract value is too large, the overflow and
 975 | inexact exceptions are raised and an infinity or maximal finite value is
 976 | returned.  If the abstract value is too small, the input value is rounded to
 977 | a subnormal number, and the underflow and inexact exceptions are raised if
 978 | the abstract input cannot be represented exactly as a subnormal quadruple-
 979 | precision floating-point number.
 980 |     The input significand must be normalized or smaller.  If the input
 981 | significand is not normalized, `zExp' must be 0; in that case, the result
 982 | returned is a subnormal number, and it must not require rounding.  In the
 983 | usual case that the input significand is normalized, `zExp' must be 1 less
 984 | than the ``true'' floating-point exponent.  The handling of underflow and
 985 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 986 *----------------------------------------------------------------------------*/
 987
 988 static float128
 989  roundAndPackFloat128(
 990      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
 991 {
 992     int8 roundingMode;
 993     flag roundNearestEven, increment, isTiny;
 994
 995     roundingMode = STATUS(float_rounding_mode);
 996     roundNearestEven = ( roundingMode == float_round_nearest_even );
 997     increment = ( (int64_t) zSig2 < 0 );
 998     if ( ! roundNearestEven ) {
 999         if ( roundingMode == float_round_to_zero ) {
1000             increment = 0;
1001         }
1002         else {
1003             if ( zSign ) {
1004                 increment = ( roundingMode == float_round_down ) && zSig2;
1005             }
1006             else {
1007                 increment = ( roundingMode == float_round_up ) && zSig2;
1008             }
1009         }
1010     }
1011     if ( 0x7FFD <= (uint32_t) zExp ) {
1012         if (    ( 0x7FFD < zExp )
1013              || (    ( zExp == 0x7FFD )
1014                   && eq128(
1015                          LIT64( 0x0001FFFFFFFFFFFF ),
1016                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1017                          zSig0,
1018                          zSig1
1019                      )
1020                   && increment
1021                 )
1022            ) {
1023             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1024             if (    ( roundingMode == float_round_to_zero )
1025                  || ( zSign && ( roundingMode == float_round_up ) )
1026                  || ( ! zSign && ( roundingMode == float_round_down ) )
1027                ) {
1028                 return
1029                     packFloat128(
1030                         zSign,
1031                         0x7FFE,
1032                         LIT64( 0x0000FFFFFFFFFFFF ),
1033                         LIT64( 0xFFFFFFFFFFFFFFFF )
1034                     );
1035             }
1036             return packFloat128( zSign, 0x7FFF, 0, 0 );
1037         }
1038         if ( zExp < 0 ) {
1039             if (STATUS(flush_to_zero)) {
1040                 float_raise(float_flag_output_denormal STATUS_VAR);
1041                 return packFloat128(zSign, 0, 0, 0);
1042             }
1043             isTiny =
1044                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1045                 || ( zExp < -1 )
1046                 || ! increment
1047                 || lt128(
1048                        zSig0,
1049                        zSig1,
1050                        LIT64( 0x0001FFFFFFFFFFFF ),
1051                        LIT64( 0xFFFFFFFFFFFFFFFF )
1052                    );
1053             shift128ExtraRightJamming(
1054                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1055             zExp = 0;
1056             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1057             if ( roundNearestEven ) {
1058                 increment = ( (int64_t) zSig2 < 0 );
1059             }
1060             else {
1061                 if ( zSign ) {
1062                     increment = ( roundingMode == float_round_down ) && zSig2;
1063                 }
1064                 else {
1065                     increment = ( roundingMode == float_round_up ) && zSig2;
1066                 }
1067             }
1068         }
1069     }
1070     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1071     if ( increment ) {
1072         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1073         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1074     }
1075     else {
1076         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1077     }
1078     return packFloat128( zSign, zExp, zSig0, zSig1 );
1079
1080 }
1081
1082 /*----------------------------------------------------------------------------
1083 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1085 | returns the proper quadruple-precision floating-point value corresponding
1086 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1087 | except that the input significand has fewer bits and does not have to be
1088 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1089 | point exponent.
1090 *----------------------------------------------------------------------------*/
1091
1092 static float128
1093  normalizeRoundAndPackFloat128(
1094      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1095 {
1096     int8 shiftCount;
1097     uint64_t zSig2;
1098
1099     if ( zSig0 == 0 ) {
1100         zSig0 = zSig1;
1101         zSig1 = 0;
1102         zExp -= 64;
1103     }
1104     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105     if ( 0 <= shiftCount ) {
1106         zSig2 = 0;
1107         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1108     }
1109     else {
1110         shift128ExtraRightJamming(
1111             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1112     }
1113     zExp -= shiftCount;
1114     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1115
1116 }
1117
1118 /*----------------------------------------------------------------------------
1119 | Returns the result of converting the 32-bit two's complement integer `a'
1120 | to the single-precision floating-point format.  The conversion is performed
1121 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122 *----------------------------------------------------------------------------*/
1123
1124 float32 int32_to_float32( int32 a STATUS_PARAM )
1125 {
1126     flag zSign;
1127
1128     if ( a == 0 ) return float32_zero;
1129     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1130     zSign = ( a < 0 );
1131     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1132
1133 }
1134
1135 /*----------------------------------------------------------------------------
1136 | Returns the result of converting the 32-bit two's complement integer `a'
1137 | to the double-precision floating-point format.  The conversion is performed
1138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1140
1141 float64 int32_to_float64( int32 a STATUS_PARAM )
1142 {
1143     flag zSign;
1144     uint32 absA;
1145     int8 shiftCount;
1146     uint64_t zSig;
1147
1148     if ( a == 0 ) return float64_zero;
1149     zSign = ( a < 0 );
1150     absA = zSign ? - a : a;
1151     shiftCount = countLeadingZeros32( absA ) + 21;
1152     zSig = absA;
1153     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1154
1155 }
1156
1157 /*----------------------------------------------------------------------------
1158 | Returns the result of converting the 32-bit two's complement integer `a'
1159 | to the extended double-precision floating-point format.  The conversion
1160 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1161 | Arithmetic.
1162 *----------------------------------------------------------------------------*/
1163
1164 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1165 {
1166     flag zSign;
1167     uint32 absA;
1168     int8 shiftCount;
1169     uint64_t zSig;
1170
1171     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1172     zSign = ( a < 0 );
1173     absA = zSign ? - a : a;
1174     shiftCount = countLeadingZeros32( absA ) + 32;
1175     zSig = absA;
1176     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1177
1178 }
1179
1180 /*----------------------------------------------------------------------------
1181 | Returns the result of converting the 32-bit two's complement integer `a' to
1182 | the quadruple-precision floating-point format.  The conversion is performed
1183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184 *----------------------------------------------------------------------------*/
1185
1186 float128 int32_to_float128( int32 a STATUS_PARAM )
1187 {
1188     flag zSign;
1189     uint32 absA;
1190     int8 shiftCount;
1191     uint64_t zSig0;
1192
1193     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1194     zSign = ( a < 0 );
1195     absA = zSign ? - a : a;
1196     shiftCount = countLeadingZeros32( absA ) + 17;
1197     zSig0 = absA;
1198     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1199
1200 }
1201
1202 /*----------------------------------------------------------------------------
1203 | Returns the result of converting the 64-bit two's complement integer `a'
1204 | to the single-precision floating-point format.  The conversion is performed
1205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1206 *----------------------------------------------------------------------------*/
1207
1208 float32 int64_to_float32( int64 a STATUS_PARAM )
1209 {
1210     flag zSign;
1211     uint64 absA;
1212     int8 shiftCount;
1213
1214     if ( a == 0 ) return float32_zero;
1215     zSign = ( a < 0 );
1216     absA = zSign ? - a : a;
1217     shiftCount = countLeadingZeros64( absA ) - 40;
1218     if ( 0 <= shiftCount ) {
1219         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1220     }
1221     else {
1222         shiftCount += 7;
1223         if ( shiftCount < 0 ) {
1224             shift64RightJamming( absA, - shiftCount, &absA );
1225         }
1226         else {
1227             absA <<= shiftCount;
1228         }
1229         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1230     }
1231
1232 }
1233
1234 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1235 {
1236     int8 shiftCount;
1237
1238     if ( a == 0 ) return float32_zero;
1239     shiftCount = countLeadingZeros64( a ) - 40;
1240     if ( 0 <= shiftCount ) {
1241         return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1242     }
1243     else {
1244         shiftCount += 7;
1245         if ( shiftCount < 0 ) {
1246             shift64RightJamming( a, - shiftCount, &a );
1247         }
1248         else {
1249             a <<= shiftCount;
1250         }
1251         return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1252     }
1253 }
1254
1255 /*----------------------------------------------------------------------------
1256 | Returns the result of converting the 64-bit two's complement integer `a'
1257 | to the double-precision floating-point format.  The conversion is performed
1258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259 *----------------------------------------------------------------------------*/
1260
1261 float64 int64_to_float64( int64 a STATUS_PARAM )
1262 {
1263     flag zSign;
1264
1265     if ( a == 0 ) return float64_zero;
1266     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1267         return packFloat64( 1, 0x43E, 0 );
1268     }
1269     zSign = ( a < 0 );
1270     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1271
1272 }
1273
1274 float64 uint64_to_float64(uint64 a STATUS_PARAM)
1275 {
1276     int exp =  0x43C;
1277
1278     if (a == 0) {
1279         return float64_zero;
1280     }
1281     if ((int64_t)a < 0) {
1282         shift64RightJamming(a, 1, &a);
1283         exp += 1;
1284     }
1285     return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1286 }
1287
1288 /*----------------------------------------------------------------------------
1289 | Returns the result of converting the 64-bit two's complement integer `a'
1290 | to the extended double-precision floating-point format.  The conversion
1291 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1292 | Arithmetic.
1293 *----------------------------------------------------------------------------*/
1294
1295 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1296 {
1297     flag zSign;
1298     uint64 absA;
1299     int8 shiftCount;
1300
1301     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1302     zSign = ( a < 0 );
1303     absA = zSign ? - a : a;
1304     shiftCount = countLeadingZeros64( absA );
1305     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1306
1307 }
1308
1309 /*----------------------------------------------------------------------------
1310 | Returns the result of converting the 64-bit two's complement integer `a' to
1311 | the quadruple-precision floating-point format.  The conversion is performed
1312 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313 *----------------------------------------------------------------------------*/
1314
1315 float128 int64_to_float128( int64 a STATUS_PARAM )
1316 {
1317     flag zSign;
1318     uint64 absA;
1319     int8 shiftCount;
1320     int32 zExp;
1321     uint64_t zSig0, zSig1;
1322
1323     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324     zSign = ( a < 0 );
1325     absA = zSign ? - a : a;
1326     shiftCount = countLeadingZeros64( absA ) + 49;
1327     zExp = 0x406E - shiftCount;
1328     if ( 64 <= shiftCount ) {
1329         zSig1 = 0;
1330         zSig0 = absA;
1331         shiftCount -= 64;
1332     }
1333     else {
1334         zSig1 = absA;
1335         zSig0 = 0;
1336     }
1337     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338     return packFloat128( zSign, zExp, zSig0, zSig1 );
1339
1340 }
1341
1342 float128 uint64_to_float128(uint64 a STATUS_PARAM)
1343 {
1344     if (a == 0) {
1345         return float128_zero;
1346     }
1347     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1348 }
1349
1350 /*----------------------------------------------------------------------------
1351 | Returns the result of converting the single-precision floating-point value
1352 | `a' to the 32-bit two's complement integer format.  The conversion is
1353 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1354 | Arithmetic---which means in particular that the conversion is rounded
1355 | according to the current rounding mode.  If `a' is a NaN, the largest
1356 | positive integer is returned.  Otherwise, if the conversion overflows, the
1357 | largest integer with the same sign as `a' is returned.
1358 *----------------------------------------------------------------------------*/
1359
1360 int32 float32_to_int32( float32 a STATUS_PARAM )
1361 {
1362     flag aSign;
1363     int_fast16_t aExp, shiftCount;
1364     uint32_t aSig;
1365     uint64_t aSig64;
1366
1367     a = float32_squash_input_denormal(a STATUS_VAR);
1368     aSig = extractFloat32Frac( a );
1369     aExp = extractFloat32Exp( a );
1370     aSign = extractFloat32Sign( a );
1371     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1372     if ( aExp ) aSig |= 0x00800000;
1373     shiftCount = 0xAF - aExp;
1374     aSig64 = aSig;
1375     aSig64 <<= 32;
1376     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1377     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1378
1379 }
1380
1381 /*----------------------------------------------------------------------------
1382 | Returns the result of converting the single-precision floating-point value
1383 | `a' to the 32-bit two's complement integer format.  The conversion is
1384 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1385 | Arithmetic, except that the conversion is always rounded toward zero.
1386 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1387 | the conversion overflows, the largest integer with the same sign as `a' is
1388 | returned.
1389 *----------------------------------------------------------------------------*/
1390
1391 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1392 {
1393     flag aSign;
1394     int_fast16_t aExp, shiftCount;
1395     uint32_t aSig;
1396     int32_t z;
1397     a = float32_squash_input_denormal(a STATUS_VAR);
1398
1399     aSig = extractFloat32Frac( a );
1400     aExp = extractFloat32Exp( a );
1401     aSign = extractFloat32Sign( a );
1402     shiftCount = aExp - 0x9E;
1403     if ( 0 <= shiftCount ) {
1404         if ( float32_val(a) != 0xCF000000 ) {
1405             float_raise( float_flag_invalid STATUS_VAR);
1406             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1407         }
1408         return (int32_t) 0x80000000;
1409     }
1410     else if ( aExp <= 0x7E ) {
1411         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1412         return 0;
1413     }
1414     aSig = ( aSig | 0x00800000 )<<8;
1415     z = aSig>>( - shiftCount );
1416     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1417         STATUS(float_exception_flags) |= float_flag_inexact;
1418     }
1419     if ( aSign ) z = - z;
1420     return z;
1421
1422 }
1423
1424 /*----------------------------------------------------------------------------
1425 | Returns the result of converting the single-precision floating-point value
1426 | `a' to the 16-bit two's complement integer format.  The conversion is
1427 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1428 | Arithmetic, except that the conversion is always rounded toward zero.
1429 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1430 | the conversion overflows, the largest integer with the same sign as `a' is
1431 | returned.
1432 *----------------------------------------------------------------------------*/
1433
1434 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1435 {
1436     flag aSign;
1437     int_fast16_t aExp, shiftCount;
1438     uint32_t aSig;
1439     int32 z;
1440
1441     aSig = extractFloat32Frac( a );
1442     aExp = extractFloat32Exp( a );
1443     aSign = extractFloat32Sign( a );
1444     shiftCount = aExp - 0x8E;
1445     if ( 0 <= shiftCount ) {
1446         if ( float32_val(a) != 0xC7000000 ) {
1447             float_raise( float_flag_invalid STATUS_VAR);
1448             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1449                 return 0x7FFF;
1450             }
1451         }
1452         return (int32_t) 0xffff8000;
1453     }
1454     else if ( aExp <= 0x7E ) {
1455         if ( aExp | aSig ) {
1456             STATUS(float_exception_flags) |= float_flag_inexact;
1457         }
1458         return 0;
1459     }
1460     shiftCount -= 0x10;
1461     aSig = ( aSig | 0x00800000 )<<8;
1462     z = aSig>>( - shiftCount );
1463     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1464         STATUS(float_exception_flags) |= float_flag_inexact;
1465     }
1466     if ( aSign ) {
1467         z = - z;
1468     }
1469     return z;
1470
1471 }
1472
1473 /*----------------------------------------------------------------------------
1474 | Returns the result of converting the single-precision floating-point value
1475 | `a' to the 64-bit two's complement integer format.  The conversion is
1476 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1477 | Arithmetic---which means in particular that the conversion is rounded
1478 | according to the current rounding mode.  If `a' is a NaN, the largest
1479 | positive integer is returned.  Otherwise, if the conversion overflows, the
1480 | largest integer with the same sign as `a' is returned.
1481 *----------------------------------------------------------------------------*/
1482
1483 int64 float32_to_int64( float32 a STATUS_PARAM )
1484 {
1485     flag aSign;
1486     int_fast16_t aExp, shiftCount;
1487     uint32_t aSig;
1488     uint64_t aSig64, aSigExtra;
1489     a = float32_squash_input_denormal(a STATUS_VAR);
1490
1491     aSig = extractFloat32Frac( a );
1492     aExp = extractFloat32Exp( a );
1493     aSign = extractFloat32Sign( a );
1494     shiftCount = 0xBE - aExp;
1495     if ( shiftCount < 0 ) {
1496         float_raise( float_flag_invalid STATUS_VAR);
1497         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1498             return LIT64( 0x7FFFFFFFFFFFFFFF );
1499         }
1500         return (int64_t) LIT64( 0x8000000000000000 );
1501     }
1502     if ( aExp ) aSig |= 0x00800000;
1503     aSig64 = aSig;
1504     aSig64 <<= 40;
1505     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1506     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1507
1508 }
1509
1510 /*----------------------------------------------------------------------------
1511 | Returns the result of converting the single-precision floating-point value
1512 | `a' to the 64-bit two's complement integer format.  The conversion is
1513 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1514 | Arithmetic, except that the conversion is always rounded toward zero.  If
1515 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1516 | conversion overflows, the largest integer with the same sign as `a' is
1517 | returned.
1518 *----------------------------------------------------------------------------*/
1519
1520 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1521 {
1522     flag aSign;
1523     int_fast16_t aExp, shiftCount;
1524     uint32_t aSig;
1525     uint64_t aSig64;
1526     int64 z;
1527     a = float32_squash_input_denormal(a STATUS_VAR);
1528
1529     aSig = extractFloat32Frac( a );
1530     aExp = extractFloat32Exp( a );
1531     aSign = extractFloat32Sign( a );
1532     shiftCount = aExp - 0xBE;
1533     if ( 0 <= shiftCount ) {
1534         if ( float32_val(a) != 0xDF000000 ) {
1535             float_raise( float_flag_invalid STATUS_VAR);
1536             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1537                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1538             }
1539         }
1540         return (int64_t) LIT64( 0x8000000000000000 );
1541     }
1542     else if ( aExp <= 0x7E ) {
1543         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1544         return 0;
1545     }
1546     aSig64 = aSig | 0x00800000;
1547     aSig64 <<= 40;
1548     z = aSig64>>( - shiftCount );
1549     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1550         STATUS(float_exception_flags) |= float_flag_inexact;
1551     }
1552     if ( aSign ) z = - z;
1553     return z;
1554
1555 }
1556
1557 /*----------------------------------------------------------------------------
1558 | Returns the result of converting the single-precision floating-point value
1559 | `a' to the double-precision floating-point format.  The conversion is
1560 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1561 | Arithmetic.
1562 *----------------------------------------------------------------------------*/
1563
1564 float64 float32_to_float64( float32 a STATUS_PARAM )
1565 {
1566     flag aSign;
1567     int_fast16_t aExp;
1568     uint32_t aSig;
1569     a = float32_squash_input_denormal(a STATUS_VAR);
1570
1571     aSig = extractFloat32Frac( a );
1572     aExp = extractFloat32Exp( a );
1573     aSign = extractFloat32Sign( a );
1574     if ( aExp == 0xFF ) {
1575         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1576         return packFloat64( aSign, 0x7FF, 0 );
1577     }
1578     if ( aExp == 0 ) {
1579         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1580         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1581         --aExp;
1582     }
1583     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1584
1585 }
1586
1587 /*----------------------------------------------------------------------------
1588 | Returns the result of converting the single-precision floating-point value
1589 | `a' to the extended double-precision floating-point format.  The conversion
1590 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1591 | Arithmetic.
1592 *----------------------------------------------------------------------------*/
1593
1594 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1595 {
1596     flag aSign;
1597     int_fast16_t aExp;
1598     uint32_t aSig;
1599
1600     a = float32_squash_input_denormal(a STATUS_VAR);
1601     aSig = extractFloat32Frac( a );
1602     aExp = extractFloat32Exp( a );
1603     aSign = extractFloat32Sign( a );
1604     if ( aExp == 0xFF ) {
1605         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1606         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1607     }
1608     if ( aExp == 0 ) {
1609         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1610         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1611     }
1612     aSig |= 0x00800000;
1613     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1614
1615 }
1616
1617 /*----------------------------------------------------------------------------
1618 | Returns the result of converting the single-precision floating-point value
1619 | `a' to the double-precision floating-point format.  The conversion is
1620 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1621 | Arithmetic.
1622 *----------------------------------------------------------------------------*/
1623
1624 float128 float32_to_float128( float32 a STATUS_PARAM )
1625 {
1626     flag aSign;
1627     int_fast16_t aExp;
1628     uint32_t aSig;
1629
1630     a = float32_squash_input_denormal(a STATUS_VAR);
1631     aSig = extractFloat32Frac( a );
1632     aExp = extractFloat32Exp( a );
1633     aSign = extractFloat32Sign( a );
1634     if ( aExp == 0xFF ) {
1635         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1636         return packFloat128( aSign, 0x7FFF, 0, 0 );
1637     }
1638     if ( aExp == 0 ) {
1639         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641         --aExp;
1642     }
1643     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1644
1645 }
1646
1647 /*----------------------------------------------------------------------------
1648 | Rounds the single-precision floating-point value `a' to an integer, and
1649 | returns the result as a single-precision floating-point value.  The
1650 | operation is performed according to the IEC/IEEE Standard for Binary
1651 | Floating-Point Arithmetic.
1652 *----------------------------------------------------------------------------*/
1653
1654 float32 float32_round_to_int( float32 a STATUS_PARAM)
1655 {
1656     flag aSign;
1657     int_fast16_t aExp;
1658     uint32_t lastBitMask, roundBitsMask;
1659     int8 roundingMode;
1660     uint32_t z;
1661     a = float32_squash_input_denormal(a STATUS_VAR);
1662
1663     aExp = extractFloat32Exp( a );
1664     if ( 0x96 <= aExp ) {
1665         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1666             return propagateFloat32NaN( a, a STATUS_VAR );
1667         }
1668         return a;
1669     }
1670     if ( aExp <= 0x7E ) {
1671         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1672         STATUS(float_exception_flags) |= float_flag_inexact;
1673         aSign = extractFloat32Sign( a );
1674         switch ( STATUS(float_rounding_mode) ) {
1675          case float_round_nearest_even:
1676             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1677                 return packFloat32( aSign, 0x7F, 0 );
1678             }
1679             break;
1680          case float_round_down:
1681             return make_float32(aSign ? 0xBF800000 : 0);
1682          case float_round_up:
1683             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1684         }
1685         return packFloat32( aSign, 0, 0 );
1686     }
1687     lastBitMask = 1;
1688     lastBitMask <<= 0x96 - aExp;
1689     roundBitsMask = lastBitMask - 1;
1690     z = float32_val(a);
1691     roundingMode = STATUS(float_rounding_mode);
1692     if ( roundingMode == float_round_nearest_even ) {
1693         z += lastBitMask>>1;
1694         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1695     }
1696     else if ( roundingMode != float_round_to_zero ) {
1697         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1698             z += roundBitsMask;
1699         }
1700     }
1701     z &= ~ roundBitsMask;
1702     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1703     return make_float32(z);
1704
1705 }
1706
1707 /*----------------------------------------------------------------------------
1708 | Returns the result of adding the absolute values of the single-precision
1709 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1710 | before being returned.  `zSign' is ignored if the result is a NaN.
1711 | The addition is performed according to the IEC/IEEE Standard for Binary
1712 | Floating-Point Arithmetic.
1713 *----------------------------------------------------------------------------*/
1714
1715 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1716 {
1717     int_fast16_t aExp, bExp, zExp;
1718     uint32_t aSig, bSig, zSig;
1719     int_fast16_t expDiff;
1720
1721     aSig = extractFloat32Frac( a );
1722     aExp = extractFloat32Exp( a );
1723     bSig = extractFloat32Frac( b );
1724     bExp = extractFloat32Exp( b );
1725     expDiff = aExp - bExp;
1726     aSig <<= 6;
1727     bSig <<= 6;
1728     if ( 0 < expDiff ) {
1729         if ( aExp == 0xFF ) {
1730             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1731             return a;
1732         }
1733         if ( bExp == 0 ) {
1734             --expDiff;
1735         }
1736         else {
1737             bSig |= 0x20000000;
1738         }
1739         shift32RightJamming( bSig, expDiff, &bSig );
1740         zExp = aExp;
1741     }
1742     else if ( expDiff < 0 ) {
1743         if ( bExp == 0xFF ) {
1744             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1745             return packFloat32( zSign, 0xFF, 0 );
1746         }
1747         if ( aExp == 0 ) {
1748             ++expDiff;
1749         }
1750         else {
1751             aSig |= 0x20000000;
1752         }
1753         shift32RightJamming( aSig, - expDiff, &aSig );
1754         zExp = bExp;
1755     }
1756     else {
1757         if ( aExp == 0xFF ) {
1758             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1759             return a;
1760         }
1761         if ( aExp == 0 ) {
1762             if (STATUS(flush_to_zero)) {
1763                 if (aSig | bSig) {
1764                     float_raise(float_flag_output_denormal STATUS_VAR);
1765                 }
1766                 return packFloat32(zSign, 0, 0);
1767             }
1768             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1769         }
1770         zSig = 0x40000000 + aSig + bSig;
1771         zExp = aExp;
1772         goto roundAndPack;
1773     }
1774     aSig |= 0x20000000;
1775     zSig = ( aSig + bSig )<<1;
1776     --zExp;
1777     if ( (int32_t) zSig < 0 ) {
1778         zSig = aSig + bSig;
1779         ++zExp;
1780     }
1781  roundAndPack:
1782     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1783
1784 }
1785
1786 /*----------------------------------------------------------------------------
1787 | Returns the result of subtracting the absolute values of the single-
1788 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1789 | difference is negated before being returned.  `zSign' is ignored if the
1790 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1791 | Standard for Binary Floating-Point Arithmetic.
1792 *----------------------------------------------------------------------------*/
1793
1794 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1795 {
1796     int_fast16_t aExp, bExp, zExp;
1797     uint32_t aSig, bSig, zSig;
1798     int_fast16_t expDiff;
1799
1800     aSig = extractFloat32Frac( a );
1801     aExp = extractFloat32Exp( a );
1802     bSig = extractFloat32Frac( b );
1803     bExp = extractFloat32Exp( b );
1804     expDiff = aExp - bExp;
1805     aSig <<= 7;
1806     bSig <<= 7;
1807     if ( 0 < expDiff ) goto aExpBigger;
1808     if ( expDiff < 0 ) goto bExpBigger;
1809     if ( aExp == 0xFF ) {
1810         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1811         float_raise( float_flag_invalid STATUS_VAR);
1812         return float32_default_nan;
1813     }
1814     if ( aExp == 0 ) {
1815         aExp = 1;
1816         bExp = 1;
1817     }
1818     if ( bSig < aSig ) goto aBigger;
1819     if ( aSig < bSig ) goto bBigger;
1820     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1821  bExpBigger:
1822     if ( bExp == 0xFF ) {
1823         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1824         return packFloat32( zSign ^ 1, 0xFF, 0 );
1825     }
1826     if ( aExp == 0 ) {
1827         ++expDiff;
1828     }
1829     else {
1830         aSig |= 0x40000000;
1831     }
1832     shift32RightJamming( aSig, - expDiff, &aSig );
1833     bSig |= 0x40000000;
1834  bBigger:
1835     zSig = bSig - aSig;
1836     zExp = bExp;
1837     zSign ^= 1;
1838     goto normalizeRoundAndPack;
1839  aExpBigger:
1840     if ( aExp == 0xFF ) {
1841         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1842         return a;
1843     }
1844     if ( bExp == 0 ) {
1845         --expDiff;
1846     }
1847     else {
1848         bSig |= 0x40000000;
1849     }
1850     shift32RightJamming( bSig, expDiff, &bSig );
1851     aSig |= 0x40000000;
1852  aBigger:
1853     zSig = aSig - bSig;
1854     zExp = aExp;
1855  normalizeRoundAndPack:
1856     --zExp;
1857     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1858
1859 }
1860
1861 /*----------------------------------------------------------------------------
1862 | Returns the result of adding the single-precision floating-point values `a'
1863 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1864 | Binary Floating-Point Arithmetic.
1865 *----------------------------------------------------------------------------*/
1866
1867 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1868 {
1869     flag aSign, bSign;
1870     a = float32_squash_input_denormal(a STATUS_VAR);
1871     b = float32_squash_input_denormal(b STATUS_VAR);
1872
1873     aSign = extractFloat32Sign( a );
1874     bSign = extractFloat32Sign( b );
1875     if ( aSign == bSign ) {
1876         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1877     }
1878     else {
1879         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1880     }
1881
1882 }
1883
1884 /*----------------------------------------------------------------------------
1885 | Returns the result of subtracting the single-precision floating-point values
1886 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1887 | for Binary Floating-Point Arithmetic.
1888 *----------------------------------------------------------------------------*/
1889
1890 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1891 {
1892     flag aSign, bSign;
1893     a = float32_squash_input_denormal(a STATUS_VAR);
1894     b = float32_squash_input_denormal(b STATUS_VAR);
1895
1896     aSign = extractFloat32Sign( a );
1897     bSign = extractFloat32Sign( b );
1898     if ( aSign == bSign ) {
1899         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1900     }
1901     else {
1902         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1903     }
1904
1905 }
1906
1907 /*----------------------------------------------------------------------------
1908 | Returns the result of multiplying the single-precision floating-point values
1909 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1910 | for Binary Floating-Point Arithmetic.
1911 *----------------------------------------------------------------------------*/
1912
1913 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1914 {
1915     flag aSign, bSign, zSign;
1916     int_fast16_t aExp, bExp, zExp;
1917     uint32_t aSig, bSig;
1918     uint64_t zSig64;
1919     uint32_t zSig;
1920
1921     a = float32_squash_input_denormal(a STATUS_VAR);
1922     b = float32_squash_input_denormal(b STATUS_VAR);
1923
1924     aSig = extractFloat32Frac( a );
1925     aExp = extractFloat32Exp( a );
1926     aSign = extractFloat32Sign( a );
1927     bSig = extractFloat32Frac( b );
1928     bExp = extractFloat32Exp( b );
1929     bSign = extractFloat32Sign( b );
1930     zSign = aSign ^ bSign;
1931     if ( aExp == 0xFF ) {
1932         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1933             return propagateFloat32NaN( a, b STATUS_VAR );
1934         }
1935         if ( ( bExp | bSig ) == 0 ) {
1936             float_raise( float_flag_invalid STATUS_VAR);
1937             return float32_default_nan;
1938         }
1939         return packFloat32( zSign, 0xFF, 0 );
1940     }
1941     if ( bExp == 0xFF ) {
1942         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1943         if ( ( aExp | aSig ) == 0 ) {
1944             float_raise( float_flag_invalid STATUS_VAR);
1945             return float32_default_nan;
1946         }
1947         return packFloat32( zSign, 0xFF, 0 );
1948     }
1949     if ( aExp == 0 ) {
1950         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1951         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1952     }
1953     if ( bExp == 0 ) {
1954         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1955         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1956     }
1957     zExp = aExp + bExp - 0x7F;
1958     aSig = ( aSig | 0x00800000 )<<7;
1959     bSig = ( bSig | 0x00800000 )<<8;
1960     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1961     zSig = zSig64;
1962     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1963         zSig <<= 1;
1964         --zExp;
1965     }
1966     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1967
1968 }
1969
1970 /*----------------------------------------------------------------------------
1971 | Returns the result of dividing the single-precision floating-point value `a'
1972 | by the corresponding value `b'.  The operation is performed according to the
1973 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1974 *----------------------------------------------------------------------------*/
1975
1976 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1977 {
1978     flag aSign, bSign, zSign;
1979     int_fast16_t aExp, bExp, zExp;
1980     uint32_t aSig, bSig, zSig;
1981     a = float32_squash_input_denormal(a STATUS_VAR);
1982     b = float32_squash_input_denormal(b STATUS_VAR);
1983
1984     aSig = extractFloat32Frac( a );
1985     aExp = extractFloat32Exp( a );
1986     aSign = extractFloat32Sign( a );
1987     bSig = extractFloat32Frac( b );
1988     bExp = extractFloat32Exp( b );
1989     bSign = extractFloat32Sign( b );
1990     zSign = aSign ^ bSign;
1991     if ( aExp == 0xFF ) {
1992         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1993         if ( bExp == 0xFF ) {
1994             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1995             float_raise( float_flag_invalid STATUS_VAR);
1996             return float32_default_nan;
1997         }
1998         return packFloat32( zSign, 0xFF, 0 );
1999     }
2000     if ( bExp == 0xFF ) {
2001         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2002         return packFloat32( zSign, 0, 0 );
2003     }
2004     if ( bExp == 0 ) {
2005         if ( bSig == 0 ) {
2006             if ( ( aExp | aSig ) == 0 ) {
2007                 float_raise( float_flag_invalid STATUS_VAR);
2008                 return float32_default_nan;
2009             }
2010             float_raise( float_flag_divbyzero STATUS_VAR);
2011             return packFloat32( zSign, 0xFF, 0 );
2012         }
2013         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2014     }
2015     if ( aExp == 0 ) {
2016         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2017         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2018     }
2019     zExp = aExp - bExp + 0x7D;
2020     aSig = ( aSig | 0x00800000 )<<7;
2021     bSig = ( bSig | 0x00800000 )<<8;
2022     if ( bSig <= ( aSig + aSig ) ) {
2023         aSig >>= 1;
2024         ++zExp;
2025     }
2026     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2027     if ( ( zSig & 0x3F ) == 0 ) {
2028         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2029     }
2030     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2031
2032 }
2033
2034 /*----------------------------------------------------------------------------
2035 | Returns the remainder of the single-precision floating-point value `a'
2036 | with respect to the corresponding value `b'.  The operation is performed
2037 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2038 *----------------------------------------------------------------------------*/
2039
2040 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2041 {
2042     flag aSign, zSign;
2043     int_fast16_t aExp, bExp, expDiff;
2044     uint32_t aSig, bSig;
2045     uint32_t q;
2046     uint64_t aSig64, bSig64, q64;
2047     uint32_t alternateASig;
2048     int32_t sigMean;
2049     a = float32_squash_input_denormal(a STATUS_VAR);
2050     b = float32_squash_input_denormal(b STATUS_VAR);
2051
2052     aSig = extractFloat32Frac( a );
2053     aExp = extractFloat32Exp( a );
2054     aSign = extractFloat32Sign( a );
2055     bSig = extractFloat32Frac( b );
2056     bExp = extractFloat32Exp( b );
2057     if ( aExp == 0xFF ) {
2058         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2059             return propagateFloat32NaN( a, b STATUS_VAR );
2060         }
2061         float_raise( float_flag_invalid STATUS_VAR);
2062         return float32_default_nan;
2063     }
2064     if ( bExp == 0xFF ) {
2065         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2066         return a;
2067     }
2068     if ( bExp == 0 ) {
2069         if ( bSig == 0 ) {
2070             float_raise( float_flag_invalid STATUS_VAR);
2071             return float32_default_nan;
2072         }
2073         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2074     }
2075     if ( aExp == 0 ) {
2076         if ( aSig == 0 ) return a;
2077         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2078     }
2079     expDiff = aExp - bExp;
2080     aSig |= 0x00800000;
2081     bSig |= 0x00800000;
2082     if ( expDiff < 32 ) {
2083         aSig <<= 8;
2084         bSig <<= 8;
2085         if ( expDiff < 0 ) {
2086             if ( expDiff < -1 ) return a;
2087             aSig >>= 1;
2088         }
2089         q = ( bSig <= aSig );
2090         if ( q ) aSig -= bSig;
2091         if ( 0 < expDiff ) {
2092             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2093             q >>= 32 - expDiff;
2094             bSig >>= 2;
2095             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2096         }
2097         else {
2098             aSig >>= 2;
2099             bSig >>= 2;
2100         }
2101     }
2102     else {
2103         if ( bSig <= aSig ) aSig -= bSig;
2104         aSig64 = ( (uint64_t) aSig )<<40;
2105         bSig64 = ( (uint64_t) bSig )<<40;
2106         expDiff -= 64;
2107         while ( 0 < expDiff ) {
2108             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2109             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2110             aSig64 = - ( ( bSig * q64 )<<38 );
2111             expDiff -= 62;
2112         }
2113         expDiff += 64;
2114         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2115         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2116         q = q64>>( 64 - expDiff );
2117         bSig <<= 6;
2118         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2119     }
2120     do {
2121         alternateASig = aSig;
2122         ++q;
2123         aSig -= bSig;
2124     } while ( 0 <= (int32_t) aSig );
2125     sigMean = aSig + alternateASig;
2126     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2127         aSig = alternateASig;
2128     }
2129     zSign = ( (int32_t) aSig < 0 );
2130     if ( zSign ) aSig = - aSig;
2131     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2132
2133 }
2134
2135 /*----------------------------------------------------------------------------
2136 | Returns the result of multiplying the single-precision floating-point values
2137 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2138 | multiplication.  The operation is performed according to the IEC/IEEE
2139 | Standard for Binary Floating-Point Arithmetic 754-2008.
2140 | The flags argument allows the caller to select negation of the
2141 | addend, the intermediate product, or the final result. (The difference
2142 | between this and having the caller do a separate negation is that negating
2143 | externally will flip the sign bit on NaNs.)
2144 *----------------------------------------------------------------------------*/
2145
2146 float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2147 {
2148     flag aSign, bSign, cSign, zSign;
2149     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2150     uint32_t aSig, bSig, cSig;
2151     flag pInf, pZero, pSign;
2152     uint64_t pSig64, cSig64, zSig64;
2153     uint32_t pSig;
2154     int shiftcount;
2155     flag signflip, infzero;
2156
2157     a = float32_squash_input_denormal(a STATUS_VAR);
2158     b = float32_squash_input_denormal(b STATUS_VAR);
2159     c = float32_squash_input_denormal(c STATUS_VAR);
2160     aSig = extractFloat32Frac(a);
2161     aExp = extractFloat32Exp(a);
2162     aSign = extractFloat32Sign(a);
2163     bSig = extractFloat32Frac(b);
2164     bExp = extractFloat32Exp(b);
2165     bSign = extractFloat32Sign(b);
2166     cSig = extractFloat32Frac(c);
2167     cExp = extractFloat32Exp(c);
2168     cSign = extractFloat32Sign(c);
2169
2170     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2171                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2172
2173     /* It is implementation-defined whether the cases of (0,inf,qnan)
2174      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2175      * they return if they do), so we have to hand this information
2176      * off to the target-specific pick-a-NaN routine.
2177      */
2178     if (((aExp == 0xff) && aSig) ||
2179         ((bExp == 0xff) && bSig) ||
2180         ((cExp == 0xff) && cSig)) {
2181         return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2182     }
2183
2184     if (infzero) {
2185         float_raise(float_flag_invalid STATUS_VAR);
2186         return float32_default_nan;
2187     }
2188
2189     if (flags & float_muladd_negate_c) {
2190         cSign ^= 1;
2191     }
2192
2193     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2194
2195     /* Work out the sign and type of the product */
2196     pSign = aSign ^ bSign;
2197     if (flags & float_muladd_negate_product) {
2198         pSign ^= 1;
2199     }
2200     pInf = (aExp == 0xff) || (bExp == 0xff);
2201     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2202
2203     if (cExp == 0xff) {
2204         if (pInf && (pSign ^ cSign)) {
2205             /* addition of opposite-signed infinities => InvalidOperation */
2206             float_raise(float_flag_invalid STATUS_VAR);
2207             return float32_default_nan;
2208         }
2209         /* Otherwise generate an infinity of the same sign */
2210         return packFloat32(cSign ^ signflip, 0xff, 0);
2211     }
2212
2213     if (pInf) {
2214         return packFloat32(pSign ^ signflip, 0xff, 0);
2215     }
2216
2217     if (pZero) {
2218         if (cExp == 0) {
2219             if (cSig == 0) {
2220                 /* Adding two exact zeroes */
2221                 if (pSign == cSign) {
2222                     zSign = pSign;
2223                 } else if (STATUS(float_rounding_mode) == float_round_down) {
2224                     zSign = 1;
2225                 } else {
2226                     zSign = 0;
2227                 }
2228                 return packFloat32(zSign ^ signflip, 0, 0);
2229             }
2230             /* Exact zero plus a denorm */
2231             if (STATUS(flush_to_zero)) {
2232                 float_raise(float_flag_output_denormal STATUS_VAR);
2233                 return packFloat32(cSign ^ signflip, 0, 0);
2234             }
2235         }
2236         /* Zero plus something non-zero : just return the something */
2237         return packFloat32(cSign ^ signflip, cExp, cSig);
2238     }
2239
2240     if (aExp == 0) {
2241         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2242     }
2243     if (bExp == 0) {
2244         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2245     }
2246
2247     /* Calculate the actual result a * b + c */
2248
2249     /* Multiply first; this is easy. */
2250     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2251      * because we want the true exponent, not the "one-less-than"
2252      * flavour that roundAndPackFloat32() takes.
2253      */
2254     pExp = aExp + bExp - 0x7e;
2255     aSig = (aSig | 0x00800000) << 7;
2256     bSig = (bSig | 0x00800000) << 8;
2257     pSig64 = (uint64_t)aSig * bSig;
2258     if ((int64_t)(pSig64 << 1) >= 0) {
2259         pSig64 <<= 1;
2260         pExp--;
2261     }
2262
2263     zSign = pSign ^ signflip;
2264
2265     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2266      * position 62.
2267      */
2268     if (cExp == 0) {
2269         if (!cSig) {
2270             /* Throw out the special case of c being an exact zero now */
2271             shift64RightJamming(pSig64, 32, &pSig64);
2272             pSig = pSig64;
2273             return roundAndPackFloat32(zSign, pExp - 1,
2274                                        pSig STATUS_VAR);
2275         }
2276         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2277     }
2278
2279     cSig64 = (uint64_t)cSig << (62 - 23);
2280     cSig64 |= LIT64(0x4000000000000000);
2281     expDiff = pExp - cExp;
2282
2283     if (pSign == cSign) {
2284         /* Addition */
2285         if (expDiff > 0) {
2286             /* scale c to match p */
2287             shift64RightJamming(cSig64, expDiff, &cSig64);
2288             zExp = pExp;
2289         } else if (expDiff < 0) {
2290             /* scale p to match c */
2291             shift64RightJamming(pSig64, -expDiff, &pSig64);
2292             zExp = cExp;
2293         } else {
2294             /* no scaling needed */
2295             zExp = cExp;
2296         }
2297         /* Add significands and make sure explicit bit ends up in posn 62 */
2298         zSig64 = pSig64 + cSig64;
2299         if ((int64_t)zSig64 < 0) {
2300             shift64RightJamming(zSig64, 1, &zSig64);
2301         } else {
2302             zExp--;
2303         }
2304     } else {
2305         /* Subtraction */
2306         if (expDiff > 0) {
2307             shift64RightJamming(cSig64, expDiff, &cSig64);
2308             zSig64 = pSig64 - cSig64;
2309             zExp = pExp;
2310         } else if (expDiff < 0) {
2311             shift64RightJamming(pSig64, -expDiff, &pSig64);
2312             zSig64 = cSig64 - pSig64;
2313             zExp = cExp;
2314             zSign ^= 1;
2315         } else {
2316             zExp = pExp;
2317             if (cSig64 < pSig64) {
2318                 zSig64 = pSig64 - cSig64;
2319             } else if (pSig64 < cSig64) {
2320                 zSig64 = cSig64 - pSig64;
2321                 zSign ^= 1;
2322             } else {
2323                 /* Exact zero */
2324                 zSign = signflip;
2325                 if (STATUS(float_rounding_mode) == float_round_down) {
2326                     zSign ^= 1;
2327                 }
2328                 return packFloat32(zSign, 0, 0);
2329             }
2330         }
2331         --zExp;
2332         /* Normalize to put the explicit bit back into bit 62. */
2333         shiftcount = countLeadingZeros64(zSig64) - 1;
2334         zSig64 <<= shiftcount;
2335         zExp -= shiftcount;
2336     }
2337     shift64RightJamming(zSig64, 32, &zSig64);
2338     return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2339 }
2340
2341
2342 /*----------------------------------------------------------------------------
2343 | Returns the square root of the single-precision floating-point value `a'.
2344 | The operation is performed according to the IEC/IEEE Standard for Binary
2345 | Floating-Point Arithmetic.
2346 *----------------------------------------------------------------------------*/
2347
2348 float32 float32_sqrt( float32 a STATUS_PARAM )
2349 {
2350     flag aSign;
2351     int_fast16_t aExp, zExp;
2352     uint32_t aSig, zSig;
2353     uint64_t rem, term;
2354     a = float32_squash_input_denormal(a STATUS_VAR);
2355
2356     aSig = extractFloat32Frac( a );
2357     aExp = extractFloat32Exp( a );
2358     aSign = extractFloat32Sign( a );
2359     if ( aExp == 0xFF ) {
2360         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2361         if ( ! aSign ) return a;
2362         float_raise( float_flag_invalid STATUS_VAR);
2363         return float32_default_nan;
2364     }
2365     if ( aSign ) {
2366         if ( ( aExp | aSig ) == 0 ) return a;
2367         float_raise( float_flag_invalid STATUS_VAR);
2368         return float32_default_nan;
2369     }
2370     if ( aExp == 0 ) {
2371         if ( aSig == 0 ) return float32_zero;
2372         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2373     }
2374     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2375     aSig = ( aSig | 0x00800000 )<<8;
2376     zSig = estimateSqrt32( aExp, aSig ) + 2;
2377     if ( ( zSig & 0x7F ) <= 5 ) {
2378         if ( zSig < 2 ) {
2379             zSig = 0x7FFFFFFF;
2380             goto roundAndPack;
2381         }
2382         aSig >>= aExp & 1;
2383         term = ( (uint64_t) zSig ) * zSig;
2384         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2385         while ( (int64_t) rem < 0 ) {
2386             --zSig;
2387             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2388         }
2389         zSig |= ( rem != 0 );
2390     }
2391     shift32RightJamming( zSig, 1, &zSig );
2392  roundAndPack:
2393     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2394
2395 }
2396
2397 /*----------------------------------------------------------------------------
2398 | Returns the binary exponential of the single-precision floating-point value
2399 | `a'. The operation is performed according to the IEC/IEEE Standard for
2400 | Binary Floating-Point Arithmetic.
2401 |
2402 | Uses the following identities:
2403 |
2404 | 1. -------------------------------------------------------------------------
2405 |      x    x*ln(2)
2406 |     2  = e
2407 |
2408 | 2. -------------------------------------------------------------------------
2409 |                      2     3     4     5           n
2410 |      x        x     x     x     x     x           x
2411 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2412 |               1!    2!    3!    4!    5!          n!
2413 *----------------------------------------------------------------------------*/
2414
2415 static const float64 float32_exp2_coefficients[15] =
2416 {
2417     const_float64( 0x3ff0000000000000ll ), /*  1 */
2418     const_float64( 0x3fe0000000000000ll ), /*  2 */
2419     const_float64( 0x3fc5555555555555ll ), /*  3 */
2420     const_float64( 0x3fa5555555555555ll ), /*  4 */
2421     const_float64( 0x3f81111111111111ll ), /*  5 */
2422     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2423     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2424     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2425     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2426     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2427     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2428     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2429     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2430     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2431     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2432 };
2433
2434 float32 float32_exp2( float32 a STATUS_PARAM )
2435 {
2436     flag aSign;
2437     int_fast16_t aExp;
2438     uint32_t aSig;
2439     float64 r, x, xn;
2440     int i;
2441     a = float32_squash_input_denormal(a STATUS_VAR);
2442
2443     aSig = extractFloat32Frac( a );
2444     aExp = extractFloat32Exp( a );
2445     aSign = extractFloat32Sign( a );
2446
2447     if ( aExp == 0xFF) {
2448         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2449         return (aSign) ? float32_zero : a;
2450     }
2451     if (aExp == 0) {
2452         if (aSig == 0) return float32_one;
2453     }
2454
2455     float_raise( float_flag_inexact STATUS_VAR);
2456
2457     /* ******************************* */
2458     /* using float64 for approximation */
2459     /* ******************************* */
2460     x = float32_to_float64(a STATUS_VAR);
2461     x = float64_mul(x, float64_ln2 STATUS_VAR);
2462
2463     xn = x;
2464     r = float64_one;
2465     for (i = 0 ; i < 15 ; i++) {
2466         float64 f;
2467
2468         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2469         r = float64_add(r, f STATUS_VAR);
2470
2471         xn = float64_mul(xn, x STATUS_VAR);
2472     }
2473
2474     return float64_to_float32(r, status);
2475 }
2476
2477 /*----------------------------------------------------------------------------
2478 | Returns the binary log of the single-precision floating-point value `a'.
2479 | The operation is performed according to the IEC/IEEE Standard for Binary
2480 | Floating-Point Arithmetic.
2481 *----------------------------------------------------------------------------*/
2482 float32 float32_log2( float32 a STATUS_PARAM )
2483 {
2484     flag aSign, zSign;
2485     int_fast16_t aExp;
2486     uint32_t aSig, zSig, i;
2487
2488     a = float32_squash_input_denormal(a STATUS_VAR);
2489     aSig = extractFloat32Frac( a );
2490     aExp = extractFloat32Exp( a );
2491     aSign = extractFloat32Sign( a );
2492
2493     if ( aExp == 0 ) {
2494         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2495         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2496     }
2497     if ( aSign ) {
2498         float_raise( float_flag_invalid STATUS_VAR);
2499         return float32_default_nan;
2500     }
2501     if ( aExp == 0xFF ) {
2502         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2503         return a;
2504     }
2505
2506     aExp -= 0x7F;
2507     aSig |= 0x00800000;
2508     zSign = aExp < 0;
2509     zSig = aExp << 23;
2510
2511     for (i = 1 << 22; i > 0; i >>= 1) {
2512         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2513         if ( aSig & 0x01000000 ) {
2514             aSig >>= 1;
2515             zSig |= i;
2516         }
2517     }
2518
2519     if ( zSign )
2520         zSig = -zSig;
2521
2522     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2523 }
2524
2525 /*----------------------------------------------------------------------------
2526 | Returns 1 if the single-precision floating-point value `a' is equal to
2527 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2528 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2529 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2530 *----------------------------------------------------------------------------*/
2531
2532 int float32_eq( float32 a, float32 b STATUS_PARAM )
2533 {
2534     uint32_t av, bv;
2535     a = float32_squash_input_denormal(a STATUS_VAR);
2536     b = float32_squash_input_denormal(b STATUS_VAR);
2537
2538     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2539          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2540        ) {
2541         float_raise( float_flag_invalid STATUS_VAR);
2542         return 0;
2543     }
2544     av = float32_val(a);
2545     bv = float32_val(b);
2546     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2547 }
2548
2549 /*----------------------------------------------------------------------------
2550 | Returns 1 if the single-precision floating-point value `a' is less than
2551 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2552 | exception is raised if either operand is a NaN.  The comparison is performed
2553 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2554 *----------------------------------------------------------------------------*/
2555
2556 int float32_le( float32 a, float32 b STATUS_PARAM )
2557 {
2558     flag aSign, bSign;
2559     uint32_t av, bv;
2560     a = float32_squash_input_denormal(a STATUS_VAR);
2561     b = float32_squash_input_denormal(b STATUS_VAR);
2562
2563     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2564          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2565        ) {
2566         float_raise( float_flag_invalid STATUS_VAR);
2567         return 0;
2568     }
2569     aSign = extractFloat32Sign( a );
2570     bSign = extractFloat32Sign( b );
2571     av = float32_val(a);
2572     bv = float32_val(b);
2573     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2574     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2575
2576 }
2577
2578 /*----------------------------------------------------------------------------
2579 | Returns 1 if the single-precision floating-point value `a' is less than
2580 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2581 | raised if either operand is a NaN.  The comparison is performed according
2582 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2583 *----------------------------------------------------------------------------*/
2584
2585 int float32_lt( float32 a, float32 b STATUS_PARAM )
2586 {
2587     flag aSign, bSign;
2588     uint32_t av, bv;
2589     a = float32_squash_input_denormal(a STATUS_VAR);
2590     b = float32_squash_input_denormal(b STATUS_VAR);
2591
2592     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2593          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2594        ) {
2595         float_raise( float_flag_invalid STATUS_VAR);
2596         return 0;
2597     }
2598     aSign = extractFloat32Sign( a );
2599     bSign = extractFloat32Sign( b );
2600     av = float32_val(a);
2601     bv = float32_val(b);
2602     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2603     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2604
2605 }
2606
2607 /*----------------------------------------------------------------------------
2608 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2609 | be compared, and 0 otherwise.  The invalid exception is raised if either
2610 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2611 | Standard for Binary Floating-Point Arithmetic.
2612 *----------------------------------------------------------------------------*/
2613
2614 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2615 {
2616     a = float32_squash_input_denormal(a STATUS_VAR);
2617     b = float32_squash_input_denormal(b STATUS_VAR);
2618
2619     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2620          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2621        ) {
2622         float_raise( float_flag_invalid STATUS_VAR);
2623         return 1;
2624     }
2625     return 0;
2626 }
2627
2628 /*----------------------------------------------------------------------------
2629 | Returns 1 if the single-precision floating-point value `a' is equal to
2630 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2631 | exception.  The comparison is performed according to the IEC/IEEE Standard
2632 | for Binary Floating-Point Arithmetic.
2633 *----------------------------------------------------------------------------*/
2634
2635 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2636 {
2637     a = float32_squash_input_denormal(a STATUS_VAR);
2638     b = float32_squash_input_denormal(b STATUS_VAR);
2639
2640     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2641          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2642        ) {
2643         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2644             float_raise( float_flag_invalid STATUS_VAR);
2645         }
2646         return 0;
2647     }
2648     return ( float32_val(a) == float32_val(b) ) ||
2649             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2650 }
2651
2652 /*----------------------------------------------------------------------------
2653 | Returns 1 if the single-precision floating-point value `a' is less than or
2654 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2655 | cause an exception.  Otherwise, the comparison is performed according to the
2656 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2657 *----------------------------------------------------------------------------*/
2658
2659 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2660 {
2661     flag aSign, bSign;
2662     uint32_t av, bv;
2663     a = float32_squash_input_denormal(a STATUS_VAR);
2664     b = float32_squash_input_denormal(b STATUS_VAR);
2665
2666     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2667          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2668        ) {
2669         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2670             float_raise( float_flag_invalid STATUS_VAR);
2671         }
2672         return 0;
2673     }
2674     aSign = extractFloat32Sign( a );
2675     bSign = extractFloat32Sign( b );
2676     av = float32_val(a);
2677     bv = float32_val(b);
2678     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2679     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2680
2681 }
2682
2683 /*----------------------------------------------------------------------------
2684 | Returns 1 if the single-precision floating-point value `a' is less than
2685 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2686 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2687 | Standard for Binary Floating-Point Arithmetic.
2688 *----------------------------------------------------------------------------*/
2689
2690 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2691 {
2692     flag aSign, bSign;
2693     uint32_t av, bv;
2694     a = float32_squash_input_denormal(a STATUS_VAR);
2695     b = float32_squash_input_denormal(b STATUS_VAR);
2696
2697     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2698          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2699        ) {
2700         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2701             float_raise( float_flag_invalid STATUS_VAR);
2702         }
2703         return 0;
2704     }
2705     aSign = extractFloat32Sign( a );
2706     bSign = extractFloat32Sign( b );
2707     av = float32_val(a);
2708     bv = float32_val(b);
2709     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2710     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2711
2712 }
2713
2714 /*----------------------------------------------------------------------------
2715 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2716 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2717 | comparison is performed according to the IEC/IEEE Standard for Binary
2718 | Floating-Point Arithmetic.
2719 *----------------------------------------------------------------------------*/
2720
2721 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2722 {
2723     a = float32_squash_input_denormal(a STATUS_VAR);
2724     b = float32_squash_input_denormal(b STATUS_VAR);
2725
2726     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2727          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2728        ) {
2729         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2730             float_raise( float_flag_invalid STATUS_VAR);
2731         }
2732         return 1;
2733     }
2734     return 0;
2735 }
2736
2737 /*----------------------------------------------------------------------------
2738 | Returns the result of converting the double-precision floating-point value
2739 | `a' to the 32-bit two's complement integer format.  The conversion is
2740 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2741 | Arithmetic---which means in particular that the conversion is rounded
2742 | according to the current rounding mode.  If `a' is a NaN, the largest
2743 | positive integer is returned.  Otherwise, if the conversion overflows, the
2744 | largest integer with the same sign as `a' is returned.
2745 *----------------------------------------------------------------------------*/
2746
2747 int32 float64_to_int32( float64 a STATUS_PARAM )
2748 {
2749     flag aSign;
2750     int_fast16_t aExp, shiftCount;
2751     uint64_t aSig;
2752     a = float64_squash_input_denormal(a STATUS_VAR);
2753
2754     aSig = extractFloat64Frac( a );
2755     aExp = extractFloat64Exp( a );
2756     aSign = extractFloat64Sign( a );
2757     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2758     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2759     shiftCount = 0x42C - aExp;
2760     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2761     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2762
2763 }
2764
2765 /*----------------------------------------------------------------------------
2766 | Returns the result of converting the double-precision floating-point value
2767 | `a' to the 32-bit two's complement integer format.  The conversion is
2768 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2769 | Arithmetic, except that the conversion is always rounded toward zero.
2770 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2771 | the conversion overflows, the largest integer with the same sign as `a' is
2772 | returned.
2773 *----------------------------------------------------------------------------*/
2774
2775 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2776 {
2777     flag aSign;
2778     int_fast16_t aExp, shiftCount;
2779     uint64_t aSig, savedASig;
2780     int32_t z;
2781     a = float64_squash_input_denormal(a STATUS_VAR);
2782
2783     aSig = extractFloat64Frac( a );
2784     aExp = extractFloat64Exp( a );
2785     aSign = extractFloat64Sign( a );
2786     if ( 0x41E < aExp ) {
2787         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2788         goto invalid;
2789     }
2790     else if ( aExp < 0x3FF ) {
2791         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2792         return 0;
2793     }
2794     aSig |= LIT64( 0x0010000000000000 );
2795     shiftCount = 0x433 - aExp;
2796     savedASig = aSig;
2797     aSig >>= shiftCount;
2798     z = aSig;
2799     if ( aSign ) z = - z;
2800     if ( ( z < 0 ) ^ aSign ) {
2801  invalid:
2802         float_raise( float_flag_invalid STATUS_VAR);
2803         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2804     }
2805     if ( ( aSig<<shiftCount ) != savedASig ) {
2806         STATUS(float_exception_flags) |= float_flag_inexact;
2807     }
2808     return z;
2809
2810 }
2811
2812 /*----------------------------------------------------------------------------
2813 | Returns the result of converting the double-precision floating-point value
2814 | `a' to the 16-bit two's complement integer format.  The conversion is
2815 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2816 | Arithmetic, except that the conversion is always rounded toward zero.
2817 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2818 | the conversion overflows, the largest integer with the same sign as `a' is
2819 | returned.
2820 *----------------------------------------------------------------------------*/
2821
2822 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2823 {
2824     flag aSign;
2825     int_fast16_t aExp, shiftCount;
2826     uint64_t aSig, savedASig;
2827     int32 z;
2828
2829     aSig = extractFloat64Frac( a );
2830     aExp = extractFloat64Exp( a );
2831     aSign = extractFloat64Sign( a );
2832     if ( 0x40E < aExp ) {
2833         if ( ( aExp == 0x7FF ) && aSig ) {
2834             aSign = 0;
2835         }
2836         goto invalid;
2837     }
2838     else if ( aExp < 0x3FF ) {
2839         if ( aExp || aSig ) {
2840             STATUS(float_exception_flags) |= float_flag_inexact;
2841         }
2842         return 0;
2843     }
2844     aSig |= LIT64( 0x0010000000000000 );
2845     shiftCount = 0x433 - aExp;
2846     savedASig = aSig;
2847     aSig >>= shiftCount;
2848     z = aSig;
2849     if ( aSign ) {
2850         z = - z;
2851     }
2852     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2853  invalid:
2854         float_raise( float_flag_invalid STATUS_VAR);
2855         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2856     }
2857     if ( ( aSig<<shiftCount ) != savedASig ) {
2858         STATUS(float_exception_flags) |= float_flag_inexact;
2859     }
2860     return z;
2861 }
2862
2863 /*----------------------------------------------------------------------------
2864 | Returns the result of converting the double-precision floating-point value
2865 | `a' to the 64-bit two's complement integer format.  The conversion is
2866 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2867 | Arithmetic---which means in particular that the conversion is rounded
2868 | according to the current rounding mode.  If `a' is a NaN, the largest
2869 | positive integer is returned.  Otherwise, if the conversion overflows, the
2870 | largest integer with the same sign as `a' is returned.
2871 *----------------------------------------------------------------------------*/
2872
2873 int64 float64_to_int64( float64 a STATUS_PARAM )
2874 {
2875     flag aSign;
2876     int_fast16_t aExp, shiftCount;
2877     uint64_t aSig, aSigExtra;
2878     a = float64_squash_input_denormal(a STATUS_VAR);
2879
2880     aSig = extractFloat64Frac( a );
2881     aExp = extractFloat64Exp( a );
2882     aSign = extractFloat64Sign( a );
2883     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2884     shiftCount = 0x433 - aExp;
2885     if ( shiftCount <= 0 ) {
2886         if ( 0x43E < aExp ) {
2887             float_raise( float_flag_invalid STATUS_VAR);
2888             if (    ! aSign
2889                  || (    ( aExp == 0x7FF )
2890                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2891                ) {
2892                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2893             }
2894             return (int64_t) LIT64( 0x8000000000000000 );
2895         }
2896         aSigExtra = 0;
2897         aSig <<= - shiftCount;
2898     }
2899     else {
2900         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2901     }
2902     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2903
2904 }
2905
2906 /*----------------------------------------------------------------------------
2907 | Returns the result of converting the double-precision floating-point value
2908 | `a' to the 64-bit two's complement integer format.  The conversion is
2909 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2910 | Arithmetic, except that the conversion is always rounded toward zero.
2911 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2912 | the conversion overflows, the largest integer with the same sign as `a' is
2913 | returned.
2914 *----------------------------------------------------------------------------*/
2915
2916 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2917 {
2918     flag aSign;
2919     int_fast16_t aExp, shiftCount;
2920     uint64_t aSig;
2921     int64 z;
2922     a = float64_squash_input_denormal(a STATUS_VAR);
2923
2924     aSig = extractFloat64Frac( a );
2925     aExp = extractFloat64Exp( a );
2926     aSign = extractFloat64Sign( a );
2927     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2928     shiftCount = aExp - 0x433;
2929     if ( 0 <= shiftCount ) {
2930         if ( 0x43E <= aExp ) {
2931             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2932                 float_raise( float_flag_invalid STATUS_VAR);
2933                 if (    ! aSign
2934                      || (    ( aExp == 0x7FF )
2935                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2936                    ) {
2937                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2938                 }
2939             }
2940             return (int64_t) LIT64( 0x8000000000000000 );
2941         }
2942         z = aSig<<shiftCount;
2943     }
2944     else {
2945         if ( aExp < 0x3FE ) {
2946             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2947             return 0;
2948         }
2949         z = aSig>>( - shiftCount );
2950         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2951             STATUS(float_exception_flags) |= float_flag_inexact;
2952         }
2953     }
2954     if ( aSign ) z = - z;
2955     return z;
2956
2957 }
2958
2959 /*----------------------------------------------------------------------------
2960 | Returns the result of converting the double-precision floating-point value
2961 | `a' to the single-precision floating-point format.  The conversion is
2962 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2963 | Arithmetic.
2964 *----------------------------------------------------------------------------*/
2965
2966 float32 float64_to_float32( float64 a STATUS_PARAM )
2967 {
2968     flag aSign;
2969     int_fast16_t aExp;
2970     uint64_t aSig;
2971     uint32_t zSig;
2972     a = float64_squash_input_denormal(a STATUS_VAR);
2973
2974     aSig = extractFloat64Frac( a );
2975     aExp = extractFloat64Exp( a );
2976     aSign = extractFloat64Sign( a );
2977     if ( aExp == 0x7FF ) {
2978         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2979         return packFloat32( aSign, 0xFF, 0 );
2980     }
2981     shift64RightJamming( aSig, 22, &aSig );
2982     zSig = aSig;
2983     if ( aExp || zSig ) {
2984         zSig |= 0x40000000;
2985         aExp -= 0x381;
2986     }
2987     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2988
2989 }
2990
2991
2992 /*----------------------------------------------------------------------------
2993 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2994 | half-precision floating-point value, returning the result.  After being
2995 | shifted into the proper positions, the three fields are simply added
2996 | together to form the result.  This means that any integer portion of `zSig'
2997 | will be added into the exponent.  Since a properly normalized significand
2998 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2999 | than the desired result exponent whenever `zSig' is a complete, normalized
3000 | significand.
3001 *----------------------------------------------------------------------------*/
3002 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3003 {
3004     return make_float16(
3005         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3006 }
3007
3008 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3009    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3010
3011 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3012 {
3013     flag aSign;
3014     int_fast16_t aExp;
3015     uint32_t aSig;
3016
3017     aSign = extractFloat16Sign(a);
3018     aExp = extractFloat16Exp(a);
3019     aSig = extractFloat16Frac(a);
3020
3021     if (aExp == 0x1f && ieee) {
3022         if (aSig) {
3023             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3024         }
3025         return packFloat32(aSign, 0xff, 0);
3026     }
3027     if (aExp == 0) {
3028         int8 shiftCount;
3029
3030         if (aSig == 0) {
3031             return packFloat32(aSign, 0, 0);
3032         }
3033
3034         shiftCount = countLeadingZeros32( aSig ) - 21;
3035         aSig = aSig << shiftCount;
3036         aExp = -shiftCount;
3037     }
3038     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3039 }
3040
3041 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3042 {
3043     flag aSign;
3044     int_fast16_t aExp;
3045     uint32_t aSig;
3046     uint32_t mask;
3047     uint32_t increment;
3048     int8 roundingMode;
3049     a = float32_squash_input_denormal(a STATUS_VAR);
3050
3051     aSig = extractFloat32Frac( a );
3052     aExp = extractFloat32Exp( a );
3053     aSign = extractFloat32Sign( a );
3054     if ( aExp == 0xFF ) {
3055         if (aSig) {
3056             /* Input is a NaN */
3057             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3058             if (!ieee) {
3059                 return packFloat16(aSign, 0, 0);
3060             }
3061             return r;
3062         }
3063         /* Infinity */
3064         if (!ieee) {
3065             float_raise(float_flag_invalid STATUS_VAR);
3066             return packFloat16(aSign, 0x1f, 0x3ff);
3067         }
3068         return packFloat16(aSign, 0x1f, 0);
3069     }
3070     if (aExp == 0 && aSig == 0) {
3071         return packFloat16(aSign, 0, 0);
3072     }
3073     /* Decimal point between bits 22 and 23.  */
3074     aSig |= 0x00800000;
3075     aExp -= 0x7f;
3076     if (aExp < -14) {
3077         mask = 0x00ffffff;
3078         if (aExp >= -24) {
3079             mask >>= 25 + aExp;
3080         }
3081     } else {
3082         mask = 0x00001fff;
3083     }
3084     if (aSig & mask) {
3085         float_raise( float_flag_underflow STATUS_VAR );
3086         roundingMode = STATUS(float_rounding_mode);
3087         switch (roundingMode) {
3088         case float_round_nearest_even:
3089             increment = (mask + 1) >> 1;
3090             if ((aSig & mask) == increment) {
3091                 increment = aSig & (increment << 1);
3092             }
3093             break;
3094         case float_round_up:
3095             increment = aSign ? 0 : mask;
3096             break;
3097         case float_round_down:
3098             increment = aSign ? mask : 0;
3099             break;
3100         default: /* round_to_zero */
3101             increment = 0;
3102             break;
3103         }
3104         aSig += increment;
3105         if (aSig >= 0x01000000) {
3106             aSig >>= 1;
3107             aExp++;
3108         }
3109     } else if (aExp < -14
3110           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
3111         float_raise( float_flag_underflow STATUS_VAR);
3112     }
3113
3114     if (ieee) {
3115         if (aExp > 15) {
3116             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
3117             return packFloat16(aSign, 0x1f, 0);
3118         }
3119     } else {
3120         if (aExp > 16) {
3121             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
3122             return packFloat16(aSign, 0x1f, 0x3ff);
3123         }
3124     }
3125     if (aExp < -24) {
3126         return packFloat16(aSign, 0, 0);
3127     }
3128     if (aExp < -14) {
3129         aSig >>= -14 - aExp;
3130         aExp = -14;
3131     }
3132     return packFloat16(aSign, aExp + 14, aSig >> 13);
3133 }
3134
3135 /*----------------------------------------------------------------------------
3136 | Returns the result of converting the double-precision floating-point value
3137 | `a' to the extended double-precision floating-point format.  The conversion
3138 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3139 | Arithmetic.
3140 *----------------------------------------------------------------------------*/
3141
3142 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3143 {
3144     flag aSign;
3145     int_fast16_t aExp;
3146     uint64_t aSig;
3147
3148     a = float64_squash_input_denormal(a STATUS_VAR);
3149     aSig = extractFloat64Frac( a );
3150     aExp = extractFloat64Exp( a );
3151     aSign = extractFloat64Sign( a );
3152     if ( aExp == 0x7FF ) {
3153         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3154         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3155     }
3156     if ( aExp == 0 ) {
3157         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3158         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3159     }
3160     return
3161         packFloatx80(
3162             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3163
3164 }
3165
3166 /*----------------------------------------------------------------------------
3167 | Returns the result of converting the double-precision floating-point value
3168 | `a' to the quadruple-precision floating-point format.  The conversion is
3169 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3170 | Arithmetic.
3171 *----------------------------------------------------------------------------*/
3172
3173 float128 float64_to_float128( float64 a STATUS_PARAM )
3174 {
3175     flag aSign;
3176     int_fast16_t aExp;
3177     uint64_t aSig, zSig0, zSig1;
3178
3179     a = float64_squash_input_denormal(a STATUS_VAR);
3180     aSig = extractFloat64Frac( a );
3181     aExp = extractFloat64Exp( a );
3182     aSign = extractFloat64Sign( a );
3183     if ( aExp == 0x7FF ) {
3184         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3185         return packFloat128( aSign, 0x7FFF, 0, 0 );
3186     }
3187     if ( aExp == 0 ) {
3188         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3189         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3190         --aExp;
3191     }
3192     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3193     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3194
3195 }
3196
3197 /*----------------------------------------------------------------------------
3198 | Rounds the double-precision floating-point value `a' to an integer, and
3199 | returns the result as a double-precision floating-point value.  The
3200 | operation is performed according to the IEC/IEEE Standard for Binary
3201 | Floating-Point Arithmetic.
3202 *----------------------------------------------------------------------------*/
3203
3204 float64 float64_round_to_int( float64 a STATUS_PARAM )
3205 {
3206     flag aSign;
3207     int_fast16_t aExp;
3208     uint64_t lastBitMask, roundBitsMask;
3209     int8 roundingMode;
3210     uint64_t z;
3211     a = float64_squash_input_denormal(a STATUS_VAR);
3212
3213     aExp = extractFloat64Exp( a );
3214     if ( 0x433 <= aExp ) {
3215         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3216             return propagateFloat64NaN( a, a STATUS_VAR );
3217         }
3218         return a;
3219     }
3220     if ( aExp < 0x3FF ) {
3221         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3222         STATUS(float_exception_flags) |= float_flag_inexact;
3223         aSign = extractFloat64Sign( a );
3224         switch ( STATUS(float_rounding_mode) ) {
3225          case float_round_nearest_even:
3226             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3227                 return packFloat64( aSign, 0x3FF, 0 );
3228             }
3229             break;
3230          case float_round_down:
3231             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3232          case float_round_up:
3233             return make_float64(
3234             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3235         }
3236         return packFloat64( aSign, 0, 0 );
3237     }
3238     lastBitMask = 1;
3239     lastBitMask <<= 0x433 - aExp;
3240     roundBitsMask = lastBitMask - 1;
3241     z = float64_val(a);
3242     roundingMode = STATUS(float_rounding_mode);
3243     if ( roundingMode == float_round_nearest_even ) {
3244         z += lastBitMask>>1;
3245         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3246     }
3247     else if ( roundingMode != float_round_to_zero ) {
3248         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3249             z += roundBitsMask;
3250         }
3251     }
3252     z &= ~ roundBitsMask;
3253     if ( z != float64_val(a) )
3254         STATUS(float_exception_flags) |= float_flag_inexact;
3255     return make_float64(z);
3256
3257 }
3258
3259 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3260 {
3261     int oldmode;
3262     float64 res;
3263     oldmode = STATUS(float_rounding_mode);
3264     STATUS(float_rounding_mode) = float_round_to_zero;
3265     res = float64_round_to_int(a STATUS_VAR);
3266     STATUS(float_rounding_mode) = oldmode;
3267     return res;
3268 }
3269
3270 /*----------------------------------------------------------------------------
3271 | Returns the result of adding the absolute values of the double-precision
3272 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3273 | before being returned.  `zSign' is ignored if the result is a NaN.
3274 | The addition is performed according to the IEC/IEEE Standard for Binary
3275 | Floating-Point Arithmetic.
3276 *----------------------------------------------------------------------------*/
3277
3278 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3279 {
3280     int_fast16_t aExp, bExp, zExp;
3281     uint64_t aSig, bSig, zSig;
3282     int_fast16_t expDiff;
3283
3284     aSig = extractFloat64Frac( a );
3285     aExp = extractFloat64Exp( a );
3286     bSig = extractFloat64Frac( b );
3287     bExp = extractFloat64Exp( b );
3288     expDiff = aExp - bExp;
3289     aSig <<= 9;
3290     bSig <<= 9;
3291     if ( 0 < expDiff ) {
3292         if ( aExp == 0x7FF ) {
3293             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3294             return a;
3295         }
3296         if ( bExp == 0 ) {
3297             --expDiff;
3298         }
3299         else {
3300             bSig |= LIT64( 0x2000000000000000 );
3301         }
3302         shift64RightJamming( bSig, expDiff, &bSig );
3303         zExp = aExp;
3304     }
3305     else if ( expDiff < 0 ) {
3306         if ( bExp == 0x7FF ) {
3307             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3308             return packFloat64( zSign, 0x7FF, 0 );
3309         }
3310         if ( aExp == 0 ) {
3311             ++expDiff;
3312         }
3313         else {
3314             aSig |= LIT64( 0x2000000000000000 );
3315         }
3316         shift64RightJamming( aSig, - expDiff, &aSig );
3317         zExp = bExp;
3318     }
3319     else {
3320         if ( aExp == 0x7FF ) {
3321             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3322             return a;
3323         }
3324         if ( aExp == 0 ) {
3325             if (STATUS(flush_to_zero)) {
3326                 if (aSig | bSig) {
3327                     float_raise(float_flag_output_denormal STATUS_VAR);
3328                 }
3329                 return packFloat64(zSign, 0, 0);
3330             }
3331             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3332         }
3333         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3334         zExp = aExp;
3335         goto roundAndPack;
3336     }
3337     aSig |= LIT64( 0x2000000000000000 );
3338     zSig = ( aSig + bSig )<<1;
3339     --zExp;
3340     if ( (int64_t) zSig < 0 ) {
3341         zSig = aSig + bSig;
3342         ++zExp;
3343     }
3344  roundAndPack:
3345     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3346
3347 }
3348
3349 /*----------------------------------------------------------------------------
3350 | Returns the result of subtracting the absolute values of the double-
3351 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3352 | difference is negated before being returned.  `zSign' is ignored if the
3353 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3354 | Standard for Binary Floating-Point Arithmetic.
3355 *----------------------------------------------------------------------------*/
3356
3357 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3358 {
3359     int_fast16_t aExp, bExp, zExp;
3360     uint64_t aSig, bSig, zSig;
3361     int_fast16_t expDiff;
3362
3363     aSig = extractFloat64Frac( a );
3364     aExp = extractFloat64Exp( a );
3365     bSig = extractFloat64Frac( b );
3366     bExp = extractFloat64Exp( b );
3367     expDiff = aExp - bExp;
3368     aSig <<= 10;
3369     bSig <<= 10;
3370     if ( 0 < expDiff ) goto aExpBigger;
3371     if ( expDiff < 0 ) goto bExpBigger;
3372     if ( aExp == 0x7FF ) {
3373         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3374         float_raise( float_flag_invalid STATUS_VAR);
3375         return float64_default_nan;
3376     }
3377     if ( aExp == 0 ) {
3378         aExp = 1;
3379         bExp = 1;
3380     }
3381     if ( bSig < aSig ) goto aBigger;
3382     if ( aSig < bSig ) goto bBigger;
3383     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3384  bExpBigger:
3385     if ( bExp == 0x7FF ) {
3386         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3387         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3388     }
3389     if ( aExp == 0 ) {
3390         ++expDiff;
3391     }
3392     else {
3393         aSig |= LIT64( 0x4000000000000000 );
3394     }
3395     shift64RightJamming( aSig, - expDiff, &aSig );
3396     bSig |= LIT64( 0x4000000000000000 );
3397  bBigger:
3398     zSig = bSig - aSig;
3399     zExp = bExp;
3400     zSign ^= 1;
3401     goto normalizeRoundAndPack;
3402  aExpBigger:
3403     if ( aExp == 0x7FF ) {
3404         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3405         return a;
3406     }
3407     if ( bExp == 0 ) {
3408         --expDiff;
3409     }
3410     else {
3411         bSig |= LIT64( 0x4000000000000000 );
3412     }
3413     shift64RightJamming( bSig, expDiff, &bSig );
3414     aSig |= LIT64( 0x4000000000000000 );
3415  aBigger:
3416     zSig = aSig - bSig;
3417     zExp = aExp;
3418  normalizeRoundAndPack:
3419     --zExp;
3420     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3421
3422 }
3423
3424 /*----------------------------------------------------------------------------
3425 | Returns the result of adding the double-precision floating-point values `a'
3426 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3427 | Binary Floating-Point Arithmetic.
3428 *----------------------------------------------------------------------------*/
3429
3430 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3431 {
3432     flag aSign, bSign;
3433     a = float64_squash_input_denormal(a STATUS_VAR);
3434     b = float64_squash_input_denormal(b STATUS_VAR);
3435
3436     aSign = extractFloat64Sign( a );
3437     bSign = extractFloat64Sign( b );
3438     if ( aSign == bSign ) {
3439         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3440     }
3441     else {
3442         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3443     }
3444
3445 }
3446
3447 /*----------------------------------------------------------------------------
3448 | Returns the result of subtracting the double-precision floating-point values
3449 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3450 | for Binary Floating-Point Arithmetic.
3451 *----------------------------------------------------------------------------*/
3452
3453 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3454 {
3455     flag aSign, bSign;
3456     a = float64_squash_input_denormal(a STATUS_VAR);
3457     b = float64_squash_input_denormal(b STATUS_VAR);
3458
3459     aSign = extractFloat64Sign( a );
3460     bSign = extractFloat64Sign( b );
3461     if ( aSign == bSign ) {
3462         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3463     }
3464     else {
3465         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3466     }
3467
3468 }
3469
3470 /*----------------------------------------------------------------------------
3471 | Returns the result of multiplying the double-precision floating-point values
3472 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3473 | for Binary Floating-Point Arithmetic.
3474 *----------------------------------------------------------------------------*/
3475
3476 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3477 {
3478     flag aSign, bSign, zSign;
3479     int_fast16_t aExp, bExp, zExp;
3480     uint64_t aSig, bSig, zSig0, zSig1;
3481
3482     a = float64_squash_input_denormal(a STATUS_VAR);
3483     b = float64_squash_input_denormal(b STATUS_VAR);
3484
3485     aSig = extractFloat64Frac( a );
3486     aExp = extractFloat64Exp( a );
3487     aSign = extractFloat64Sign( a );
3488     bSig = extractFloat64Frac( b );
3489     bExp = extractFloat64Exp( b );
3490     bSign = extractFloat64Sign( b );
3491     zSign = aSign ^ bSign;
3492     if ( aExp == 0x7FF ) {
3493         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3494             return propagateFloat64NaN( a, b STATUS_VAR );
3495         }
3496         if ( ( bExp | bSig ) == 0 ) {
3497             float_raise( float_flag_invalid STATUS_VAR);
3498             return float64_default_nan;
3499         }
3500         return packFloat64( zSign, 0x7FF, 0 );
3501     }
3502     if ( bExp == 0x7FF ) {
3503         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3504         if ( ( aExp | aSig ) == 0 ) {
3505             float_raise( float_flag_invalid STATUS_VAR);
3506             return float64_default_nan;
3507         }
3508         return packFloat64( zSign, 0x7FF, 0 );
3509     }
3510     if ( aExp == 0 ) {
3511         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3512         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3513     }
3514     if ( bExp == 0 ) {
3515         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3516         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3517     }
3518     zExp = aExp + bExp - 0x3FF;
3519     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3520     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3521     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3522     zSig0 |= ( zSig1 != 0 );
3523     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3524         zSig0 <<= 1;
3525         --zExp;
3526     }
3527     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3528
3529 }
3530
3531 /*----------------------------------------------------------------------------
3532 | Returns the result of dividing the double-precision floating-point value `a'
3533 | by the corresponding value `b'.  The operation is performed according to
3534 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3535 *----------------------------------------------------------------------------*/
3536
3537 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3538 {
3539     flag aSign, bSign, zSign;
3540     int_fast16_t aExp, bExp, zExp;
3541     uint64_t aSig, bSig, zSig;
3542     uint64_t rem0, rem1;
3543     uint64_t term0, term1;
3544     a = float64_squash_input_denormal(a STATUS_VAR);
3545     b = float64_squash_input_denormal(b STATUS_VAR);
3546
3547     aSig = extractFloat64Frac( a );
3548     aExp = extractFloat64Exp( a );
3549     aSign = extractFloat64Sign( a );
3550     bSig = extractFloat64Frac( b );
3551     bExp = extractFloat64Exp( b );
3552     bSign = extractFloat64Sign( b );
3553     zSign = aSign ^ bSign;
3554     if ( aExp == 0x7FF ) {
3555         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3556         if ( bExp == 0x7FF ) {
3557             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3558             float_raise( float_flag_invalid STATUS_VAR);
3559             return float64_default_nan;
3560         }
3561         return packFloat64( zSign, 0x7FF, 0 );
3562     }
3563     if ( bExp == 0x7FF ) {
3564         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3565         return packFloat64( zSign, 0, 0 );
3566     }
3567     if ( bExp == 0 ) {
3568         if ( bSig == 0 ) {
3569             if ( ( aExp | aSig ) == 0 ) {
3570                 float_raise( float_flag_invalid STATUS_VAR);
3571                 return float64_default_nan;
3572             }
3573             float_raise( float_flag_divbyzero STATUS_VAR);
3574             return packFloat64( zSign, 0x7FF, 0 );
3575         }
3576         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3577     }
3578     if ( aExp == 0 ) {
3579         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3580         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3581     }
3582     zExp = aExp - bExp + 0x3FD;
3583     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3584     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3585     if ( bSig <= ( aSig + aSig ) ) {
3586         aSig >>= 1;
3587         ++zExp;
3588     }
3589     zSig = estimateDiv128To64( aSig, 0, bSig );
3590     if ( ( zSig & 0x1FF ) <= 2 ) {
3591         mul64To128( bSig, zSig, &term0, &term1 );
3592         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3593         while ( (int64_t) rem0 < 0 ) {
3594             --zSig;
3595             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3596         }
3597         zSig |= ( rem1 != 0 );
3598     }
3599     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3600
3601 }
3602
3603 /*----------------------------------------------------------------------------
3604 | Returns the remainder of the double-precision floating-point value `a'
3605 | with respect to the corresponding value `b'.  The operation is performed
3606 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3607 *----------------------------------------------------------------------------*/
3608
3609 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3610 {
3611     flag aSign, zSign;
3612     int_fast16_t aExp, bExp, expDiff;
3613     uint64_t aSig, bSig;
3614     uint64_t q, alternateASig;
3615     int64_t sigMean;
3616
3617     a = float64_squash_input_denormal(a STATUS_VAR);
3618     b = float64_squash_input_denormal(b STATUS_VAR);
3619     aSig = extractFloat64Frac( a );
3620     aExp = extractFloat64Exp( a );
3621     aSign = extractFloat64Sign( a );
3622     bSig = extractFloat64Frac( b );
3623     bExp = extractFloat64Exp( b );
3624     if ( aExp == 0x7FF ) {
3625         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3626             return propagateFloat64NaN( a, b STATUS_VAR );
3627         }
3628         float_raise( float_flag_invalid STATUS_VAR);
3629         return float64_default_nan;
3630     }
3631     if ( bExp == 0x7FF ) {
3632         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3633         return a;
3634     }
3635     if ( bExp == 0 ) {
3636         if ( bSig == 0 ) {
3637             float_raise( float_flag_invalid STATUS_VAR);
3638             return float64_default_nan;
3639         }
3640         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3641     }
3642     if ( aExp == 0 ) {
3643         if ( aSig == 0 ) return a;
3644         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3645     }
3646     expDiff = aExp - bExp;
3647     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3648     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3649     if ( expDiff < 0 ) {
3650         if ( expDiff < -1 ) return a;
3651         aSig >>= 1;
3652     }
3653     q = ( bSig <= aSig );
3654     if ( q ) aSig -= bSig;
3655     expDiff -= 64;
3656     while ( 0 < expDiff ) {
3657         q = estimateDiv128To64( aSig, 0, bSig );
3658         q = ( 2 < q ) ? q - 2 : 0;
3659         aSig = - ( ( bSig>>2 ) * q );
3660         expDiff -= 62;
3661     }
3662     expDiff += 64;
3663     if ( 0 < expDiff ) {
3664         q = estimateDiv128To64( aSig, 0, bSig );
3665         q = ( 2 < q ) ? q - 2 : 0;
3666         q >>= 64 - expDiff;
3667         bSig >>= 2;
3668         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3669     }
3670     else {
3671         aSig >>= 2;
3672         bSig >>= 2;
3673     }
3674     do {
3675         alternateASig = aSig;
3676         ++q;
3677         aSig -= bSig;
3678     } while ( 0 <= (int64_t) aSig );
3679     sigMean = aSig + alternateASig;
3680     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3681         aSig = alternateASig;
3682     }
3683     zSign = ( (int64_t) aSig < 0 );
3684     if ( zSign ) aSig = - aSig;
3685     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3686
3687 }
3688
3689 /*----------------------------------------------------------------------------
3690 | Returns the result of multiplying the double-precision floating-point values
3691 | `a' and `b' then adding 'c', with no intermediate rounding step after the
3692 | multiplication.  The operation is performed according to the IEC/IEEE
3693 | Standard for Binary Floating-Point Arithmetic 754-2008.
3694 | The flags argument allows the caller to select negation of the
3695 | addend, the intermediate product, or the final result. (The difference
3696 | between this and having the caller do a separate negation is that negating
3697 | externally will flip the sign bit on NaNs.)
3698 *----------------------------------------------------------------------------*/
3699
3700 float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3701 {
3702     flag aSign, bSign, cSign, zSign;
3703     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
3704     uint64_t aSig, bSig, cSig;
3705     flag pInf, pZero, pSign;
3706     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3707     int shiftcount;
3708     flag signflip, infzero;
3709
3710     a = float64_squash_input_denormal(a STATUS_VAR);
3711     b = float64_squash_input_denormal(b STATUS_VAR);
3712     c = float64_squash_input_denormal(c STATUS_VAR);
3713     aSig = extractFloat64Frac(a);
3714     aExp = extractFloat64Exp(a);
3715     aSign = extractFloat64Sign(a);
3716     bSig = extractFloat64Frac(b);
3717     bExp = extractFloat64Exp(b);
3718     bSign = extractFloat64Sign(b);
3719     cSig = extractFloat64Frac(c);
3720     cExp = extractFloat64Exp(c);
3721     cSign = extractFloat64Sign(c);
3722
3723     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3724                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3725
3726     /* It is implementation-defined whether the cases of (0,inf,qnan)
3727      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3728      * they return if they do), so we have to hand this information
3729      * off to the target-specific pick-a-NaN routine.
3730      */
3731     if (((aExp == 0x7ff) && aSig) ||
3732         ((bExp == 0x7ff) && bSig) ||
3733         ((cExp == 0x7ff) && cSig)) {
3734         return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3735     }
3736
3737     if (infzero) {
3738         float_raise(float_flag_invalid STATUS_VAR);
3739         return float64_default_nan;
3740     }
3741
3742     if (flags & float_muladd_negate_c) {
3743         cSign ^= 1;
3744     }
3745
3746     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3747
3748     /* Work out the sign and type of the product */
3749     pSign = aSign ^ bSign;
3750     if (flags & float_muladd_negate_product) {
3751         pSign ^= 1;
3752     }
3753     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3754     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3755
3756     if (cExp == 0x7ff) {
3757         if (pInf && (pSign ^ cSign)) {
3758             /* addition of opposite-signed infinities => InvalidOperation */
3759             float_raise(float_flag_invalid STATUS_VAR);
3760             return float64_default_nan;
3761         }
3762         /* Otherwise generate an infinity of the same sign */
3763         return packFloat64(cSign ^ signflip, 0x7ff, 0);
3764     }
3765
3766     if (pInf) {
3767         return packFloat64(pSign ^ signflip, 0x7ff, 0);
3768     }
3769
3770     if (pZero) {
3771         if (cExp == 0) {
3772             if (cSig == 0) {
3773                 /* Adding two exact zeroes */
3774                 if (pSign == cSign) {
3775                     zSign = pSign;
3776                 } else if (STATUS(float_rounding_mode) == float_round_down) {
3777                     zSign = 1;
3778                 } else {
3779                     zSign = 0;
3780                 }
3781                 return packFloat64(zSign ^ signflip, 0, 0);
3782             }
3783             /* Exact zero plus a denorm */
3784             if (STATUS(flush_to_zero)) {
3785                 float_raise(float_flag_output_denormal STATUS_VAR);
3786                 return packFloat64(cSign ^ signflip, 0, 0);
3787             }
3788         }
3789         /* Zero plus something non-zero : just return the something */
3790         return packFloat64(cSign ^ signflip, cExp, cSig);
3791     }
3792
3793     if (aExp == 0) {
3794         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3795     }
3796     if (bExp == 0) {
3797         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3798     }
3799
3800     /* Calculate the actual result a * b + c */
3801
3802     /* Multiply first; this is easy. */
3803     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3804      * because we want the true exponent, not the "one-less-than"
3805      * flavour that roundAndPackFloat64() takes.
3806      */
3807     pExp = aExp + bExp - 0x3fe;
3808     aSig = (aSig | LIT64(0x0010000000000000))<<10;
3809     bSig = (bSig | LIT64(0x0010000000000000))<<11;
3810     mul64To128(aSig, bSig, &pSig0, &pSig1);
3811     if ((int64_t)(pSig0 << 1) >= 0) {
3812         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3813         pExp--;
3814     }
3815
3816     zSign = pSign ^ signflip;
3817
3818     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3819      * bit in position 126.
3820      */
3821     if (cExp == 0) {
3822         if (!cSig) {
3823             /* Throw out the special case of c being an exact zero now */
3824             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3825             return roundAndPackFloat64(zSign, pExp - 1,
3826                                        pSig1 STATUS_VAR);
3827         }
3828         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3829     }
3830
3831     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3832      * significand of the addend, with the explicit bit in position 126.
3833      */
3834     cSig0 = cSig << (126 - 64 - 52);
3835     cSig1 = 0;
3836     cSig0 |= LIT64(0x4000000000000000);
3837     expDiff = pExp - cExp;
3838
3839     if (pSign == cSign) {
3840         /* Addition */
3841         if (expDiff > 0) {
3842             /* scale c to match p */
3843             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3844             zExp = pExp;
3845         } else if (expDiff < 0) {
3846             /* scale p to match c */
3847             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3848             zExp = cExp;
3849         } else {
3850             /* no scaling needed */
3851             zExp = cExp;
3852         }
3853         /* Add significands and make sure explicit bit ends up in posn 126 */
3854         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3855         if ((int64_t)zSig0 < 0) {
3856             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3857         } else {
3858             zExp--;
3859         }
3860         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3861         return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3862     } else {
3863         /* Subtraction */
3864         if (expDiff > 0) {
3865             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3866             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3867             zExp = pExp;
3868         } else if (expDiff < 0) {
3869             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3870             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3871             zExp = cExp;
3872             zSign ^= 1;
3873         } else {
3874             zExp = pExp;
3875             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3876                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3877             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
3878                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3879                 zSign ^= 1;
3880             } else {
3881                 /* Exact zero */
3882                 zSign = signflip;
3883                 if (STATUS(float_rounding_mode) == float_round_down) {
3884                     zSign ^= 1;
3885                 }
3886                 return packFloat64(zSign, 0, 0);
3887             }
3888         }
3889         --zExp;
3890         /* Do the equivalent of normalizeRoundAndPackFloat64() but
3891          * starting with the significand in a pair of uint64_t.
3892          */
3893         if (zSig0) {
3894             shiftcount = countLeadingZeros64(zSig0) - 1;
3895             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
3896             if (zSig1) {
3897                 zSig0 |= 1;
3898             }
3899             zExp -= shiftcount;
3900         } else {
3901             shiftcount = countLeadingZeros64(zSig1) - 1;
3902             zSig0 = zSig1 << shiftcount;
3903             zExp -= (shiftcount + 64);
3904         }
3905         return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
3906     }
3907 }
3908
3909 /*----------------------------------------------------------------------------
3910 | Returns the square root of the double-precision floating-point value `a'.
3911 | The operation is performed according to the IEC/IEEE Standard for Binary
3912 | Floating-Point Arithmetic.
3913 *----------------------------------------------------------------------------*/
3914
3915 float64 float64_sqrt( float64 a STATUS_PARAM )
3916 {
3917     flag aSign;
3918     int_fast16_t aExp, zExp;
3919     uint64_t aSig, zSig, doubleZSig;
3920     uint64_t rem0, rem1, term0, term1;
3921     a = float64_squash_input_denormal(a STATUS_VAR);
3922
3923     aSig = extractFloat64Frac( a );
3924     aExp = extractFloat64Exp( a );
3925     aSign = extractFloat64Sign( a );
3926     if ( aExp == 0x7FF ) {
3927         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3928         if ( ! aSign ) return a;
3929         float_raise( float_flag_invalid STATUS_VAR);
3930         return float64_default_nan;
3931     }
3932     if ( aSign ) {
3933         if ( ( aExp | aSig ) == 0 ) return a;
3934         float_raise( float_flag_invalid STATUS_VAR);
3935         return float64_default_nan;
3936     }
3937     if ( aExp == 0 ) {
3938         if ( aSig == 0 ) return float64_zero;
3939         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3940     }
3941     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3942     aSig |= LIT64( 0x0010000000000000 );
3943     zSig = estimateSqrt32( aExp, aSig>>21 );
3944     aSig <<= 9 - ( aExp & 1 );
3945     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3946     if ( ( zSig & 0x1FF ) <= 5 ) {
3947         doubleZSig = zSig<<1;
3948         mul64To128( zSig, zSig, &term0, &term1 );
3949         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3950         while ( (int64_t) rem0 < 0 ) {
3951             --zSig;
3952             doubleZSig -= 2;
3953             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3954         }
3955         zSig |= ( ( rem0 | rem1 ) != 0 );
3956     }
3957     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3958
3959 }
3960
3961 /*----------------------------------------------------------------------------
3962 | Returns the binary log of the double-precision floating-point value `a'.
3963 | The operation is performed according to the IEC/IEEE Standard for Binary
3964 | Floating-Point Arithmetic.
3965 *----------------------------------------------------------------------------*/
3966 float64 float64_log2( float64 a STATUS_PARAM )
3967 {
3968     flag aSign, zSign;
3969     int_fast16_t aExp;
3970     uint64_t aSig, aSig0, aSig1, zSig, i;
3971     a = float64_squash_input_denormal(a STATUS_VAR);
3972
3973     aSig = extractFloat64Frac( a );
3974     aExp = extractFloat64Exp( a );
3975     aSign = extractFloat64Sign( a );
3976
3977     if ( aExp == 0 ) {
3978         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3979         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3980     }
3981     if ( aSign ) {
3982         float_raise( float_flag_invalid STATUS_VAR);
3983         return float64_default_nan;
3984     }
3985     if ( aExp == 0x7FF ) {
3986         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3987         return a;
3988     }
3989
3990     aExp -= 0x3FF;
3991     aSig |= LIT64( 0x0010000000000000 );
3992     zSign = aExp < 0;
3993     zSig = (uint64_t)aExp << 52;
3994     for (i = 1LL << 51; i > 0; i >>= 1) {
3995         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3996         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3997         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3998             aSig >>= 1;
3999             zSig |= i;
4000         }
4001     }
4002
4003     if ( zSign )
4004         zSig = -zSig;
4005     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4006 }
4007
4008 /*----------------------------------------------------------------------------
4009 | Returns 1 if the double-precision floating-point value `a' is equal to the
4010 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4011 | if either operand is a NaN.  Otherwise, the comparison is performed
4012 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4013 *----------------------------------------------------------------------------*/
4014
4015 int float64_eq( float64 a, float64 b STATUS_PARAM )
4016 {
4017     uint64_t av, bv;
4018     a = float64_squash_input_denormal(a STATUS_VAR);
4019     b = float64_squash_input_denormal(b STATUS_VAR);
4020
4021     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4022          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4023        ) {
4024         float_raise( float_flag_invalid STATUS_VAR);
4025         return 0;
4026     }
4027     av = float64_val(a);
4028     bv = float64_val(b);
4029     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4030
4031 }
4032
4033 /*----------------------------------------------------------------------------
4034 | Returns 1 if the double-precision floating-point value `a' is less than or
4035 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4036 | exception is raised if either operand is a NaN.  The comparison is performed
4037 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4038 *----------------------------------------------------------------------------*/
4039
4040 int float64_le( float64 a, float64 b STATUS_PARAM )
4041 {
4042     flag aSign, bSign;
4043     uint64_t av, bv;
4044     a = float64_squash_input_denormal(a STATUS_VAR);
4045     b = float64_squash_input_denormal(b STATUS_VAR);
4046
4047     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4048          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4049        ) {
4050         float_raise( float_flag_invalid STATUS_VAR);
4051         return 0;
4052     }
4053     aSign = extractFloat64Sign( a );
4054     bSign = extractFloat64Sign( b );
4055     av = float64_val(a);
4056     bv = float64_val(b);
4057     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4058     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4059
4060 }
4061
4062 /*----------------------------------------------------------------------------
4063 | Returns 1 if the double-precision floating-point value `a' is less than
4064 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4065 | raised if either operand is a NaN.  The comparison is performed according
4066 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4067 *----------------------------------------------------------------------------*/
4068
4069 int float64_lt( float64 a, float64 b STATUS_PARAM )
4070 {
4071     flag aSign, bSign;
4072     uint64_t av, bv;
4073
4074     a = float64_squash_input_denormal(a STATUS_VAR);
4075     b = float64_squash_input_denormal(b STATUS_VAR);
4076     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4077          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4078        ) {
4079         float_raise( float_flag_invalid STATUS_VAR);
4080         return 0;
4081     }
4082     aSign = extractFloat64Sign( a );
4083     bSign = extractFloat64Sign( b );
4084     av = float64_val(a);
4085     bv = float64_val(b);
4086     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4087     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4088
4089 }
4090
4091 /*----------------------------------------------------------------------------
4092 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4093 | be compared, and 0 otherwise.  The invalid exception is raised if either
4094 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4095 | Standard for Binary Floating-Point Arithmetic.
4096 *----------------------------------------------------------------------------*/
4097
4098 int float64_unordered( float64 a, float64 b STATUS_PARAM )
4099 {
4100     a = float64_squash_input_denormal(a STATUS_VAR);
4101     b = float64_squash_input_denormal(b STATUS_VAR);
4102
4103     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4104          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4105        ) {
4106         float_raise( float_flag_invalid STATUS_VAR);
4107         return 1;
4108     }
4109     return 0;
4110 }
4111
4112 /*----------------------------------------------------------------------------
4113 | Returns 1 if the double-precision floating-point value `a' is equal to the
4114 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4115 | exception.The comparison is performed according to the IEC/IEEE Standard
4116 | for Binary Floating-Point Arithmetic.
4117 *----------------------------------------------------------------------------*/
4118
4119 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4120 {
4121     uint64_t av, bv;
4122     a = float64_squash_input_denormal(a STATUS_VAR);
4123     b = float64_squash_input_denormal(b STATUS_VAR);
4124
4125     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4126          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4127        ) {
4128         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4129             float_raise( float_flag_invalid STATUS_VAR);
4130         }
4131         return 0;
4132     }
4133     av = float64_val(a);
4134     bv = float64_val(b);
4135     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4136
4137 }
4138
4139 /*----------------------------------------------------------------------------
4140 | Returns 1 if the double-precision floating-point value `a' is less than or
4141 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4142 | cause an exception.  Otherwise, the comparison is performed according to the
4143 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4144 *----------------------------------------------------------------------------*/
4145
4146 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4147 {
4148     flag aSign, bSign;
4149     uint64_t av, bv;
4150     a = float64_squash_input_denormal(a STATUS_VAR);
4151     b = float64_squash_input_denormal(b STATUS_VAR);
4152
4153     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4154          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4155        ) {
4156         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4157             float_raise( float_flag_invalid STATUS_VAR);
4158         }
4159         return 0;
4160     }
4161     aSign = extractFloat64Sign( a );
4162     bSign = extractFloat64Sign( b );
4163     av = float64_val(a);
4164     bv = float64_val(b);
4165     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4166     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4167
4168 }
4169
4170 /*----------------------------------------------------------------------------
4171 | Returns 1 if the double-precision floating-point value `a' is less than
4172 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4173 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4174 | Standard for Binary Floating-Point Arithmetic.
4175 *----------------------------------------------------------------------------*/
4176
4177 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4178 {
4179     flag aSign, bSign;
4180     uint64_t av, bv;
4181     a = float64_squash_input_denormal(a STATUS_VAR);
4182     b = float64_squash_input_denormal(b STATUS_VAR);
4183
4184     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4185          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4186        ) {
4187         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4188             float_raise( float_flag_invalid STATUS_VAR);
4189         }
4190         return 0;
4191     }
4192     aSign = extractFloat64Sign( a );
4193     bSign = extractFloat64Sign( b );
4194     av = float64_val(a);
4195     bv = float64_val(b);
4196     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4197     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4198
4199 }
4200
4201 /*----------------------------------------------------------------------------
4202 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4203 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4204 | comparison is performed according to the IEC/IEEE Standard for Binary
4205 | Floating-Point Arithmetic.
4206 *----------------------------------------------------------------------------*/
4207
4208 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4209 {
4210     a = float64_squash_input_denormal(a STATUS_VAR);
4211     b = float64_squash_input_denormal(b STATUS_VAR);
4212
4213     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4214          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4215        ) {
4216         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4217             float_raise( float_flag_invalid STATUS_VAR);
4218         }
4219         return 1;
4220     }
4221     return 0;
4222 }
4223
4224 /*----------------------------------------------------------------------------
4225 | Returns the result of converting the extended double-precision floating-
4226 | point value `a' to the 32-bit two's complement integer format.  The
4227 | conversion is performed according to the IEC/IEEE Standard for Binary
4228 | Floating-Point Arithmetic---which means in particular that the conversion
4229 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4230 | largest positive integer is returned.  Otherwise, if the conversion
4231 | overflows, the largest integer with the same sign as `a' is returned.
4232 *----------------------------------------------------------------------------*/
4233
4234 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4235 {
4236     flag aSign;
4237     int32 aExp, shiftCount;
4238     uint64_t aSig;
4239
4240     aSig = extractFloatx80Frac( a );
4241     aExp = extractFloatx80Exp( a );
4242     aSign = extractFloatx80Sign( a );
4243     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4244     shiftCount = 0x4037 - aExp;
4245     if ( shiftCount <= 0 ) shiftCount = 1;
4246     shift64RightJamming( aSig, shiftCount, &aSig );
4247     return roundAndPackInt32( aSign, aSig STATUS_VAR );
4248
4249 }
4250
4251 /*----------------------------------------------------------------------------
4252 | Returns the result of converting the extended double-precision floating-
4253 | point value `a' to the 32-bit two's complement integer format.  The
4254 | conversion is performed according to the IEC/IEEE Standard for Binary
4255 | Floating-Point Arithmetic, except that the conversion is always rounded
4256 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4257 | Otherwise, if the conversion overflows, the largest integer with the same
4258 | sign as `a' is returned.
4259 *----------------------------------------------------------------------------*/
4260
4261 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4262 {
4263     flag aSign;
4264     int32 aExp, shiftCount;
4265     uint64_t aSig, savedASig;
4266     int32_t z;
4267
4268     aSig = extractFloatx80Frac( a );
4269     aExp = extractFloatx80Exp( a );
4270     aSign = extractFloatx80Sign( a );
4271     if ( 0x401E < aExp ) {
4272         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4273         goto invalid;
4274     }
4275     else if ( aExp < 0x3FFF ) {
4276         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4277         return 0;
4278     }
4279     shiftCount = 0x403E - aExp;
4280     savedASig = aSig;
4281     aSig >>= shiftCount;
4282     z = aSig;
4283     if ( aSign ) z = - z;
4284     if ( ( z < 0 ) ^ aSign ) {
4285  invalid:
4286         float_raise( float_flag_invalid STATUS_VAR);
4287         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4288     }
4289     if ( ( aSig<<shiftCount ) != savedASig ) {
4290         STATUS(float_exception_flags) |= float_flag_inexact;
4291     }
4292     return z;
4293
4294 }
4295
4296 /*----------------------------------------------------------------------------
4297 | Returns the result of converting the extended double-precision floating-
4298 | point value `a' to the 64-bit two's complement integer format.  The
4299 | conversion is performed according to the IEC/IEEE Standard for Binary
4300 | Floating-Point Arithmetic---which means in particular that the conversion
4301 | is rounded according to the current rounding mode.  If `a' is a NaN,
4302 | the largest positive integer is returned.  Otherwise, if the conversion
4303 | overflows, the largest integer with the same sign as `a' is returned.
4304 *----------------------------------------------------------------------------*/
4305
4306 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4307 {
4308     flag aSign;
4309     int32 aExp, shiftCount;
4310     uint64_t aSig, aSigExtra;
4311
4312     aSig = extractFloatx80Frac( a );
4313     aExp = extractFloatx80Exp( a );
4314     aSign = extractFloatx80Sign( a );
4315     shiftCount = 0x403E - aExp;
4316     if ( shiftCount <= 0 ) {
4317         if ( shiftCount ) {
4318             float_raise( float_flag_invalid STATUS_VAR);
4319             if (    ! aSign
4320                  || (    ( aExp == 0x7FFF )
4321                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4322                ) {
4323                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4324             }
4325             return (int64_t) LIT64( 0x8000000000000000 );
4326         }
4327         aSigExtra = 0;
4328     }
4329     else {
4330         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4331     }
4332     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4333
4334 }
4335
4336 /*----------------------------------------------------------------------------
4337 | Returns the result of converting the extended double-precision floating-
4338 | point value `a' to the 64-bit two's complement integer format.  The
4339 | conversion is performed according to the IEC/IEEE Standard for Binary
4340 | Floating-Point Arithmetic, except that the conversion is always rounded
4341 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4342 | Otherwise, if the conversion overflows, the largest integer with the same
4343 | sign as `a' is returned.
4344 *----------------------------------------------------------------------------*/
4345
4346 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4347 {
4348     flag aSign;
4349     int32 aExp, shiftCount;
4350     uint64_t aSig;
4351     int64 z;
4352
4353     aSig = extractFloatx80Frac( a );
4354     aExp = extractFloatx80Exp( a );
4355     aSign = extractFloatx80Sign( a );
4356     shiftCount = aExp - 0x403E;
4357     if ( 0 <= shiftCount ) {
4358         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4359         if ( ( a.high != 0xC03E ) || aSig ) {
4360             float_raise( float_flag_invalid STATUS_VAR);
4361             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4362                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4363             }
4364         }
4365         return (int64_t) LIT64( 0x8000000000000000 );
4366     }
4367     else if ( aExp < 0x3FFF ) {
4368         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4369         return 0;
4370     }
4371     z = aSig>>( - shiftCount );
4372     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4373         STATUS(float_exception_flags) |= float_flag_inexact;
4374     }
4375     if ( aSign ) z = - z;
4376     return z;
4377
4378 }
4379
4380 /*----------------------------------------------------------------------------
4381 | Returns the result of converting the extended double-precision floating-
4382 | point value `a' to the single-precision floating-point format.  The
4383 | conversion is performed according to the IEC/IEEE Standard for Binary
4384 | Floating-Point Arithmetic.
4385 *----------------------------------------------------------------------------*/
4386
4387 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4388 {
4389     flag aSign;
4390     int32 aExp;
4391     uint64_t aSig;
4392
4393     aSig = extractFloatx80Frac( a );
4394     aExp = extractFloatx80Exp( a );
4395     aSign = extractFloatx80Sign( a );
4396     if ( aExp == 0x7FFF ) {
4397         if ( (uint64_t) ( aSig<<1 ) ) {
4398             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4399         }
4400         return packFloat32( aSign, 0xFF, 0 );
4401     }
4402     shift64RightJamming( aSig, 33, &aSig );
4403     if ( aExp || aSig ) aExp -= 0x3F81;
4404     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4405
4406 }
4407
4408 /*----------------------------------------------------------------------------
4409 | Returns the result of converting the extended double-precision floating-
4410 | point value `a' to the double-precision floating-point format.  The
4411 | conversion is performed according to the IEC/IEEE Standard for Binary
4412 | Floating-Point Arithmetic.
4413 *----------------------------------------------------------------------------*/
4414
4415 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4416 {
4417     flag aSign;
4418     int32 aExp;
4419     uint64_t aSig, zSig;
4420
4421     aSig = extractFloatx80Frac( a );
4422     aExp = extractFloatx80Exp( a );
4423     aSign = extractFloatx80Sign( a );
4424     if ( aExp == 0x7FFF ) {
4425         if ( (uint64_t) ( aSig<<1 ) ) {
4426             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4427         }
4428         return packFloat64( aSign, 0x7FF, 0 );
4429     }
4430     shift64RightJamming( aSig, 1, &zSig );
4431     if ( aExp || aSig ) aExp -= 0x3C01;
4432     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4433
4434 }
4435
4436 /*----------------------------------------------------------------------------
4437 | Returns the result of converting the extended double-precision floating-
4438 | point value `a' to the quadruple-precision floating-point format.  The
4439 | conversion is performed according to the IEC/IEEE Standard for Binary
4440 | Floating-Point Arithmetic.
4441 *----------------------------------------------------------------------------*/
4442
4443 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4444 {
4445     flag aSign;
4446     int_fast16_t aExp;
4447     uint64_t aSig, zSig0, zSig1;
4448
4449     aSig = extractFloatx80Frac( a );
4450     aExp = extractFloatx80Exp( a );
4451     aSign = extractFloatx80Sign( a );
4452     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4453         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4454     }
4455     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4456     return packFloat128( aSign, aExp, zSig0, zSig1 );
4457
4458 }
4459
4460 /*----------------------------------------------------------------------------
4461 | Rounds the extended double-precision floating-point value `a' to an integer,
4462 | and returns the result as an extended quadruple-precision floating-point
4463 | value.  The operation is performed according to the IEC/IEEE Standard for
4464 | Binary Floating-Point Arithmetic.
4465 *----------------------------------------------------------------------------*/
4466
4467 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4468 {
4469     flag aSign;
4470     int32 aExp;
4471     uint64_t lastBitMask, roundBitsMask;
4472     int8 roundingMode;
4473     floatx80 z;
4474
4475     aExp = extractFloatx80Exp( a );
4476     if ( 0x403E <= aExp ) {
4477         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4478             return propagateFloatx80NaN( a, a STATUS_VAR );
4479         }
4480         return a;
4481     }
4482     if ( aExp < 0x3FFF ) {
4483         if (    ( aExp == 0 )
4484              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4485             return a;
4486         }
4487         STATUS(float_exception_flags) |= float_flag_inexact;
4488         aSign = extractFloatx80Sign( a );
4489         switch ( STATUS(float_rounding_mode) ) {
4490          case float_round_nearest_even:
4491             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4492                ) {
4493                 return
4494                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4495             }
4496             break;
4497          case float_round_down:
4498             return
4499                   aSign ?
4500                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4501                 : packFloatx80( 0, 0, 0 );
4502          case float_round_up:
4503             return
4504                   aSign ? packFloatx80( 1, 0, 0 )
4505                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4506         }
4507         return packFloatx80( aSign, 0, 0 );
4508     }
4509     lastBitMask = 1;
4510     lastBitMask <<= 0x403E - aExp;
4511     roundBitsMask = lastBitMask - 1;
4512     z = a;
4513     roundingMode = STATUS(float_rounding_mode);
4514     if ( roundingMode == float_round_nearest_even ) {
4515         z.low += lastBitMask>>1;
4516         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4517     }
4518     else if ( roundingMode != float_round_to_zero ) {
4519         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4520             z.low += roundBitsMask;
4521         }
4522     }
4523     z.low &= ~ roundBitsMask;
4524     if ( z.low == 0 ) {
4525         ++z.high;
4526         z.low = LIT64( 0x8000000000000000 );
4527     }
4528     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4529     return z;
4530
4531 }
4532
4533 /*----------------------------------------------------------------------------
4534 | Returns the result of adding the absolute values of the extended double-
4535 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4536 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4537 | The addition is performed according to the IEC/IEEE Standard for Binary
4538 | Floating-Point Arithmetic.
4539 *----------------------------------------------------------------------------*/
4540
4541 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4542 {
4543     int32 aExp, bExp, zExp;
4544     uint64_t aSig, bSig, zSig0, zSig1;
4545     int32 expDiff;
4546
4547     aSig = extractFloatx80Frac( a );
4548     aExp = extractFloatx80Exp( a );
4549     bSig = extractFloatx80Frac( b );
4550     bExp = extractFloatx80Exp( b );
4551     expDiff = aExp - bExp;
4552     if ( 0 < expDiff ) {
4553         if ( aExp == 0x7FFF ) {
4554             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4555             return a;
4556         }
4557         if ( bExp == 0 ) --expDiff;
4558         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4559         zExp = aExp;
4560     }
4561     else if ( expDiff < 0 ) {
4562         if ( bExp == 0x7FFF ) {
4563             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4564             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4565         }
4566         if ( aExp == 0 ) ++expDiff;
4567         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4568         zExp = bExp;
4569     }
4570     else {
4571         if ( aExp == 0x7FFF ) {
4572             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4573                 return propagateFloatx80NaN( a, b STATUS_VAR );
4574             }
4575             return a;
4576         }
4577         zSig1 = 0;
4578         zSig0 = aSig + bSig;
4579         if ( aExp == 0 ) {
4580             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4581             goto roundAndPack;
4582         }
4583         zExp = aExp;
4584         goto shiftRight1;
4585     }
4586     zSig0 = aSig + bSig;
4587     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4588  shiftRight1:
4589     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4590     zSig0 |= LIT64( 0x8000000000000000 );
4591     ++zExp;
4592  roundAndPack:
4593     return
4594         roundAndPackFloatx80(
4595             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4596
4597 }
4598
4599 /*----------------------------------------------------------------------------
4600 | Returns the result of subtracting the absolute values of the extended
4601 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4602 | difference is negated before being returned.  `zSign' is ignored if the
4603 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4604 | Standard for Binary Floating-Point Arithmetic.
4605 *----------------------------------------------------------------------------*/
4606
4607 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4608 {
4609     int32 aExp, bExp, zExp;
4610     uint64_t aSig, bSig, zSig0, zSig1;
4611     int32 expDiff;
4612     floatx80 z;
4613
4614     aSig = extractFloatx80Frac( a );
4615     aExp = extractFloatx80Exp( a );
4616     bSig = extractFloatx80Frac( b );
4617     bExp = extractFloatx80Exp( b );
4618     expDiff = aExp - bExp;
4619     if ( 0 < expDiff ) goto aExpBigger;
4620     if ( expDiff < 0 ) goto bExpBigger;
4621     if ( aExp == 0x7FFF ) {
4622         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4623             return propagateFloatx80NaN( a, b STATUS_VAR );
4624         }
4625         float_raise( float_flag_invalid STATUS_VAR);
4626         z.low = floatx80_default_nan_low;
4627         z.high = floatx80_default_nan_high;
4628         return z;
4629     }
4630     if ( aExp == 0 ) {
4631         aExp = 1;
4632         bExp = 1;
4633     }
4634     zSig1 = 0;
4635     if ( bSig < aSig ) goto aBigger;
4636     if ( aSig < bSig ) goto bBigger;
4637     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4638  bExpBigger:
4639     if ( bExp == 0x7FFF ) {
4640         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4641         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4642     }
4643     if ( aExp == 0 ) ++expDiff;
4644     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4645  bBigger:
4646     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4647     zExp = bExp;
4648     zSign ^= 1;
4649     goto normalizeRoundAndPack;
4650  aExpBigger:
4651     if ( aExp == 0x7FFF ) {
4652         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4653         return a;
4654     }
4655     if ( bExp == 0 ) --expDiff;
4656     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4657  aBigger:
4658     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4659     zExp = aExp;
4660  normalizeRoundAndPack:
4661     return
4662         normalizeRoundAndPackFloatx80(
4663             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4664
4665 }
4666
4667 /*----------------------------------------------------------------------------
4668 | Returns the result of adding the extended double-precision floating-point
4669 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4670 | Standard for Binary Floating-Point Arithmetic.
4671 *----------------------------------------------------------------------------*/
4672
4673 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4674 {
4675     flag aSign, bSign;
4676
4677     aSign = extractFloatx80Sign( a );
4678     bSign = extractFloatx80Sign( b );
4679     if ( aSign == bSign ) {
4680         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4681     }
4682     else {
4683         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4684     }
4685
4686 }
4687
4688 /*----------------------------------------------------------------------------
4689 | Returns the result of subtracting the extended double-precision floating-
4690 | point values `a' and `b'.  The operation is performed according to the
4691 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4692 *----------------------------------------------------------------------------*/
4693
4694 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4695 {
4696     flag aSign, bSign;
4697
4698     aSign = extractFloatx80Sign( a );
4699     bSign = extractFloatx80Sign( b );
4700     if ( aSign == bSign ) {
4701         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4702     }
4703     else {
4704         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4705     }
4706
4707 }
4708
4709 /*----------------------------------------------------------------------------
4710 | Returns the result of multiplying the extended double-precision floating-
4711 | point values `a' and `b'.  The operation is performed according to the
4712 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4713 *----------------------------------------------------------------------------*/
4714
4715 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4716 {
4717     flag aSign, bSign, zSign;
4718     int32 aExp, bExp, zExp;
4719     uint64_t aSig, bSig, zSig0, zSig1;
4720     floatx80 z;
4721
4722     aSig = extractFloatx80Frac( a );
4723     aExp = extractFloatx80Exp( a );
4724     aSign = extractFloatx80Sign( a );
4725     bSig = extractFloatx80Frac( b );
4726     bExp = extractFloatx80Exp( b );
4727     bSign = extractFloatx80Sign( b );
4728     zSign = aSign ^ bSign;
4729     if ( aExp == 0x7FFF ) {
4730         if (    (uint64_t) ( aSig<<1 )
4731              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4732             return propagateFloatx80NaN( a, b STATUS_VAR );
4733         }
4734         if ( ( bExp | bSig ) == 0 ) goto invalid;
4735         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4736     }
4737     if ( bExp == 0x7FFF ) {
4738         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4739         if ( ( aExp | aSig ) == 0 ) {
4740  invalid:
4741             float_raise( float_flag_invalid STATUS_VAR);
4742             z.low = floatx80_default_nan_low;
4743             z.high = floatx80_default_nan_high;
4744             return z;
4745         }
4746         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4747     }
4748     if ( aExp == 0 ) {
4749         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4750         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4751     }
4752     if ( bExp == 0 ) {
4753         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4754         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4755     }
4756     zExp = aExp + bExp - 0x3FFE;
4757     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4758     if ( 0 < (int64_t) zSig0 ) {
4759         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4760         --zExp;
4761     }
4762     return
4763         roundAndPackFloatx80(
4764             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4765
4766 }
4767
4768 /*----------------------------------------------------------------------------
4769 | Returns the result of dividing the extended double-precision floating-point
4770 | value `a' by the corresponding value `b'.  The operation is performed
4771 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4772 *----------------------------------------------------------------------------*/
4773
4774 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4775 {
4776     flag aSign, bSign, zSign;
4777     int32 aExp, bExp, zExp;
4778     uint64_t aSig, bSig, zSig0, zSig1;
4779     uint64_t rem0, rem1, rem2, term0, term1, term2;
4780     floatx80 z;
4781
4782     aSig = extractFloatx80Frac( a );
4783     aExp = extractFloatx80Exp( a );
4784     aSign = extractFloatx80Sign( a );
4785     bSig = extractFloatx80Frac( b );
4786     bExp = extractFloatx80Exp( b );
4787     bSign = extractFloatx80Sign( b );
4788     zSign = aSign ^ bSign;
4789     if ( aExp == 0x7FFF ) {
4790         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4791         if ( bExp == 0x7FFF ) {
4792             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4793             goto invalid;
4794         }
4795         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4796     }
4797     if ( bExp == 0x7FFF ) {
4798         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4799         return packFloatx80( zSign, 0, 0 );
4800     }
4801     if ( bExp == 0 ) {
4802         if ( bSig == 0 ) {
4803             if ( ( aExp | aSig ) == 0 ) {
4804  invalid:
4805                 float_raise( float_flag_invalid STATUS_VAR);
4806                 z.low = floatx80_default_nan_low;
4807                 z.high = floatx80_default_nan_high;
4808                 return z;
4809             }
4810             float_raise( float_flag_divbyzero STATUS_VAR);
4811             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4812         }
4813         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4814     }
4815     if ( aExp == 0 ) {
4816         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4817         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4818     }
4819     zExp = aExp - bExp + 0x3FFE;
4820     rem1 = 0;
4821     if ( bSig <= aSig ) {
4822         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4823         ++zExp;
4824     }
4825     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4826     mul64To128( bSig, zSig0, &term0, &term1 );
4827     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4828     while ( (int64_t) rem0 < 0 ) {
4829         --zSig0;
4830         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4831     }
4832     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4833     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4834         mul64To128( bSig, zSig1, &term1, &term2 );
4835         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4836         while ( (int64_t) rem1 < 0 ) {
4837             --zSig1;
4838             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4839         }
4840         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4841     }
4842     return
4843         roundAndPackFloatx80(
4844             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4845
4846 }
4847
4848 /*----------------------------------------------------------------------------
4849 | Returns the remainder of the extended double-precision floating-point value
4850 | `a' with respect to the corresponding value `b'.  The operation is performed
4851 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4852 *----------------------------------------------------------------------------*/
4853
4854 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4855 {
4856     flag aSign, zSign;
4857     int32 aExp, bExp, expDiff;
4858     uint64_t aSig0, aSig1, bSig;
4859     uint64_t q, term0, term1, alternateASig0, alternateASig1;
4860     floatx80 z;
4861
4862     aSig0 = extractFloatx80Frac( a );
4863     aExp = extractFloatx80Exp( a );
4864     aSign = extractFloatx80Sign( a );
4865     bSig = extractFloatx80Frac( b );
4866     bExp = extractFloatx80Exp( b );
4867     if ( aExp == 0x7FFF ) {
4868         if (    (uint64_t) ( aSig0<<1 )
4869              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4870             return propagateFloatx80NaN( a, b STATUS_VAR );
4871         }
4872         goto invalid;
4873     }
4874     if ( bExp == 0x7FFF ) {
4875         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4876         return a;
4877     }
4878     if ( bExp == 0 ) {
4879         if ( bSig == 0 ) {
4880  invalid:
4881             float_raise( float_flag_invalid STATUS_VAR);
4882             z.low = floatx80_default_nan_low;
4883             z.high = floatx80_default_nan_high;
4884             return z;
4885         }
4886         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4887     }
4888     if ( aExp == 0 ) {
4889         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4890         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4891     }
4892     bSig |= LIT64( 0x8000000000000000 );
4893     zSign = aSign;
4894     expDiff = aExp - bExp;
4895     aSig1 = 0;
4896     if ( expDiff < 0 ) {
4897         if ( expDiff < -1 ) return a;
4898         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4899         expDiff = 0;
4900     }
4901     q = ( bSig <= aSig0 );
4902     if ( q ) aSig0 -= bSig;
4903     expDiff -= 64;
4904     while ( 0 < expDiff ) {
4905         q = estimateDiv128To64( aSig0, aSig1, bSig );
4906         q = ( 2 < q ) ? q - 2 : 0;
4907         mul64To128( bSig, q, &term0, &term1 );
4908         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4909         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4910         expDiff -= 62;
4911     }
4912     expDiff += 64;
4913     if ( 0 < expDiff ) {
4914         q = estimateDiv128To64( aSig0, aSig1, bSig );
4915         q = ( 2 < q ) ? q - 2 : 0;
4916         q >>= 64 - expDiff;
4917         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4918         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4919         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4920         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4921             ++q;
4922             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4923         }
4924     }
4925     else {
4926         term1 = 0;
4927         term0 = bSig;
4928     }
4929     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4930     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4931          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4932               && ( q & 1 ) )
4933        ) {
4934         aSig0 = alternateASig0;
4935         aSig1 = alternateASig1;
4936         zSign = ! zSign;
4937     }
4938     return
4939         normalizeRoundAndPackFloatx80(
4940             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4941
4942 }
4943
4944 /*----------------------------------------------------------------------------
4945 | Returns the square root of the extended double-precision floating-point
4946 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4947 | for Binary Floating-Point Arithmetic.
4948 *----------------------------------------------------------------------------*/
4949
4950 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4951 {
4952     flag aSign;
4953     int32 aExp, zExp;
4954     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4955     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4956     floatx80 z;
4957
4958     aSig0 = extractFloatx80Frac( a );
4959     aExp = extractFloatx80Exp( a );
4960     aSign = extractFloatx80Sign( a );
4961     if ( aExp == 0x7FFF ) {
4962         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4963         if ( ! aSign ) return a;
4964         goto invalid;
4965     }
4966     if ( aSign ) {
4967         if ( ( aExp | aSig0 ) == 0 ) return a;
4968  invalid:
4969         float_raise( float_flag_invalid STATUS_VAR);
4970         z.low = floatx80_default_nan_low;
4971         z.high = floatx80_default_nan_high;
4972         return z;
4973     }
4974     if ( aExp == 0 ) {
4975         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4976         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4977     }
4978     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4979     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4980     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4981     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4982     doubleZSig0 = zSig0<<1;
4983     mul64To128( zSig0, zSig0, &term0, &term1 );
4984     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4985     while ( (int64_t) rem0 < 0 ) {
4986         --zSig0;
4987         doubleZSig0 -= 2;
4988         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4989     }
4990     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4991     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4992         if ( zSig1 == 0 ) zSig1 = 1;
4993         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4994         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4995         mul64To128( zSig1, zSig1, &term2, &term3 );
4996         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4997         while ( (int64_t) rem1 < 0 ) {
4998             --zSig1;
4999             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5000             term3 |= 1;
5001             term2 |= doubleZSig0;
5002             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5003         }
5004         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5005     }
5006     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5007     zSig0 |= doubleZSig0;
5008     return
5009         roundAndPackFloatx80(
5010             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5011
5012 }
5013
5014 /*----------------------------------------------------------------------------
5015 | Returns 1 if the extended double-precision floating-point value `a' is equal
5016 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5017 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5018 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5019 *----------------------------------------------------------------------------*/
5020
5021 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5022 {
5023
5024     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5025               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5026          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5027               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5028        ) {
5029         float_raise( float_flag_invalid STATUS_VAR);
5030         return 0;
5031     }
5032     return
5033            ( a.low == b.low )
5034         && (    ( a.high == b.high )
5035              || (    ( a.low == 0 )
5036                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5037            );
5038
5039 }
5040
5041 /*----------------------------------------------------------------------------
5042 | Returns 1 if the extended double-precision floating-point value `a' is
5043 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5044 | invalid exception is raised if either operand is a NaN.  The comparison is
5045 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5046 | Arithmetic.
5047 *----------------------------------------------------------------------------*/
5048
5049 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5050 {
5051     flag aSign, bSign;
5052
5053     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5054               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5055          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5056               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5057        ) {
5058         float_raise( float_flag_invalid STATUS_VAR);
5059         return 0;
5060     }
5061     aSign = extractFloatx80Sign( a );
5062     bSign = extractFloatx80Sign( b );
5063     if ( aSign != bSign ) {
5064         return
5065                aSign
5066             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5067                  == 0 );
5068     }
5069     return
5070           aSign ? le128( b.high, b.low, a.high, a.low )
5071         : le128( a.high, a.low, b.high, b.low );
5072
5073 }
5074
5075 /*----------------------------------------------------------------------------
5076 | Returns 1 if the extended double-precision floating-point value `a' is
5077 | less than the corresponding value `b', and 0 otherwise.  The invalid
5078 | exception is raised if either operand is a NaN.  The comparison is performed
5079 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5080 *----------------------------------------------------------------------------*/
5081
5082 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5083 {
5084     flag aSign, bSign;
5085
5086     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5087               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5088          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5089               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5090        ) {
5091         float_raise( float_flag_invalid STATUS_VAR);
5092         return 0;
5093     }
5094     aSign = extractFloatx80Sign( a );
5095     bSign = extractFloatx80Sign( b );
5096     if ( aSign != bSign ) {
5097         return
5098                aSign
5099             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5100                  != 0 );
5101     }
5102     return
5103           aSign ? lt128( b.high, b.low, a.high, a.low )
5104         : lt128( a.high, a.low, b.high, b.low );
5105
5106 }
5107
5108 /*----------------------------------------------------------------------------
5109 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5110 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5111 | either operand is a NaN.   The comparison is performed according to the
5112 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5113 *----------------------------------------------------------------------------*/
5114 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5115 {
5116     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5117               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5118          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5119               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5120        ) {
5121         float_raise( float_flag_invalid STATUS_VAR);
5122         return 1;
5123     }
5124     return 0;
5125 }
5126
5127 /*----------------------------------------------------------------------------
5128 | Returns 1 if the extended double-precision floating-point value `a' is
5129 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5130 | cause an exception.  The comparison is performed according to the IEC/IEEE
5131 | Standard for Binary Floating-Point Arithmetic.
5132 *----------------------------------------------------------------------------*/
5133
5134 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5135 {
5136
5137     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5138               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5139          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5140               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5141        ) {
5142         if (    floatx80_is_signaling_nan( a )
5143              || floatx80_is_signaling_nan( b ) ) {
5144             float_raise( float_flag_invalid STATUS_VAR);
5145         }
5146         return 0;
5147     }
5148     return
5149            ( a.low == b.low )
5150         && (    ( a.high == b.high )
5151              || (    ( a.low == 0 )
5152                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5153            );
5154
5155 }
5156
5157 /*----------------------------------------------------------------------------
5158 | Returns 1 if the extended double-precision floating-point value `a' is less
5159 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5160 | do not cause an exception.  Otherwise, the comparison is performed according
5161 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5162 *----------------------------------------------------------------------------*/
5163
5164 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5165 {
5166     flag aSign, bSign;
5167
5168     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5169               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5170          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5171               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5172        ) {
5173         if (    floatx80_is_signaling_nan( a )
5174              || floatx80_is_signaling_nan( b ) ) {
5175             float_raise( float_flag_invalid STATUS_VAR);
5176         }
5177         return 0;
5178     }
5179     aSign = extractFloatx80Sign( a );
5180     bSign = extractFloatx80Sign( b );
5181     if ( aSign != bSign ) {
5182         return
5183                aSign
5184             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5185                  == 0 );
5186     }
5187     return
5188           aSign ? le128( b.high, b.low, a.high, a.low )
5189         : le128( a.high, a.low, b.high, b.low );
5190
5191 }
5192
5193 /*----------------------------------------------------------------------------
5194 | Returns 1 if the extended double-precision floating-point value `a' is less
5195 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5196 | an exception.  Otherwise, the comparison is performed according to the
5197 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5198 *----------------------------------------------------------------------------*/
5199
5200 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5201 {
5202     flag aSign, bSign;
5203
5204     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5205               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5206          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5207               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5208        ) {
5209         if (    floatx80_is_signaling_nan( a )
5210              || floatx80_is_signaling_nan( b ) ) {
5211             float_raise( float_flag_invalid STATUS_VAR);
5212         }
5213         return 0;
5214     }
5215     aSign = extractFloatx80Sign( a );
5216     bSign = extractFloatx80Sign( b );
5217     if ( aSign != bSign ) {
5218         return
5219                aSign
5220             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5221                  != 0 );
5222     }
5223     return
5224           aSign ? lt128( b.high, b.low, a.high, a.low )
5225         : lt128( a.high, a.low, b.high, b.low );
5226
5227 }
5228
5229 /*----------------------------------------------------------------------------
5230 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5231 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5232 | The comparison is performed according to the IEC/IEEE Standard for Binary
5233 | Floating-Point Arithmetic.
5234 *----------------------------------------------------------------------------*/
5235 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5236 {
5237     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5238               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5239          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5240               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5241        ) {
5242         if (    floatx80_is_signaling_nan( a )
5243              || floatx80_is_signaling_nan( b ) ) {
5244             float_raise( float_flag_invalid STATUS_VAR);
5245         }
5246         return 1;
5247     }
5248     return 0;
5249 }
5250
5251 /*----------------------------------------------------------------------------
5252 | Returns the result of converting the quadruple-precision floating-point
5253 | value `a' to the 32-bit two's complement integer format.  The conversion
5254 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5255 | Arithmetic---which means in particular that the conversion is rounded
5256 | according to the current rounding mode.  If `a' is a NaN, the largest
5257 | positive integer is returned.  Otherwise, if the conversion overflows, the
5258 | largest integer with the same sign as `a' is returned.
5259 *----------------------------------------------------------------------------*/
5260
5261 int32 float128_to_int32( float128 a STATUS_PARAM )
5262 {
5263     flag aSign;
5264     int32 aExp, shiftCount;
5265     uint64_t aSig0, aSig1;
5266
5267     aSig1 = extractFloat128Frac1( a );
5268     aSig0 = extractFloat128Frac0( a );
5269     aExp = extractFloat128Exp( a );
5270     aSign = extractFloat128Sign( a );
5271     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5272     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5273     aSig0 |= ( aSig1 != 0 );
5274     shiftCount = 0x4028 - aExp;
5275     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5276     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5277
5278 }
5279
5280 /*----------------------------------------------------------------------------
5281 | Returns the result of converting the quadruple-precision floating-point
5282 | value `a' to the 32-bit two's complement integer format.  The conversion
5283 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5284 | Arithmetic, except that the conversion is always rounded toward zero.  If
5285 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5286 | conversion overflows, the largest integer with the same sign as `a' is
5287 | returned.
5288 *----------------------------------------------------------------------------*/
5289
5290 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5291 {
5292     flag aSign;
5293     int32 aExp, shiftCount;
5294     uint64_t aSig0, aSig1, savedASig;
5295     int32_t z;
5296
5297     aSig1 = extractFloat128Frac1( a );
5298     aSig0 = extractFloat128Frac0( a );
5299     aExp = extractFloat128Exp( a );
5300     aSign = extractFloat128Sign( a );
5301     aSig0 |= ( aSig1 != 0 );
5302     if ( 0x401E < aExp ) {
5303         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5304         goto invalid;
5305     }
5306     else if ( aExp < 0x3FFF ) {
5307         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5308         return 0;
5309     }
5310     aSig0 |= LIT64( 0x0001000000000000 );
5311     shiftCount = 0x402F - aExp;
5312     savedASig = aSig0;
5313     aSig0 >>= shiftCount;
5314     z = aSig0;
5315     if ( aSign ) z = - z;
5316     if ( ( z < 0 ) ^ aSign ) {
5317  invalid:
5318         float_raise( float_flag_invalid STATUS_VAR);
5319         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5320     }
5321     if ( ( aSig0<<shiftCount ) != savedASig ) {
5322         STATUS(float_exception_flags) |= float_flag_inexact;
5323     }
5324     return z;
5325
5326 }
5327
5328 /*----------------------------------------------------------------------------
5329 | Returns the result of converting the quadruple-precision floating-point
5330 | value `a' to the 64-bit two's complement integer format.  The conversion
5331 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5332 | Arithmetic---which means in particular that the conversion is rounded
5333 | according to the current rounding mode.  If `a' is a NaN, the largest
5334 | positive integer is returned.  Otherwise, if the conversion overflows, the
5335 | largest integer with the same sign as `a' is returned.
5336 *----------------------------------------------------------------------------*/
5337
5338 int64 float128_to_int64( float128 a STATUS_PARAM )
5339 {
5340     flag aSign;
5341     int32 aExp, shiftCount;
5342     uint64_t aSig0, aSig1;
5343
5344     aSig1 = extractFloat128Frac1( a );
5345     aSig0 = extractFloat128Frac0( a );
5346     aExp = extractFloat128Exp( a );
5347     aSign = extractFloat128Sign( a );
5348     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5349     shiftCount = 0x402F - aExp;
5350     if ( shiftCount <= 0 ) {
5351         if ( 0x403E < aExp ) {
5352             float_raise( float_flag_invalid STATUS_VAR);
5353             if (    ! aSign
5354                  || (    ( aExp == 0x7FFF )
5355                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5356                     )
5357                ) {
5358                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5359             }
5360             return (int64_t) LIT64( 0x8000000000000000 );
5361         }
5362         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5363     }
5364     else {
5365         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5366     }
5367     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5368
5369 }
5370
5371 /*----------------------------------------------------------------------------
5372 | Returns the result of converting the quadruple-precision floating-point
5373 | value `a' to the 64-bit two's complement integer format.  The conversion
5374 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5375 | Arithmetic, except that the conversion is always rounded toward zero.
5376 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5377 | the conversion overflows, the largest integer with the same sign as `a' is
5378 | returned.
5379 *----------------------------------------------------------------------------*/
5380
5381 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5382 {
5383     flag aSign;
5384     int32 aExp, shiftCount;
5385     uint64_t aSig0, aSig1;
5386     int64 z;
5387
5388     aSig1 = extractFloat128Frac1( a );
5389     aSig0 = extractFloat128Frac0( a );
5390     aExp = extractFloat128Exp( a );
5391     aSign = extractFloat128Sign( a );
5392     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5393     shiftCount = aExp - 0x402F;
5394     if ( 0 < shiftCount ) {
5395         if ( 0x403E <= aExp ) {
5396             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5397             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5398                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5399                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5400             }
5401             else {
5402                 float_raise( float_flag_invalid STATUS_VAR);
5403                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5404                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5405                 }
5406             }
5407             return (int64_t) LIT64( 0x8000000000000000 );
5408         }
5409         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5410         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5411             STATUS(float_exception_flags) |= float_flag_inexact;
5412         }
5413     }
5414     else {
5415         if ( aExp < 0x3FFF ) {
5416             if ( aExp | aSig0 | aSig1 ) {
5417                 STATUS(float_exception_flags) |= float_flag_inexact;
5418             }
5419             return 0;
5420         }
5421         z = aSig0>>( - shiftCount );
5422         if (    aSig1
5423              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5424             STATUS(float_exception_flags) |= float_flag_inexact;
5425         }
5426     }
5427     if ( aSign ) z = - z;
5428     return z;
5429
5430 }
5431
5432 /*----------------------------------------------------------------------------
5433 | Returns the result of converting the quadruple-precision floating-point
5434 | value `a' to the single-precision floating-point format.  The conversion
5435 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5436 | Arithmetic.
5437 *----------------------------------------------------------------------------*/
5438
5439 float32 float128_to_float32( float128 a STATUS_PARAM )
5440 {
5441     flag aSign;
5442     int32 aExp;
5443     uint64_t aSig0, aSig1;
5444     uint32_t zSig;
5445
5446     aSig1 = extractFloat128Frac1( a );
5447     aSig0 = extractFloat128Frac0( a );
5448     aExp = extractFloat128Exp( a );
5449     aSign = extractFloat128Sign( a );
5450     if ( aExp == 0x7FFF ) {
5451         if ( aSig0 | aSig1 ) {
5452             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5453         }
5454         return packFloat32( aSign, 0xFF, 0 );
5455     }
5456     aSig0 |= ( aSig1 != 0 );
5457     shift64RightJamming( aSig0, 18, &aSig0 );
5458     zSig = aSig0;
5459     if ( aExp || zSig ) {
5460         zSig |= 0x40000000;
5461         aExp -= 0x3F81;
5462     }
5463     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5464
5465 }
5466
5467 /*----------------------------------------------------------------------------
5468 | Returns the result of converting the quadruple-precision floating-point
5469 | value `a' to the double-precision floating-point format.  The conversion
5470 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5471 | Arithmetic.
5472 *----------------------------------------------------------------------------*/
5473
5474 float64 float128_to_float64( float128 a STATUS_PARAM )
5475 {
5476     flag aSign;
5477     int32 aExp;
5478     uint64_t aSig0, aSig1;
5479
5480     aSig1 = extractFloat128Frac1( a );
5481     aSig0 = extractFloat128Frac0( a );
5482     aExp = extractFloat128Exp( a );
5483     aSign = extractFloat128Sign( a );
5484     if ( aExp == 0x7FFF ) {
5485         if ( aSig0 | aSig1 ) {
5486             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5487         }
5488         return packFloat64( aSign, 0x7FF, 0 );
5489     }
5490     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5491     aSig0 |= ( aSig1 != 0 );
5492     if ( aExp || aSig0 ) {
5493         aSig0 |= LIT64( 0x4000000000000000 );
5494         aExp -= 0x3C01;
5495     }
5496     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5497
5498 }
5499
5500 /*----------------------------------------------------------------------------
5501 | Returns the result of converting the quadruple-precision floating-point
5502 | value `a' to the extended double-precision floating-point format.  The
5503 | conversion is performed according to the IEC/IEEE Standard for Binary
5504 | Floating-Point Arithmetic.
5505 *----------------------------------------------------------------------------*/
5506
5507 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5508 {
5509     flag aSign;
5510     int32 aExp;
5511     uint64_t aSig0, aSig1;
5512
5513     aSig1 = extractFloat128Frac1( a );
5514     aSig0 = extractFloat128Frac0( a );
5515     aExp = extractFloat128Exp( a );
5516     aSign = extractFloat128Sign( a );
5517     if ( aExp == 0x7FFF ) {
5518         if ( aSig0 | aSig1 ) {
5519             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5520         }
5521         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5522     }
5523     if ( aExp == 0 ) {
5524         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5525         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5526     }
5527     else {
5528         aSig0 |= LIT64( 0x0001000000000000 );
5529     }
5530     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5531     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5532
5533 }
5534
5535 /*----------------------------------------------------------------------------
5536 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5537 | returns the result as a quadruple-precision floating-point value.  The
5538 | operation is performed according to the IEC/IEEE Standard for Binary
5539 | Floating-Point Arithmetic.
5540 *----------------------------------------------------------------------------*/
5541
5542 float128 float128_round_to_int( float128 a STATUS_PARAM )
5543 {
5544     flag aSign;
5545     int32 aExp;
5546     uint64_t lastBitMask, roundBitsMask;
5547     int8 roundingMode;
5548     float128 z;
5549
5550     aExp = extractFloat128Exp( a );
5551     if ( 0x402F <= aExp ) {
5552         if ( 0x406F <= aExp ) {
5553             if (    ( aExp == 0x7FFF )
5554                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5555                ) {
5556                 return propagateFloat128NaN( a, a STATUS_VAR );
5557             }
5558             return a;
5559         }
5560         lastBitMask = 1;
5561         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5562         roundBitsMask = lastBitMask - 1;
5563         z = a;
5564         roundingMode = STATUS(float_rounding_mode);
5565         if ( roundingMode == float_round_nearest_even ) {
5566             if ( lastBitMask ) {
5567                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5568                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5569             }
5570             else {
5571                 if ( (int64_t) z.low < 0 ) {
5572                     ++z.high;
5573                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5574                 }
5575             }
5576         }
5577         else if ( roundingMode != float_round_to_zero ) {
5578             if (   extractFloat128Sign( z )
5579                  ^ ( roundingMode == float_round_up ) ) {
5580                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5581             }
5582         }
5583         z.low &= ~ roundBitsMask;
5584     }
5585     else {
5586         if ( aExp < 0x3FFF ) {
5587             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5588             STATUS(float_exception_flags) |= float_flag_inexact;
5589             aSign = extractFloat128Sign( a );
5590             switch ( STATUS(float_rounding_mode) ) {
5591              case float_round_nearest_even:
5592                 if (    ( aExp == 0x3FFE )
5593                      && (   extractFloat128Frac0( a )
5594                           | extractFloat128Frac1( a ) )
5595                    ) {
5596                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5597                 }
5598                 break;
5599              case float_round_down:
5600                 return
5601                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5602                     : packFloat128( 0, 0, 0, 0 );
5603              case float_round_up:
5604                 return
5605                       aSign ? packFloat128( 1, 0, 0, 0 )
5606                     : packFloat128( 0, 0x3FFF, 0, 0 );
5607             }
5608             return packFloat128( aSign, 0, 0, 0 );
5609         }
5610         lastBitMask = 1;
5611         lastBitMask <<= 0x402F - aExp;
5612         roundBitsMask = lastBitMask - 1;
5613         z.low = 0;
5614         z.high = a.high;
5615         roundingMode = STATUS(float_rounding_mode);
5616         if ( roundingMode == float_round_nearest_even ) {
5617             z.high += lastBitMask>>1;
5618             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5619                 z.high &= ~ lastBitMask;
5620             }
5621         }
5622         else if ( roundingMode != float_round_to_zero ) {
5623             if (   extractFloat128Sign( z )
5624                  ^ ( roundingMode == float_round_up ) ) {
5625                 z.high |= ( a.low != 0 );
5626                 z.high += roundBitsMask;
5627             }
5628         }
5629         z.high &= ~ roundBitsMask;
5630     }
5631     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5632         STATUS(float_exception_flags) |= float_flag_inexact;
5633     }
5634     return z;
5635
5636 }
5637
5638 /*----------------------------------------------------------------------------
5639 | Returns the result of adding the absolute values of the quadruple-precision
5640 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5641 | before being returned.  `zSign' is ignored if the result is a NaN.
5642 | The addition is performed according to the IEC/IEEE Standard for Binary
5643 | Floating-Point Arithmetic.
5644 *----------------------------------------------------------------------------*/
5645
5646 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5647 {
5648     int32 aExp, bExp, zExp;
5649     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5650     int32 expDiff;
5651
5652     aSig1 = extractFloat128Frac1( a );
5653     aSig0 = extractFloat128Frac0( a );
5654     aExp = extractFloat128Exp( a );
5655     bSig1 = extractFloat128Frac1( b );
5656     bSig0 = extractFloat128Frac0( b );
5657     bExp = extractFloat128Exp( b );
5658     expDiff = aExp - bExp;
5659     if ( 0 < expDiff ) {
5660         if ( aExp == 0x7FFF ) {
5661             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5662             return a;
5663         }
5664         if ( bExp == 0 ) {
5665             --expDiff;
5666         }
5667         else {
5668             bSig0 |= LIT64( 0x0001000000000000 );
5669         }
5670         shift128ExtraRightJamming(
5671             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5672         zExp = aExp;
5673     }
5674     else if ( expDiff < 0 ) {
5675         if ( bExp == 0x7FFF ) {
5676             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5677             return packFloat128( zSign, 0x7FFF, 0, 0 );
5678         }
5679         if ( aExp == 0 ) {
5680             ++expDiff;
5681         }
5682         else {
5683             aSig0 |= LIT64( 0x0001000000000000 );
5684         }
5685         shift128ExtraRightJamming(
5686             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5687         zExp = bExp;
5688     }
5689     else {
5690         if ( aExp == 0x7FFF ) {
5691             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5692                 return propagateFloat128NaN( a, b STATUS_VAR );
5693             }
5694             return a;
5695         }
5696         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5697         if ( aExp == 0 ) {
5698             if (STATUS(flush_to_zero)) {
5699                 if (zSig0 | zSig1) {
5700                     float_raise(float_flag_output_denormal STATUS_VAR);
5701                 }
5702                 return packFloat128(zSign, 0, 0, 0);
5703             }
5704             return packFloat128( zSign, 0, zSig0, zSig1 );
5705         }
5706         zSig2 = 0;
5707         zSig0 |= LIT64( 0x0002000000000000 );
5708         zExp = aExp;
5709         goto shiftRight1;
5710     }
5711     aSig0 |= LIT64( 0x0001000000000000 );
5712     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5713     --zExp;
5714     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5715     ++zExp;
5716  shiftRight1:
5717     shift128ExtraRightJamming(
5718         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5719  roundAndPack:
5720     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5721
5722 }
5723
5724 /*----------------------------------------------------------------------------
5725 | Returns the result of subtracting the absolute values of the quadruple-
5726 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5727 | difference is negated before being returned.  `zSign' is ignored if the
5728 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5729 | Standard for Binary Floating-Point Arithmetic.
5730 *----------------------------------------------------------------------------*/
5731
5732 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5733 {
5734     int32 aExp, bExp, zExp;
5735     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5736     int32 expDiff;
5737     float128 z;
5738
5739     aSig1 = extractFloat128Frac1( a );
5740     aSig0 = extractFloat128Frac0( a );
5741     aExp = extractFloat128Exp( a );
5742     bSig1 = extractFloat128Frac1( b );
5743     bSig0 = extractFloat128Frac0( b );
5744     bExp = extractFloat128Exp( b );
5745     expDiff = aExp - bExp;
5746     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5747     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5748     if ( 0 < expDiff ) goto aExpBigger;
5749     if ( expDiff < 0 ) goto bExpBigger;
5750     if ( aExp == 0x7FFF ) {
5751         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5752             return propagateFloat128NaN( a, b STATUS_VAR );
5753         }
5754         float_raise( float_flag_invalid STATUS_VAR);
5755         z.low = float128_default_nan_low;
5756         z.high = float128_default_nan_high;
5757         return z;
5758     }
5759     if ( aExp == 0 ) {
5760         aExp = 1;
5761         bExp = 1;
5762     }
5763     if ( bSig0 < aSig0 ) goto aBigger;
5764     if ( aSig0 < bSig0 ) goto bBigger;
5765     if ( bSig1 < aSig1 ) goto aBigger;
5766     if ( aSig1 < bSig1 ) goto bBigger;
5767     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5768  bExpBigger:
5769     if ( bExp == 0x7FFF ) {
5770         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5771         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5772     }
5773     if ( aExp == 0 ) {
5774         ++expDiff;
5775     }
5776     else {
5777         aSig0 |= LIT64( 0x4000000000000000 );
5778     }
5779     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5780     bSig0 |= LIT64( 0x4000000000000000 );
5781  bBigger:
5782     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5783     zExp = bExp;
5784     zSign ^= 1;
5785     goto normalizeRoundAndPack;
5786  aExpBigger:
5787     if ( aExp == 0x7FFF ) {
5788         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5789         return a;
5790     }
5791     if ( bExp == 0 ) {
5792         --expDiff;
5793     }
5794     else {
5795         bSig0 |= LIT64( 0x4000000000000000 );
5796     }
5797     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5798     aSig0 |= LIT64( 0x4000000000000000 );
5799  aBigger:
5800     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5801     zExp = aExp;
5802  normalizeRoundAndPack:
5803     --zExp;
5804     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5805
5806 }
5807
5808 /*----------------------------------------------------------------------------
5809 | Returns the result of adding the quadruple-precision floating-point values
5810 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5811 | for Binary Floating-Point Arithmetic.
5812 *----------------------------------------------------------------------------*/
5813
5814 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5815 {
5816     flag aSign, bSign;
5817
5818     aSign = extractFloat128Sign( a );
5819     bSign = extractFloat128Sign( b );
5820     if ( aSign == bSign ) {
5821         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5822     }
5823     else {
5824         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5825     }
5826
5827 }
5828
5829 /*----------------------------------------------------------------------------
5830 | Returns the result of subtracting the quadruple-precision floating-point
5831 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5832 | Standard for Binary Floating-Point Arithmetic.
5833 *----------------------------------------------------------------------------*/
5834
5835 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5836 {
5837     flag aSign, bSign;
5838
5839     aSign = extractFloat128Sign( a );
5840     bSign = extractFloat128Sign( b );
5841     if ( aSign == bSign ) {
5842         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5843     }
5844     else {
5845         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5846     }
5847
5848 }
5849
5850 /*----------------------------------------------------------------------------
5851 | Returns the result of multiplying the quadruple-precision floating-point
5852 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5853 | Standard for Binary Floating-Point Arithmetic.
5854 *----------------------------------------------------------------------------*/
5855
5856 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5857 {
5858     flag aSign, bSign, zSign;
5859     int32 aExp, bExp, zExp;
5860     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5861     float128 z;
5862
5863     aSig1 = extractFloat128Frac1( a );
5864     aSig0 = extractFloat128Frac0( a );
5865     aExp = extractFloat128Exp( a );
5866     aSign = extractFloat128Sign( a );
5867     bSig1 = extractFloat128Frac1( b );
5868     bSig0 = extractFloat128Frac0( b );
5869     bExp = extractFloat128Exp( b );
5870     bSign = extractFloat128Sign( b );
5871     zSign = aSign ^ bSign;
5872     if ( aExp == 0x7FFF ) {
5873         if (    ( aSig0 | aSig1 )
5874              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5875             return propagateFloat128NaN( a, b STATUS_VAR );
5876         }
5877         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5878         return packFloat128( zSign, 0x7FFF, 0, 0 );
5879     }
5880     if ( bExp == 0x7FFF ) {
5881         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5882         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5883  invalid:
5884             float_raise( float_flag_invalid STATUS_VAR);
5885             z.low = float128_default_nan_low;
5886             z.high = float128_default_nan_high;
5887             return z;
5888         }
5889         return packFloat128( zSign, 0x7FFF, 0, 0 );
5890     }
5891     if ( aExp == 0 ) {
5892         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5893         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5894     }
5895     if ( bExp == 0 ) {
5896         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5897         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5898     }
5899     zExp = aExp + bExp - 0x4000;
5900     aSig0 |= LIT64( 0x0001000000000000 );
5901     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5902     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5903     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5904     zSig2 |= ( zSig3 != 0 );
5905     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5906         shift128ExtraRightJamming(
5907             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5908         ++zExp;
5909     }
5910     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5911
5912 }
5913
5914 /*----------------------------------------------------------------------------
5915 | Returns the result of dividing the quadruple-precision floating-point value
5916 | `a' by the corresponding value `b'.  The operation is performed according to
5917 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5918 *----------------------------------------------------------------------------*/
5919
5920 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5921 {
5922     flag aSign, bSign, zSign;
5923     int32 aExp, bExp, zExp;
5924     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5925     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5926     float128 z;
5927
5928     aSig1 = extractFloat128Frac1( a );
5929     aSig0 = extractFloat128Frac0( a );
5930     aExp = extractFloat128Exp( a );
5931     aSign = extractFloat128Sign( a );
5932     bSig1 = extractFloat128Frac1( b );
5933     bSig0 = extractFloat128Frac0( b );
5934     bExp = extractFloat128Exp( b );
5935     bSign = extractFloat128Sign( b );
5936     zSign = aSign ^ bSign;
5937     if ( aExp == 0x7FFF ) {
5938         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5939         if ( bExp == 0x7FFF ) {
5940             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5941             goto invalid;
5942         }
5943         return packFloat128( zSign, 0x7FFF, 0, 0 );
5944     }
5945     if ( bExp == 0x7FFF ) {
5946         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5947         return packFloat128( zSign, 0, 0, 0 );
5948     }
5949     if ( bExp == 0 ) {
5950         if ( ( bSig0 | bSig1 ) == 0 ) {
5951             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5952  invalid:
5953                 float_raise( float_flag_invalid STATUS_VAR);
5954                 z.low = float128_default_nan_low;
5955                 z.high = float128_default_nan_high;
5956                 return z;
5957             }
5958             float_raise( float_flag_divbyzero STATUS_VAR);
5959             return packFloat128( zSign, 0x7FFF, 0, 0 );
5960         }
5961         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5962     }
5963     if ( aExp == 0 ) {
5964         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5965         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5966     }
5967     zExp = aExp - bExp + 0x3FFD;
5968     shortShift128Left(
5969         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5970     shortShift128Left(
5971         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5972     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5973         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5974         ++zExp;
5975     }
5976     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5977     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5978     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5979     while ( (int64_t) rem0 < 0 ) {
5980         --zSig0;
5981         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5982     }
5983     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5984     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5985         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5986         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5987         while ( (int64_t) rem1 < 0 ) {
5988             --zSig1;
5989             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5990         }
5991         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5992     }
5993     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5994     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5995
5996 }
5997
5998 /*----------------------------------------------------------------------------
5999 | Returns the remainder of the quadruple-precision floating-point value `a'
6000 | with respect to the corresponding value `b'.  The operation is performed
6001 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6002 *----------------------------------------------------------------------------*/
6003
6004 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6005 {
6006     flag aSign, zSign;
6007     int32 aExp, bExp, expDiff;
6008     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6009     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6010     int64_t sigMean0;
6011     float128 z;
6012
6013     aSig1 = extractFloat128Frac1( a );
6014     aSig0 = extractFloat128Frac0( a );
6015     aExp = extractFloat128Exp( a );
6016     aSign = extractFloat128Sign( a );
6017     bSig1 = extractFloat128Frac1( b );
6018     bSig0 = extractFloat128Frac0( b );
6019     bExp = extractFloat128Exp( b );
6020     if ( aExp == 0x7FFF ) {
6021         if (    ( aSig0 | aSig1 )
6022              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6023             return propagateFloat128NaN( a, b STATUS_VAR );
6024         }
6025         goto invalid;
6026     }
6027     if ( bExp == 0x7FFF ) {
6028         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6029         return a;
6030     }
6031     if ( bExp == 0 ) {
6032         if ( ( bSig0 | bSig1 ) == 0 ) {
6033  invalid:
6034             float_raise( float_flag_invalid STATUS_VAR);
6035             z.low = float128_default_nan_low;
6036             z.high = float128_default_nan_high;
6037             return z;
6038         }
6039         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6040     }
6041     if ( aExp == 0 ) {
6042         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6043         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6044     }
6045     expDiff = aExp - bExp;
6046     if ( expDiff < -1 ) return a;
6047     shortShift128Left(
6048         aSig0 | LIT64( 0x0001000000000000 ),
6049         aSig1,
6050         15 - ( expDiff < 0 ),
6051         &aSig0,
6052         &aSig1
6053     );
6054     shortShift128Left(
6055         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6056     q = le128( bSig0, bSig1, aSig0, aSig1 );
6057     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6058     expDiff -= 64;
6059     while ( 0 < expDiff ) {
6060         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6061         q = ( 4 < q ) ? q - 4 : 0;
6062         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6063         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6064         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6065         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6066         expDiff -= 61;
6067     }
6068     if ( -64 < expDiff ) {
6069         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6070         q = ( 4 < q ) ? q - 4 : 0;
6071         q >>= - expDiff;
6072         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6073         expDiff += 52;
6074         if ( expDiff < 0 ) {
6075             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6076         }
6077         else {
6078             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6079         }
6080         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6081         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6082     }
6083     else {
6084         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6085         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6086     }
6087     do {
6088         alternateASig0 = aSig0;
6089         alternateASig1 = aSig1;
6090         ++q;
6091         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6092     } while ( 0 <= (int64_t) aSig0 );
6093     add128(
6094         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6095     if (    ( sigMean0 < 0 )
6096          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6097         aSig0 = alternateASig0;
6098         aSig1 = alternateASig1;
6099     }
6100     zSign = ( (int64_t) aSig0 < 0 );
6101     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6102     return
6103         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6104
6105 }
6106
6107 /*----------------------------------------------------------------------------
6108 | Returns the square root of the quadruple-precision floating-point value `a'.
6109 | The operation is performed according to the IEC/IEEE Standard for Binary
6110 | Floating-Point Arithmetic.
6111 *----------------------------------------------------------------------------*/
6112
6113 float128 float128_sqrt( float128 a STATUS_PARAM )
6114 {
6115     flag aSign;
6116     int32 aExp, zExp;
6117     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6118     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6119     float128 z;
6120
6121     aSig1 = extractFloat128Frac1( a );
6122     aSig0 = extractFloat128Frac0( a );
6123     aExp = extractFloat128Exp( a );
6124     aSign = extractFloat128Sign( a );
6125     if ( aExp == 0x7FFF ) {
6126         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6127         if ( ! aSign ) return a;
6128         goto invalid;
6129     }
6130     if ( aSign ) {
6131         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6132  invalid:
6133         float_raise( float_flag_invalid STATUS_VAR);
6134         z.low = float128_default_nan_low;
6135         z.high = float128_default_nan_high;
6136         return z;
6137     }
6138     if ( aExp == 0 ) {
6139         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6140         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6141     }
6142     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6143     aSig0 |= LIT64( 0x0001000000000000 );
6144     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6145     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6146     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6147     doubleZSig0 = zSig0<<1;
6148     mul64To128( zSig0, zSig0, &term0, &term1 );
6149     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6150     while ( (int64_t) rem0 < 0 ) {
6151         --zSig0;
6152         doubleZSig0 -= 2;
6153         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6154     }
6155     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6156     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6157         if ( zSig1 == 0 ) zSig1 = 1;
6158         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6159         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6160         mul64To128( zSig1, zSig1, &term2, &term3 );
6161         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6162         while ( (int64_t) rem1 < 0 ) {
6163             --zSig1;
6164             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6165             term3 |= 1;
6166             term2 |= doubleZSig0;
6167             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6168         }
6169         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6170     }
6171     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6172     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6173
6174 }
6175
6176 /*----------------------------------------------------------------------------
6177 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6178 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6179 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6180 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6181 *----------------------------------------------------------------------------*/
6182
6183 int float128_eq( float128 a, float128 b STATUS_PARAM )
6184 {
6185
6186     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6187               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6188          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6189               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6190        ) {
6191         float_raise( float_flag_invalid STATUS_VAR);
6192         return 0;
6193     }
6194     return
6195            ( a.low == b.low )
6196         && (    ( a.high == b.high )
6197              || (    ( a.low == 0 )
6198                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6199            );
6200
6201 }
6202
6203 /*----------------------------------------------------------------------------
6204 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6205 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6206 | exception is raised if either operand is a NaN.  The comparison is performed
6207 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6208 *----------------------------------------------------------------------------*/
6209
6210 int float128_le( float128 a, float128 b STATUS_PARAM )
6211 {
6212     flag aSign, bSign;
6213
6214     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6215               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6216          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6217               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6218        ) {
6219         float_raise( float_flag_invalid STATUS_VAR);
6220         return 0;
6221     }
6222     aSign = extractFloat128Sign( a );
6223     bSign = extractFloat128Sign( b );
6224     if ( aSign != bSign ) {
6225         return
6226                aSign
6227             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6228                  == 0 );
6229     }
6230     return
6231           aSign ? le128( b.high, b.low, a.high, a.low )
6232         : le128( a.high, a.low, b.high, b.low );
6233
6234 }
6235
6236 /*----------------------------------------------------------------------------
6237 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6238 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6239 | raised if either operand is a NaN.  The comparison is performed according
6240 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6241 *----------------------------------------------------------------------------*/
6242
6243 int float128_lt( float128 a, float128 b STATUS_PARAM )
6244 {
6245     flag aSign, bSign;
6246
6247     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6248               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6249          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6250               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6251        ) {
6252         float_raise( float_flag_invalid STATUS_VAR);
6253         return 0;
6254     }
6255     aSign = extractFloat128Sign( a );
6256     bSign = extractFloat128Sign( b );
6257     if ( aSign != bSign ) {
6258         return
6259                aSign
6260             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6261                  != 0 );
6262     }
6263     return
6264           aSign ? lt128( b.high, b.low, a.high, a.low )
6265         : lt128( a.high, a.low, b.high, b.low );
6266
6267 }
6268
6269 /*----------------------------------------------------------------------------
6270 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6271 | be compared, and 0 otherwise.  The invalid exception is raised if either
6272 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6273 | Standard for Binary Floating-Point Arithmetic.
6274 *----------------------------------------------------------------------------*/
6275
6276 int float128_unordered( float128 a, float128 b STATUS_PARAM )
6277 {
6278     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6279               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6280          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6281               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6282        ) {
6283         float_raise( float_flag_invalid STATUS_VAR);
6284         return 1;
6285     }
6286     return 0;
6287 }
6288
6289 /*----------------------------------------------------------------------------
6290 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6291 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6292 | exception.  The comparison is performed according to the IEC/IEEE Standard
6293 | for Binary Floating-Point Arithmetic.
6294 *----------------------------------------------------------------------------*/
6295
6296 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6297 {
6298
6299     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6300               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6301          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6302               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6303        ) {
6304         if (    float128_is_signaling_nan( a )
6305              || float128_is_signaling_nan( b ) ) {
6306             float_raise( float_flag_invalid STATUS_VAR);
6307         }
6308         return 0;
6309     }
6310     return
6311            ( a.low == b.low )
6312         && (    ( a.high == b.high )
6313              || (    ( a.low == 0 )
6314                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6315            );
6316
6317 }
6318
6319 /*----------------------------------------------------------------------------
6320 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6321 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6322 | cause an exception.  Otherwise, the comparison is performed according to the
6323 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6324 *----------------------------------------------------------------------------*/
6325
6326 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6327 {
6328     flag aSign, bSign;
6329
6330     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6331               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6332          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6333               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6334        ) {
6335         if (    float128_is_signaling_nan( a )
6336              || float128_is_signaling_nan( b ) ) {
6337             float_raise( float_flag_invalid STATUS_VAR);
6338         }
6339         return 0;
6340     }
6341     aSign = extractFloat128Sign( a );
6342     bSign = extractFloat128Sign( b );
6343     if ( aSign != bSign ) {
6344         return
6345                aSign
6346             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6347                  == 0 );
6348     }
6349     return
6350           aSign ? le128( b.high, b.low, a.high, a.low )
6351         : le128( a.high, a.low, b.high, b.low );
6352
6353 }
6354
6355 /*----------------------------------------------------------------------------
6356 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6357 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6358 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6359 | Standard for Binary Floating-Point Arithmetic.
6360 *----------------------------------------------------------------------------*/
6361
6362 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6363 {
6364     flag aSign, bSign;
6365
6366     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6367               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6368          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6369               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6370        ) {
6371         if (    float128_is_signaling_nan( a )
6372              || float128_is_signaling_nan( b ) ) {
6373             float_raise( float_flag_invalid STATUS_VAR);
6374         }
6375         return 0;
6376     }
6377     aSign = extractFloat128Sign( a );
6378     bSign = extractFloat128Sign( b );
6379     if ( aSign != bSign ) {
6380         return
6381                aSign
6382             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6383                  != 0 );
6384     }
6385     return
6386           aSign ? lt128( b.high, b.low, a.high, a.low )
6387         : lt128( a.high, a.low, b.high, b.low );
6388
6389 }
6390
6391 /*----------------------------------------------------------------------------
6392 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6393 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6394 | comparison is performed according to the IEC/IEEE Standard for Binary
6395 | Floating-Point Arithmetic.
6396 *----------------------------------------------------------------------------*/
6397
6398 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6399 {
6400     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6401               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6402          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6403               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6404        ) {
6405         if (    float128_is_signaling_nan( a )
6406              || float128_is_signaling_nan( b ) ) {
6407             float_raise( float_flag_invalid STATUS_VAR);
6408         }
6409         return 1;
6410     }
6411     return 0;
6412 }
6413
6414 /* misc functions */
6415 float32 uint32_to_float32( uint32 a STATUS_PARAM )
6416 {
6417     return int64_to_float32(a STATUS_VAR);
6418 }
6419
6420 float64 uint32_to_float64( uint32 a STATUS_PARAM )
6421 {
6422     return int64_to_float64(a STATUS_VAR);
6423 }
6424
6425 uint32 float32_to_uint32( float32 a STATUS_PARAM )
6426 {
6427     int64_t v;
6428     uint32 res;
6429
6430     v = float32_to_int64(a STATUS_VAR);
6431     if (v < 0) {
6432         res = 0;
6433         float_raise( float_flag_invalid STATUS_VAR);
6434     } else if (v > 0xffffffff) {
6435         res = 0xffffffff;
6436         float_raise( float_flag_invalid STATUS_VAR);
6437     } else {
6438         res = v;
6439     }
6440     return res;
6441 }
6442
6443 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6444 {
6445     int64_t v;
6446     uint32 res;
6447
6448     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6449     if (v < 0) {
6450         res = 0;
6451         float_raise( float_flag_invalid STATUS_VAR);
6452     } else if (v > 0xffffffff) {
6453         res = 0xffffffff;
6454         float_raise( float_flag_invalid STATUS_VAR);
6455     } else {
6456         res = v;
6457     }
6458     return res;
6459 }
6460
6461 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6462 {
6463     int64_t v;
6464     uint_fast16_t res;
6465
6466     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6467     if (v < 0) {
6468         res = 0;
6469         float_raise( float_flag_invalid STATUS_VAR);
6470     } else if (v > 0xffff) {
6471         res = 0xffff;
6472         float_raise( float_flag_invalid STATUS_VAR);
6473     } else {
6474         res = v;
6475     }
6476     return res;
6477 }
6478
6479 uint32 float64_to_uint32( float64 a STATUS_PARAM )
6480 {
6481     int64_t v;
6482     uint32 res;
6483
6484     v = float64_to_int64(a STATUS_VAR);
6485     if (v < 0) {
6486         res = 0;
6487         float_raise( float_flag_invalid STATUS_VAR);
6488     } else if (v > 0xffffffff) {
6489         res = 0xffffffff;
6490         float_raise( float_flag_invalid STATUS_VAR);
6491     } else {
6492         res = v;
6493     }
6494     return res;
6495 }
6496
6497 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6498 {
6499     int64_t v;
6500     uint32 res;
6501
6502     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6503     if (v < 0) {
6504         res = 0;
6505         float_raise( float_flag_invalid STATUS_VAR);
6506     } else if (v > 0xffffffff) {
6507         res = 0xffffffff;
6508         float_raise( float_flag_invalid STATUS_VAR);
6509     } else {
6510         res = v;
6511     }
6512     return res;
6513 }
6514
6515 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6516 {
6517     int64_t v;
6518     uint_fast16_t res;
6519
6520     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6521     if (v < 0) {
6522         res = 0;
6523         float_raise( float_flag_invalid STATUS_VAR);
6524     } else if (v > 0xffff) {
6525         res = 0xffff;
6526         float_raise( float_flag_invalid STATUS_VAR);
6527     } else {
6528         res = v;
6529     }
6530     return res;
6531 }
6532
6533 /* FIXME: This looks broken.  */
6534 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6535 {
6536     int64_t v;
6537
6538     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6539     v += float64_val(a);
6540     v = float64_to_int64(make_float64(v) STATUS_VAR);
6541
6542     return v - INT64_MIN;
6543 }
6544
6545 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6546 {
6547     int64_t v;
6548
6549     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6550     v += float64_val(a);
6551     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6552
6553     return v - INT64_MIN;
6554 }
6555
6556 #define COMPARE(s, nan_exp)                                                  \
6557 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6558                                       int is_quiet STATUS_PARAM )            \
6559 {                                                                            \
6560     flag aSign, bSign;                                                       \
6561     uint ## s ## _t av, bv;                                                  \
6562     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6563     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6564                                                                              \
6565     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6566          extractFloat ## s ## Frac( a ) ) ||                                 \
6567         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6568           extractFloat ## s ## Frac( b ) )) {                                \
6569         if (!is_quiet ||                                                     \
6570             float ## s ## _is_signaling_nan( a ) ||                          \
6571             float ## s ## _is_signaling_nan( b ) ) {                         \
6572             float_raise( float_flag_invalid STATUS_VAR);                     \
6573         }                                                                    \
6574         return float_relation_unordered;                                     \
6575     }                                                                        \
6576     aSign = extractFloat ## s ## Sign( a );                                  \
6577     bSign = extractFloat ## s ## Sign( b );                                  \
6578     av = float ## s ## _val(a);                                              \
6579     bv = float ## s ## _val(b);                                              \
6580     if ( aSign != bSign ) {                                                  \
6581         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6582             /* zero case */                                                  \
6583             return float_relation_equal;                                     \
6584         } else {                                                             \
6585             return 1 - (2 * aSign);                                          \
6586         }                                                                    \
6587     } else {                                                                 \
6588         if (av == bv) {                                                      \
6589             return float_relation_equal;                                     \
6590         } else {                                                             \
6591             return 1 - 2 * (aSign ^ ( av < bv ));                            \
6592         }                                                                    \
6593     }                                                                        \
6594 }                                                                            \
6595                                                                              \
6596 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6597 {                                                                            \
6598     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6599 }                                                                            \
6600                                                                              \
6601 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6602 {                                                                            \
6603     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6604 }
6605
6606 COMPARE(32, 0xff)
6607 COMPARE(64, 0x7ff)
6608
6609 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6610                                       int is_quiet STATUS_PARAM )
6611 {
6612     flag aSign, bSign;
6613
6614     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6615           ( extractFloatx80Frac( a )<<1 ) ) ||
6616         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6617           ( extractFloatx80Frac( b )<<1 ) )) {
6618         if (!is_quiet ||
6619             floatx80_is_signaling_nan( a ) ||
6620             floatx80_is_signaling_nan( b ) ) {
6621             float_raise( float_flag_invalid STATUS_VAR);
6622         }
6623         return float_relation_unordered;
6624     }
6625     aSign = extractFloatx80Sign( a );
6626     bSign = extractFloatx80Sign( b );
6627     if ( aSign != bSign ) {
6628
6629         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6630              ( ( a.low | b.low ) == 0 ) ) {
6631             /* zero case */
6632             return float_relation_equal;
6633         } else {
6634             return 1 - (2 * aSign);
6635         }
6636     } else {
6637         if (a.low == b.low && a.high == b.high) {
6638             return float_relation_equal;
6639         } else {
6640             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6641         }
6642     }
6643 }
6644
6645 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6646 {
6647     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6648 }
6649
6650 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6651 {
6652     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6653 }
6654
6655 INLINE int float128_compare_internal( float128 a, float128 b,
6656                                       int is_quiet STATUS_PARAM )
6657 {
6658     flag aSign, bSign;
6659
6660     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6661           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6662         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6663           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6664         if (!is_quiet ||
6665             float128_is_signaling_nan( a ) ||
6666             float128_is_signaling_nan( b ) ) {
6667             float_raise( float_flag_invalid STATUS_VAR);
6668         }
6669         return float_relation_unordered;
6670     }
6671     aSign = extractFloat128Sign( a );
6672     bSign = extractFloat128Sign( b );
6673     if ( aSign != bSign ) {
6674         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6675             /* zero case */
6676             return float_relation_equal;
6677         } else {
6678             return 1 - (2 * aSign);
6679         }
6680     } else {
6681         if (a.low == b.low && a.high == b.high) {
6682             return float_relation_equal;
6683         } else {
6684             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6685         }
6686     }
6687 }
6688
6689 int float128_compare( float128 a, float128 b STATUS_PARAM )
6690 {
6691     return float128_compare_internal(a, b, 0 STATUS_VAR);
6692 }
6693
6694 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6695 {
6696     return float128_compare_internal(a, b, 1 STATUS_VAR);
6697 }
6698
6699 /* min() and max() functions. These can't be implemented as
6700  * 'compare and pick one input' because that would mishandle
6701  * NaNs and +0 vs -0.
6702  */
6703 #define MINMAX(s, nan_exp)                                              \
6704 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6705                                         int ismin STATUS_PARAM )        \
6706 {                                                                       \
6707     flag aSign, bSign;                                                  \
6708     uint ## s ## _t av, bv;                                             \
6709     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6710     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6711     if (float ## s ## _is_any_nan(a) ||                                 \
6712         float ## s ## _is_any_nan(b)) {                                 \
6713         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6714     }                                                                   \
6715     aSign = extractFloat ## s ## Sign(a);                               \
6716     bSign = extractFloat ## s ## Sign(b);                               \
6717     av = float ## s ## _val(a);                                         \
6718     bv = float ## s ## _val(b);                                         \
6719     if (aSign != bSign) {                                               \
6720         if (ismin) {                                                    \
6721             return aSign ? a : b;                                       \
6722         } else {                                                        \
6723             return aSign ? b : a;                                       \
6724         }                                                               \
6725     } else {                                                            \
6726         if (ismin) {                                                    \
6727             return (aSign ^ (av < bv)) ? a : b;                         \
6728         } else {                                                        \
6729             return (aSign ^ (av < bv)) ? b : a;                         \
6730         }                                                               \
6731     }                                                                   \
6732 }                                                                       \
6733                                                                         \
6734 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6735 {                                                                       \
6736     return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6737 }                                                                       \
6738                                                                         \
6739 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6740 {                                                                       \
6741     return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6742 }
6743
6744 MINMAX(32, 0xff)
6745 MINMAX(64, 0x7ff)
6746
6747
6748 /* Multiply A by 2 raised to the power N.  */
6749 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6750 {
6751     flag aSign;
6752     int16_t aExp;
6753     uint32_t aSig;
6754
6755     a = float32_squash_input_denormal(a STATUS_VAR);
6756     aSig = extractFloat32Frac( a );
6757     aExp = extractFloat32Exp( a );
6758     aSign = extractFloat32Sign( a );
6759
6760     if ( aExp == 0xFF ) {
6761         if ( aSig ) {
6762             return propagateFloat32NaN( a, a STATUS_VAR );
6763         }
6764         return a;
6765     }
6766     if ( aExp != 0 )
6767         aSig |= 0x00800000;
6768     else if ( aSig == 0 )
6769         return a;
6770
6771     if (n > 0x200) {
6772         n = 0x200;
6773     } else if (n < -0x200) {
6774         n = -0x200;
6775     }
6776
6777     aExp += n - 1;
6778     aSig <<= 7;
6779     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6780 }
6781
6782 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6783 {
6784     flag aSign;
6785     int16_t aExp;
6786     uint64_t aSig;
6787
6788     a = float64_squash_input_denormal(a STATUS_VAR);
6789     aSig = extractFloat64Frac( a );
6790     aExp = extractFloat64Exp( a );
6791     aSign = extractFloat64Sign( a );
6792
6793     if ( aExp == 0x7FF ) {
6794         if ( aSig ) {
6795             return propagateFloat64NaN( a, a STATUS_VAR );
6796         }
6797         return a;
6798     }
6799     if ( aExp != 0 )
6800         aSig |= LIT64( 0x0010000000000000 );
6801     else if ( aSig == 0 )
6802         return a;
6803
6804     if (n > 0x1000) {
6805         n = 0x1000;
6806     } else if (n < -0x1000) {
6807         n = -0x1000;
6808     }
6809
6810     aExp += n - 1;
6811     aSig <<= 10;
6812     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6813 }
6814
6815 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6816 {
6817     flag aSign;
6818     int32_t aExp;
6819     uint64_t aSig;
6820
6821     aSig = extractFloatx80Frac( a );
6822     aExp = extractFloatx80Exp( a );
6823     aSign = extractFloatx80Sign( a );
6824
6825     if ( aExp == 0x7FFF ) {
6826         if ( aSig<<1 ) {
6827             return propagateFloatx80NaN( a, a STATUS_VAR );
6828         }
6829         return a;
6830     }
6831
6832     if (aExp == 0 && aSig == 0)
6833         return a;
6834
6835     if (n > 0x10000) {
6836         n = 0x10000;
6837     } else if (n < -0x10000) {
6838         n = -0x10000;
6839     }
6840
6841     aExp += n;
6842     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6843                                           aSign, aExp, aSig, 0 STATUS_VAR );
6844 }
6845
6846 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6847 {
6848     flag aSign;
6849     int32_t aExp;
6850     uint64_t aSig0, aSig1;
6851
6852     aSig1 = extractFloat128Frac1( a );
6853     aSig0 = extractFloat128Frac0( a );
6854     aExp = extractFloat128Exp( a );
6855     aSign = extractFloat128Sign( a );
6856     if ( aExp == 0x7FFF ) {
6857         if ( aSig0 | aSig1 ) {
6858             return propagateFloat128NaN( a, a STATUS_VAR );
6859         }
6860         return a;
6861     }
6862     if ( aExp != 0 )
6863         aSig0 |= LIT64( 0x0001000000000000 );
6864     else if ( aSig0 == 0 && aSig1 == 0 )
6865         return a;
6866
6867     if (n > 0x10000) {
6868         n = 0x10000;
6869     } else if (n < -0x10000) {
6870         n = -0x10000;
6871     }
6872
6873     aExp += n - 1;
6874     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6875                                           STATUS_VAR );
6876
6877 }