fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * Derived from SoftFloat.
   5  */
   6
   7 /*============================================================================
   8
   9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  10 Package, Release 2b.
  11
  12 Written by John R. Hauser.  This work was made possible in part by the
  13 International Computer Science Institute, located at Suite 600, 1947 Center
  14 Street, Berkeley, California 94704.  Funding was partially provided by the
  15 National Science Foundation under grant MIP-9311980.  The original version
  16 of this code was written as part of a project to build a fixed-point vector
  17 processor in collaboration with the University of California at Berkeley,
  18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  20 arithmetic/SoftFloat.html'.
  21
  22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  30
  31 Derivative works are acceptable, even for commercial purposes, so long as
  32 (1) the source code for the derivative work includes prominent notice that
  33 the work is derivative, and (2) the source code includes prominent notice with
  34 these four paragraphs for those parts of this code that are retained.
  35
  36 =============================================================================*/
  37
  38 /* softfloat (and in particular the code in softfloat-specialize.h) is
  39  * target-dependent and needs the TARGET_* macros.
  40  */
  41 #include "config.h"
  42
  43 #include "fpu/softfloat.h"
  44
  45 /* We only need stdlib for abort() */
  46 #include <stdlib.h>
  47
  48 /*----------------------------------------------------------------------------
  49 | Primitive arithmetic functions, including multi-word arithmetic, and
  50 | division and square root approximations.  (Can be specialized to target if
  51 | desired.)
  52 *----------------------------------------------------------------------------*/
  53 #include "softfloat-macros.h"
  54
  55 /*----------------------------------------------------------------------------
  56 | Functions and definitions to determine:  (1) whether tininess for underflow
  57 | is detected before or after rounding by default, (2) what (if anything)
  58 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  59 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  60 | are propagated from function inputs to output.  These details are target-
  61 | specific.
  62 *----------------------------------------------------------------------------*/
  63 #include "softfloat-specialize.h"
  64
  65 /*----------------------------------------------------------------------------
  66 | Returns the fraction bits of the half-precision floating-point value `a'.
  67 *----------------------------------------------------------------------------*/
  68
  69 INLINE uint32_t extractFloat16Frac(float16 a)
  70 {
  71     return float16_val(a) & 0x3ff;
  72 }
  73
  74 /*----------------------------------------------------------------------------
  75 | Returns the exponent bits of the half-precision floating-point value `a'.
  76 *----------------------------------------------------------------------------*/
  77
  78 INLINE int_fast16_t extractFloat16Exp(float16 a)
  79 {
  80     return (float16_val(a) >> 10) & 0x1f;
  81 }
  82
  83 /*----------------------------------------------------------------------------
  84 | Returns the sign bit of the half-precision floating-point value `a'.
  85 *----------------------------------------------------------------------------*/
  86
  87 INLINE flag extractFloat16Sign(float16 a)
  88 {
  89     return float16_val(a)>>15;
  90 }
  91
  92 /*----------------------------------------------------------------------------
  93 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  94 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  95 | input.  If `zSign' is 1, the input is negated before being converted to an
  96 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  97 | is simply rounded to an integer, with the inexact exception raised if the
  98 | input cannot be represented exactly as an integer.  However, if the fixed-
  99 | point input is too large, the invalid exception is raised and the largest
 100 | positive or negative integer is returned.
 101 *----------------------------------------------------------------------------*/
 102
 103 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
 104 {
 105     int8 roundingMode;
 106     flag roundNearestEven;
 107     int8 roundIncrement, roundBits;
 108     int32_t z;
 109
 110     roundingMode = STATUS(float_rounding_mode);
 111     roundNearestEven = ( roundingMode == float_round_nearest_even );
 112     switch (roundingMode) {
 113     case float_round_nearest_even:
 114     case float_round_ties_away:
 115         roundIncrement = 0x40;
 116         break;
 117     case float_round_to_zero:
 118         roundIncrement = 0;
 119         break;
 120     case float_round_up:
 121         roundIncrement = zSign ? 0 : 0x7f;
 122         break;
 123     case float_round_down:
 124         roundIncrement = zSign ? 0x7f : 0;
 125         break;
 126     default:
 127         abort();
 128     }
 129     roundBits = absZ & 0x7F;
 130     absZ = ( absZ + roundIncrement )>>7;
 131     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 132     z = absZ;
 133     if ( zSign ) z = - z;
 134     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 135         float_raise( float_flag_invalid STATUS_VAR);
 136         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 137     }
 138     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 139     return z;
 140
 141 }
 142
 143 /*----------------------------------------------------------------------------
 144 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 145 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 146 | and returns the properly rounded 64-bit integer corresponding to the input.
 147 | If `zSign' is 1, the input is negated before being converted to an integer.
 148 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 149 | the inexact exception raised if the input cannot be represented exactly as
 150 | an integer.  However, if the fixed-point input is too large, the invalid
 151 | exception is raised and the largest positive or negative integer is
 152 | returned.
 153 *----------------------------------------------------------------------------*/
 154
 155 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
 156 {
 157     int8 roundingMode;
 158     flag roundNearestEven, increment;
 159     int64_t z;
 160
 161     roundingMode = STATUS(float_rounding_mode);
 162     roundNearestEven = ( roundingMode == float_round_nearest_even );
 163     switch (roundingMode) {
 164     case float_round_nearest_even:
 165     case float_round_ties_away:
 166         increment = ((int64_t) absZ1 < 0);
 167         break;
 168     case float_round_to_zero:
 169         increment = 0;
 170         break;
 171     case float_round_up:
 172         increment = !zSign && absZ1;
 173         break;
 174     case float_round_down:
 175         increment = zSign && absZ1;
 176         break;
 177     default:
 178         abort();
 179     }
 180     if ( increment ) {
 181         ++absZ0;
 182         if ( absZ0 == 0 ) goto overflow;
 183         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 184     }
 185     z = absZ0;
 186     if ( zSign ) z = - z;
 187     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 188  overflow:
 189         float_raise( float_flag_invalid STATUS_VAR);
 190         return
 191               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 192             : LIT64( 0x7FFFFFFFFFFFFFFF );
 193     }
 194     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 195     return z;
 196
 197 }
 198
 199 /*----------------------------------------------------------------------------
 200 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 201 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 202 | and returns the properly rounded 64-bit unsigned integer corresponding to the
 203 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
 204 | with the inexact exception raised if the input cannot be represented exactly
 205 | as an integer.  However, if the fixed-point input is too large, the invalid
 206 | exception is raised and the largest unsigned integer is returned.
 207 *----------------------------------------------------------------------------*/
 208
 209 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
 210                                 uint64_t absZ1 STATUS_PARAM)
 211 {
 212     int8 roundingMode;
 213     flag roundNearestEven, increment;
 214
 215     roundingMode = STATUS(float_rounding_mode);
 216     roundNearestEven = (roundingMode == float_round_nearest_even);
 217     switch (roundingMode) {
 218     case float_round_nearest_even:
 219     case float_round_ties_away:
 220         increment = ((int64_t)absZ1 < 0);
 221         break;
 222     case float_round_to_zero:
 223         increment = 0;
 224         break;
 225     case float_round_up:
 226         increment = !zSign && absZ1;
 227         break;
 228     case float_round_down:
 229         increment = zSign && absZ1;
 230         break;
 231     default:
 232         abort();
 233     }
 234     if (increment) {
 235         ++absZ0;
 236         if (absZ0 == 0) {
 237             float_raise(float_flag_invalid STATUS_VAR);
 238             return LIT64(0xFFFFFFFFFFFFFFFF);
 239         }
 240         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
 241     }
 242
 243     if (zSign && absZ0) {
 244         float_raise(float_flag_invalid STATUS_VAR);
 245         return 0;
 246     }
 247
 248     if (absZ1) {
 249         STATUS(float_exception_flags) |= float_flag_inexact;
 250     }
 251     return absZ0;
 252 }
 253
 254 /*----------------------------------------------------------------------------
 255 | Returns the fraction bits of the single-precision floating-point value `a'.
 256 *----------------------------------------------------------------------------*/
 257
 258 INLINE uint32_t extractFloat32Frac( float32 a )
 259 {
 260
 261     return float32_val(a) & 0x007FFFFF;
 262
 263 }
 264
 265 /*----------------------------------------------------------------------------
 266 | Returns the exponent bits of the single-precision floating-point value `a'.
 267 *----------------------------------------------------------------------------*/
 268
 269 INLINE int_fast16_t extractFloat32Exp(float32 a)
 270 {
 271
 272     return ( float32_val(a)>>23 ) & 0xFF;
 273
 274 }
 275
 276 /*----------------------------------------------------------------------------
 277 | Returns the sign bit of the single-precision floating-point value `a'.
 278 *----------------------------------------------------------------------------*/
 279
 280 INLINE flag extractFloat32Sign( float32 a )
 281 {
 282
 283     return float32_val(a)>>31;
 284
 285 }
 286
 287 /*----------------------------------------------------------------------------
 288 | If `a' is denormal and we are in flush-to-zero mode then set the
 289 | input-denormal exception and return zero. Otherwise just return the value.
 290 *----------------------------------------------------------------------------*/
 291 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 292 {
 293     if (STATUS(flush_inputs_to_zero)) {
 294         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 295             float_raise(float_flag_input_denormal STATUS_VAR);
 296             return make_float32(float32_val(a) & 0x80000000);
 297         }
 298     }
 299     return a;
 300 }
 301
 302 /*----------------------------------------------------------------------------
 303 | Normalizes the subnormal single-precision floating-point value represented
 304 | by the denormalized significand `aSig'.  The normalized exponent and
 305 | significand are stored at the locations pointed to by `zExpPtr' and
 306 | `zSigPtr', respectively.
 307 *----------------------------------------------------------------------------*/
 308
 309 static void
 310  normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
 311 {
 312     int8 shiftCount;
 313
 314     shiftCount = countLeadingZeros32( aSig ) - 8;
 315     *zSigPtr = aSig<<shiftCount;
 316     *zExpPtr = 1 - shiftCount;
 317
 318 }
 319
 320 /*----------------------------------------------------------------------------
 321 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 322 | single-precision floating-point value, returning the result.  After being
 323 | shifted into the proper positions, the three fields are simply added
 324 | together to form the result.  This means that any integer portion of `zSig'
 325 | will be added into the exponent.  Since a properly normalized significand
 326 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 327 | than the desired result exponent whenever `zSig' is a complete, normalized
 328 | significand.
 329 *----------------------------------------------------------------------------*/
 330
 331 INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
 332 {
 333
 334     return make_float32(
 335           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 336
 337 }
 338
 339 /*----------------------------------------------------------------------------
 340 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 341 | and significand `zSig', and returns the proper single-precision floating-
 342 | point value corresponding to the abstract input.  Ordinarily, the abstract
 343 | value is simply rounded and packed into the single-precision format, with
 344 | the inexact exception raised if the abstract input cannot be represented
 345 | exactly.  However, if the abstract value is too large, the overflow and
 346 | inexact exceptions are raised and an infinity or maximal finite value is
 347 | returned.  If the abstract value is too small, the input value is rounded to
 348 | a subnormal number, and the underflow and inexact exceptions are raised if
 349 | the abstract input cannot be represented exactly as a subnormal single-
 350 | precision floating-point number.
 351 |     The input significand `zSig' has its binary point between bits 30
 352 | and 29, which is 7 bits to the left of the usual location.  This shifted
 353 | significand must be normalized or smaller.  If `zSig' is not normalized,
 354 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 355 | and it must not require rounding.  In the usual case that `zSig' is
 356 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 357 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 358 | Binary Floating-Point Arithmetic.
 359 *----------------------------------------------------------------------------*/
 360
 361 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
 362 {
 363     int8 roundingMode;
 364     flag roundNearestEven;
 365     int8 roundIncrement, roundBits;
 366     flag isTiny;
 367
 368     roundingMode = STATUS(float_rounding_mode);
 369     roundNearestEven = ( roundingMode == float_round_nearest_even );
 370     switch (roundingMode) {
 371     case float_round_nearest_even:
 372     case float_round_ties_away:
 373         roundIncrement = 0x40;
 374         break;
 375     case float_round_to_zero:
 376         roundIncrement = 0;
 377         break;
 378     case float_round_up:
 379         roundIncrement = zSign ? 0 : 0x7f;
 380         break;
 381     case float_round_down:
 382         roundIncrement = zSign ? 0x7f : 0;
 383         break;
 384     default:
 385         abort();
 386         break;
 387     }
 388     roundBits = zSig & 0x7F;
 389     if ( 0xFD <= (uint16_t) zExp ) {
 390         if (    ( 0xFD < zExp )
 391              || (    ( zExp == 0xFD )
 392                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 393            ) {
 394             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 395             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 396         }
 397         if ( zExp < 0 ) {
 398             if (STATUS(flush_to_zero)) {
 399                 float_raise(float_flag_output_denormal STATUS_VAR);
 400                 return packFloat32(zSign, 0, 0);
 401             }
 402             isTiny =
 403                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 404                 || ( zExp < -1 )
 405                 || ( zSig + roundIncrement < 0x80000000 );
 406             shift32RightJamming( zSig, - zExp, &zSig );
 407             zExp = 0;
 408             roundBits = zSig & 0x7F;
 409             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 410         }
 411     }
 412     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 413     zSig = ( zSig + roundIncrement )>>7;
 414     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 415     if ( zSig == 0 ) zExp = 0;
 416     return packFloat32( zSign, zExp, zSig );
 417
 418 }
 419
 420 /*----------------------------------------------------------------------------
 421 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 422 | and significand `zSig', and returns the proper single-precision floating-
 423 | point value corresponding to the abstract input.  This routine is just like
 424 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 425 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 426 | floating-point exponent.
 427 *----------------------------------------------------------------------------*/
 428
 429 static float32
 430  normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
 431 {
 432     int8 shiftCount;
 433
 434     shiftCount = countLeadingZeros32( zSig ) - 1;
 435     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 436
 437 }
 438
 439 /*----------------------------------------------------------------------------
 440 | Returns the fraction bits of the double-precision floating-point value `a'.
 441 *----------------------------------------------------------------------------*/
 442
 443 INLINE uint64_t extractFloat64Frac( float64 a )
 444 {
 445
 446     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 447
 448 }
 449
 450 /*----------------------------------------------------------------------------
 451 | Returns the exponent bits of the double-precision floating-point value `a'.
 452 *----------------------------------------------------------------------------*/
 453
 454 INLINE int_fast16_t extractFloat64Exp(float64 a)
 455 {
 456
 457     return ( float64_val(a)>>52 ) & 0x7FF;
 458
 459 }
 460
 461 /*----------------------------------------------------------------------------
 462 | Returns the sign bit of the double-precision floating-point value `a'.
 463 *----------------------------------------------------------------------------*/
 464
 465 INLINE flag extractFloat64Sign( float64 a )
 466 {
 467
 468     return float64_val(a)>>63;
 469
 470 }
 471
 472 /*----------------------------------------------------------------------------
 473 | If `a' is denormal and we are in flush-to-zero mode then set the
 474 | input-denormal exception and return zero. Otherwise just return the value.
 475 *----------------------------------------------------------------------------*/
 476 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 477 {
 478     if (STATUS(flush_inputs_to_zero)) {
 479         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 480             float_raise(float_flag_input_denormal STATUS_VAR);
 481             return make_float64(float64_val(a) & (1ULL << 63));
 482         }
 483     }
 484     return a;
 485 }
 486
 487 /*----------------------------------------------------------------------------
 488 | Normalizes the subnormal double-precision floating-point value represented
 489 | by the denormalized significand `aSig'.  The normalized exponent and
 490 | significand are stored at the locations pointed to by `zExpPtr' and
 491 | `zSigPtr', respectively.
 492 *----------------------------------------------------------------------------*/
 493
 494 static void
 495  normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
 496 {
 497     int8 shiftCount;
 498
 499     shiftCount = countLeadingZeros64( aSig ) - 11;
 500     *zSigPtr = aSig<<shiftCount;
 501     *zExpPtr = 1 - shiftCount;
 502
 503 }
 504
 505 /*----------------------------------------------------------------------------
 506 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 507 | double-precision floating-point value, returning the result.  After being
 508 | shifted into the proper positions, the three fields are simply added
 509 | together to form the result.  This means that any integer portion of `zSig'
 510 | will be added into the exponent.  Since a properly normalized significand
 511 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 512 | than the desired result exponent whenever `zSig' is a complete, normalized
 513 | significand.
 514 *----------------------------------------------------------------------------*/
 515
 516 INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
 517 {
 518
 519     return make_float64(
 520         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 521
 522 }
 523
 524 /*----------------------------------------------------------------------------
 525 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 526 | and significand `zSig', and returns the proper double-precision floating-
 527 | point value corresponding to the abstract input.  Ordinarily, the abstract
 528 | value is simply rounded and packed into the double-precision format, with
 529 | the inexact exception raised if the abstract input cannot be represented
 530 | exactly.  However, if the abstract value is too large, the overflow and
 531 | inexact exceptions are raised and an infinity or maximal finite value is
 532 | returned.  If the abstract value is too small, the input value is rounded
 533 | to a subnormal number, and the underflow and inexact exceptions are raised
 534 | if the abstract input cannot be represented exactly as a subnormal double-
 535 | precision floating-point number.
 536 |     The input significand `zSig' has its binary point between bits 62
 537 | and 61, which is 10 bits to the left of the usual location.  This shifted
 538 | significand must be normalized or smaller.  If `zSig' is not normalized,
 539 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 540 | and it must not require rounding.  In the usual case that `zSig' is
 541 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 542 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 543 | Binary Floating-Point Arithmetic.
 544 *----------------------------------------------------------------------------*/
 545
 546 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
 547 {
 548     int8 roundingMode;
 549     flag roundNearestEven;
 550     int_fast16_t roundIncrement, roundBits;
 551     flag isTiny;
 552
 553     roundingMode = STATUS(float_rounding_mode);
 554     roundNearestEven = ( roundingMode == float_round_nearest_even );
 555     switch (roundingMode) {
 556     case float_round_nearest_even:
 557     case float_round_ties_away:
 558         roundIncrement = 0x200;
 559         break;
 560     case float_round_to_zero:
 561         roundIncrement = 0;
 562         break;
 563     case float_round_up:
 564         roundIncrement = zSign ? 0 : 0x3ff;
 565         break;
 566     case float_round_down:
 567         roundIncrement = zSign ? 0x3ff : 0;
 568         break;
 569     default:
 570         abort();
 571     }
 572     roundBits = zSig & 0x3FF;
 573     if ( 0x7FD <= (uint16_t) zExp ) {
 574         if (    ( 0x7FD < zExp )
 575              || (    ( zExp == 0x7FD )
 576                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 577            ) {
 578             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 579             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 580         }
 581         if ( zExp < 0 ) {
 582             if (STATUS(flush_to_zero)) {
 583                 float_raise(float_flag_output_denormal STATUS_VAR);
 584                 return packFloat64(zSign, 0, 0);
 585             }
 586             isTiny =
 587                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 588                 || ( zExp < -1 )
 589                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 590             shift64RightJamming( zSig, - zExp, &zSig );
 591             zExp = 0;
 592             roundBits = zSig & 0x3FF;
 593             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 594         }
 595     }
 596     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 597     zSig = ( zSig + roundIncrement )>>10;
 598     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 599     if ( zSig == 0 ) zExp = 0;
 600     return packFloat64( zSign, zExp, zSig );
 601
 602 }
 603
 604 /*----------------------------------------------------------------------------
 605 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 606 | and significand `zSig', and returns the proper double-precision floating-
 607 | point value corresponding to the abstract input.  This routine is just like
 608 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 609 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 610 | floating-point exponent.
 611 *----------------------------------------------------------------------------*/
 612
 613 static float64
 614  normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
 615 {
 616     int8 shiftCount;
 617
 618     shiftCount = countLeadingZeros64( zSig ) - 1;
 619     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 620
 621 }
 622
 623 /*----------------------------------------------------------------------------
 624 | Returns the fraction bits of the extended double-precision floating-point
 625 | value `a'.
 626 *----------------------------------------------------------------------------*/
 627
 628 INLINE uint64_t extractFloatx80Frac( floatx80 a )
 629 {
 630
 631     return a.low;
 632
 633 }
 634
 635 /*----------------------------------------------------------------------------
 636 | Returns the exponent bits of the extended double-precision floating-point
 637 | value `a'.
 638 *----------------------------------------------------------------------------*/
 639
 640 INLINE int32 extractFloatx80Exp( floatx80 a )
 641 {
 642
 643     return a.high & 0x7FFF;
 644
 645 }
 646
 647 /*----------------------------------------------------------------------------
 648 | Returns the sign bit of the extended double-precision floating-point value
 649 | `a'.
 650 *----------------------------------------------------------------------------*/
 651
 652 INLINE flag extractFloatx80Sign( floatx80 a )
 653 {
 654
 655     return a.high>>15;
 656
 657 }
 658
 659 /*----------------------------------------------------------------------------
 660 | Normalizes the subnormal extended double-precision floating-point value
 661 | represented by the denormalized significand `aSig'.  The normalized exponent
 662 | and significand are stored at the locations pointed to by `zExpPtr' and
 663 | `zSigPtr', respectively.
 664 *----------------------------------------------------------------------------*/
 665
 666 static void
 667  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 668 {
 669     int8 shiftCount;
 670
 671     shiftCount = countLeadingZeros64( aSig );
 672     *zSigPtr = aSig<<shiftCount;
 673     *zExpPtr = 1 - shiftCount;
 674
 675 }
 676
 677 /*----------------------------------------------------------------------------
 678 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 679 | extended double-precision floating-point value, returning the result.
 680 *----------------------------------------------------------------------------*/
 681
 682 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 683 {
 684     floatx80 z;
 685
 686     z.low = zSig;
 687     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 688     return z;
 689
 690 }
 691
 692 /*----------------------------------------------------------------------------
 693 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 694 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 695 | and returns the proper extended double-precision floating-point value
 696 | corresponding to the abstract input.  Ordinarily, the abstract value is
 697 | rounded and packed into the extended double-precision format, with the
 698 | inexact exception raised if the abstract input cannot be represented
 699 | exactly.  However, if the abstract value is too large, the overflow and
 700 | inexact exceptions are raised and an infinity or maximal finite value is
 701 | returned.  If the abstract value is too small, the input value is rounded to
 702 | a subnormal number, and the underflow and inexact exceptions are raised if
 703 | the abstract input cannot be represented exactly as a subnormal extended
 704 | double-precision floating-point number.
 705 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 706 | number of bits as single or double precision, respectively.  Otherwise, the
 707 | result is rounded to the full precision of the extended double-precision
 708 | format.
 709 |     The input significand must be normalized or smaller.  If the input
 710 | significand is not normalized, `zExp' must be 0; in that case, the result
 711 | returned is a subnormal number, and it must not require rounding.  The
 712 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 713 | Floating-Point Arithmetic.
 714 *----------------------------------------------------------------------------*/
 715
static floatx80
 roundAndPackFloatx80(
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 STATUS_PARAM)
{
    int8 roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64 roundIncrement, roundMask, roundBits;

    roundingMode = STATUS(float_rounding_mode);
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Only precisions 64 and 32 take the reduced-precision path that follows;
     * 80 and every other value are handled at the `precision80' label. */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        /* The low 11 bits of zSig0 are rounded away; 0x400 is half an ulp. */
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        /* The low 40 bits of zSig0 are rounded away; bit 39 is half an ulp. */
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        goto precision80;
    }
    /* Fold any nonzero bits of zSig1 into bit 0 of zSig0 as a sticky bit. */
    zSig0 |= ( zSig1 != 0 );
    /* The nearest modes keep the half-ulp increment.  Directed modes either
     * add nothing or add the full mask so any discarded bit carries upward. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* The unsigned compare is true when zExp <= 0 (zExp - 1 wraps to a large
     * value) or when zExp >= 0x7FFE. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /* Overflow: exponent too large, or at 0x7FFE with the increment
         * carrying out of zSig0 (detected by unsigned wraparound). */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            /* Shared handler in the full-precision section below; roundMask
             * is used there to build the largest finite significand. */
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloatx80(zSign, 0, 0);
            }
            /* The last term is true when adding the increment does not wrap
             * around, i.e. rounding does not carry out of zSig0. */
            isTiny =
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            /* Recompute the discarded bits after the denormalizing shift. */
            roundBits = zSig0 & roundMask;
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
            zSig0 += roundIncrement;
            /* Bit 63 set after rounding: the result is packed with exponent 1
             * instead of 0. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /* roundIncrement now holds the value of the lowest kept bit. */
            roundIncrement = roundMask + 1;
            /* Exact tie in nearest-even mode: widen the mask so the lowest
             * kept bit is cleared as well. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    zSig0 += roundIncrement;
    /* A sum smaller than the addend means the addition wrapped: carry out. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    /* roundIncrement now holds the value of the lowest kept bit. */
    roundIncrement = roundMask + 1;
    /* Exact tie in nearest-even mode: also clear the lowest kept bit. */
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full precision: zSig1 holds the bits below the 64-bit significand.  Its
     * top bit is the half-ulp bit used by the two nearest modes. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    /* Same range test as above: zExp <= 0 or zExp >= 0x7FFE. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* With roundMask 0, ~roundMask below is an all-ones significand. */
            roundMask = 0;
 overflow:
            /* Also reached by `goto' from the reduced-precision path, with
             * roundMask still holding that path's mask. */
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
            /* Modes that round toward zero for this sign return the largest
             * finite value; all others return infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( zExp <= 0 ) {
            /* NOTE(review): unlike the reduced-precision path above, this
             * path does not check STATUS(flush_to_zero) -- confirm that this
             * is intentional. */
            isTiny =
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
            /* The shift changed zSig1, so the increment decision is redone. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* zSig1<<1 == 0 means nothing lies below the half-ulp bit; in
                 * nearest-even mode the low bit of zSig0 is then cleared. */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Bit 63 set after rounding: pack with exponent 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    if ( increment ) {
        ++zSig0;
        /* Wrapped to zero: the increment carried out of the significand. */
        if ( zSig0 == 0 ) {
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Same nearest-even low-bit clearing as in the subnormal case. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
 889
 890 /*----------------------------------------------------------------------------
 891 | Takes an abstract floating-point value having sign `zSign', exponent
 892 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 893 | and returns the proper extended double-precision floating-point value
 894 | corresponding to the abstract input.  This routine is just like
 895 | `roundAndPackFloatx80' except that the input significand does not have to be
 896 | normalized.
 897 *----------------------------------------------------------------------------*/
 898
 899 static floatx80
 900  normalizeRoundAndPackFloatx80(
 901      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 902  STATUS_PARAM)
 903 {
 904     int8 shiftCount;
 905
 906     if ( zSig0 == 0 ) {
 907         zSig0 = zSig1;
 908         zSig1 = 0;
 909         zExp -= 64;
 910     }
 911     shiftCount = countLeadingZeros64( zSig0 );
 912     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 913     zExp -= shiftCount;
 914     return
 915         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 916
 917 }
 918
 919 /*----------------------------------------------------------------------------
 920 | Returns the least-significant 64 fraction bits of the quadruple-precision
 921 | floating-point value `a'.
 922 *----------------------------------------------------------------------------*/
 923
 924 INLINE uint64_t extractFloat128Frac1( float128 a )
 925 {
 926
 927     return a.low;
 928
 929 }
 930
 931 /*----------------------------------------------------------------------------
 932 | Returns the most-significant 48 fraction bits of the quadruple-precision
 933 | floating-point value `a'.
 934 *----------------------------------------------------------------------------*/
 935
 936 INLINE uint64_t extractFloat128Frac0( float128 a )
 937 {
 938
 939     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 940
 941 }
 942
 943 /*----------------------------------------------------------------------------
 944 | Returns the exponent bits of the quadruple-precision floating-point value
 945 | `a'.
 946 *----------------------------------------------------------------------------*/
 947
 948 INLINE int32 extractFloat128Exp( float128 a )
 949 {
 950
 951     return ( a.high>>48 ) & 0x7FFF;
 952
 953 }
 954
 955 /*----------------------------------------------------------------------------
 956 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 957 *----------------------------------------------------------------------------*/
 958
 959 INLINE flag extractFloat128Sign( float128 a )
 960 {
 961
 962     return a.high>>63;
 963
 964 }
 965
 966 /*----------------------------------------------------------------------------
 967 | Normalizes the subnormal quadruple-precision floating-point value
 968 | represented by the denormalized significand formed by the concatenation of
 969 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 970 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 971 | significand are stored at the location pointed to by `zSig0Ptr', and the
 972 | least significant 64 bits of the normalized significand are stored at the
 973 | location pointed to by `zSig1Ptr'.
 974 *----------------------------------------------------------------------------*/
 975
 976 static void
 977  normalizeFloat128Subnormal(
 978      uint64_t aSig0,
 979      uint64_t aSig1,
 980      int32 *zExpPtr,
 981      uint64_t *zSig0Ptr,
 982      uint64_t *zSig1Ptr
 983  )
 984 {
 985     int8 shiftCount;
 986
 987     if ( aSig0 == 0 ) {
 988         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 989         if ( shiftCount < 0 ) {
 990             *zSig0Ptr = aSig1>>( - shiftCount );
 991             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 992         }
 993         else {
 994             *zSig0Ptr = aSig1<<shiftCount;
 995             *zSig1Ptr = 0;
 996         }
 997         *zExpPtr = - shiftCount - 63;
 998     }
 999     else {
1000         shiftCount = countLeadingZeros64( aSig0 ) - 15;
1001         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1002         *zExpPtr = 1 - shiftCount;
1003     }
1004
1005 }
1006
1007 /*----------------------------------------------------------------------------
1008 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1009 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1010 | floating-point value, returning the result.  After being shifted into the
1011 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1012 | added together to form the most significant 32 bits of the result.  This
1013 | means that any integer portion of `zSig0' will be added into the exponent.
1014 | Since a properly normalized significand will have an integer portion equal
1015 | to 1, the `zExp' input should be 1 less than the desired result exponent
1016 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1017 | significand.
1018 *----------------------------------------------------------------------------*/
1019
1020 INLINE float128
1021  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1022 {
1023     float128 z;
1024
1025     z.low = zSig1;
1026     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1027     return z;
1028
1029 }
1030
1031 /*----------------------------------------------------------------------------
1032 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1033 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1034 | and `zSig2', and returns the proper quadruple-precision floating-point value
1035 | corresponding to the abstract input.  Ordinarily, the abstract value is
1036 | simply rounded and packed into the quadruple-precision format, with the
1037 | inexact exception raised if the abstract input cannot be represented
1038 | exactly.  However, if the abstract value is too large, the overflow and
1039 | inexact exceptions are raised and an infinity or maximal finite value is
1040 | returned.  If the abstract value is too small, the input value is rounded to
1041 | a subnormal number, and the underflow and inexact exceptions are raised if
1042 | the abstract input cannot be represented exactly as a subnormal quadruple-
1043 | precision floating-point number.
1044 |     The input significand must be normalized or smaller.  If the input
1045 | significand is not normalized, `zExp' must be 0; in that case, the result
1046 | returned is a subnormal number, and it must not require rounding.  In the
1047 | usual case that the input significand is normalized, `zExp' must be 1 less
1048 | than the ``true'' floating-point exponent.  The handling of underflow and
1049 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1050 *----------------------------------------------------------------------------*/
1051
static float128
 roundAndPackFloat128(
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
{
    int8 roundingMode;
    flag roundNearestEven, increment, isTiny;

    roundingMode = STATUS(float_rounding_mode);
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the bits below the 128-bit significand; its top bit is the
     * half-ulp bit used by the two nearest modes.  The directed modes
     * increment whenever zSig2 is nonzero and the sign matches. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    default:
        abort();
    }
    /* The unsigned compare is true when zExp >= 0x7FFD or when zExp is
     * negative (a negative value converts to a large unsigned one). */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /* Overflow: exponent too large, or at 0x7FFD with an all-ones
         * significand that is about to be incremented. */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
            /* Modes that round toward zero for this sign return the largest
             * finite value; all others return infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* When tininess is not detected before rounding, the value is
             * still tiny unless zExp is -1, the increment is set, and the
             * significand is all ones. */
            isTiny =
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
            /* The shift changed zSig2, so the increment decision is redone.
             * Inexact is raised by the common code after this block. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* zSig2 + zSig2 == 0 means nothing lies below the half-ulp bit; in
         * nearest-even mode the low bit of zSig1 is then cleared. */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
1154
1155 /*----------------------------------------------------------------------------
1156 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1157 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1158 | returns the proper quadruple-precision floating-point value corresponding
1159 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1160 | except that the input significand has fewer bits and does not have to be
1161 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1162 | point exponent.
1163 *----------------------------------------------------------------------------*/
1164
1165 static float128
1166  normalizeRoundAndPackFloat128(
1167      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1168 {
1169     int8 shiftCount;
1170     uint64_t zSig2;
1171
1172     if ( zSig0 == 0 ) {
1173         zSig0 = zSig1;
1174         zSig1 = 0;
1175         zExp -= 64;
1176     }
1177     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1178     if ( 0 <= shiftCount ) {
1179         zSig2 = 0;
1180         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1181     }
1182     else {
1183         shift128ExtraRightJamming(
1184             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1185     }
1186     zExp -= shiftCount;
1187     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1188
1189 }
1190
1191 /*----------------------------------------------------------------------------
1192 | Returns the result of converting the 32-bit two's complement integer `a'
1193 | to the single-precision floating-point format.  The conversion is performed
1194 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1195 *----------------------------------------------------------------------------*/
1196
1197 float32 int32_to_float32(int32_t a STATUS_PARAM)
1198 {
1199     flag zSign;
1200
1201     if ( a == 0 ) return float32_zero;
1202     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1203     zSign = ( a < 0 );
1204     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1205
1206 }
1207
1208 /*----------------------------------------------------------------------------
1209 | Returns the result of converting the 32-bit two's complement integer `a'
1210 | to the double-precision floating-point format.  The conversion is performed
1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212 *----------------------------------------------------------------------------*/
1213
1214 float64 int32_to_float64(int32_t a STATUS_PARAM)
1215 {
1216     flag zSign;
1217     uint32 absA;
1218     int8 shiftCount;
1219     uint64_t zSig;
1220
1221     if ( a == 0 ) return float64_zero;
1222     zSign = ( a < 0 );
1223     absA = zSign ? - a : a;
1224     shiftCount = countLeadingZeros32( absA ) + 21;
1225     zSig = absA;
1226     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1227
1228 }
1229
1230 /*----------------------------------------------------------------------------
1231 | Returns the result of converting the 32-bit two's complement integer `a'
1232 | to the extended double-precision floating-point format.  The conversion
1233 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1234 | Arithmetic.
1235 *----------------------------------------------------------------------------*/
1236
1237 floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1238 {
1239     flag zSign;
1240     uint32 absA;
1241     int8 shiftCount;
1242     uint64_t zSig;
1243
1244     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1245     zSign = ( a < 0 );
1246     absA = zSign ? - a : a;
1247     shiftCount = countLeadingZeros32( absA ) + 32;
1248     zSig = absA;
1249     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1250
1251 }
1252
1253 /*----------------------------------------------------------------------------
1254 | Returns the result of converting the 32-bit two's complement integer `a' to
1255 | the quadruple-precision floating-point format.  The conversion is performed
1256 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257 *----------------------------------------------------------------------------*/
1258
1259 float128 int32_to_float128(int32_t a STATUS_PARAM)
1260 {
1261     flag zSign;
1262     uint32 absA;
1263     int8 shiftCount;
1264     uint64_t zSig0;
1265
1266     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1267     zSign = ( a < 0 );
1268     absA = zSign ? - a : a;
1269     shiftCount = countLeadingZeros32( absA ) + 17;
1270     zSig0 = absA;
1271     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1272
1273 }
1274
1275 /*----------------------------------------------------------------------------
1276 | Returns the result of converting the 64-bit two's complement integer `a'
1277 | to the single-precision floating-point format.  The conversion is performed
1278 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1279 *----------------------------------------------------------------------------*/
1280
1281 float32 int64_to_float32(int64_t a STATUS_PARAM)
1282 {
1283     flag zSign;
1284     uint64 absA;
1285     int8 shiftCount;
1286
1287     if ( a == 0 ) return float32_zero;
1288     zSign = ( a < 0 );
1289     absA = zSign ? - a : a;
1290     shiftCount = countLeadingZeros64( absA ) - 40;
1291     if ( 0 <= shiftCount ) {
1292         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1293     }
1294     else {
1295         shiftCount += 7;
1296         if ( shiftCount < 0 ) {
1297             shift64RightJamming( absA, - shiftCount, &absA );
1298         }
1299         else {
1300             absA <<= shiftCount;
1301         }
1302         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1303     }
1304
1305 }
1306
1307 float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1308 {
1309     int8 shiftCount;
1310
1311     if ( a == 0 ) return float32_zero;
1312     shiftCount = countLeadingZeros64( a ) - 40;
1313     if ( 0 <= shiftCount ) {
1314         return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1315     }
1316     else {
1317         shiftCount += 7;
1318         if ( shiftCount < 0 ) {
1319             shift64RightJamming( a, - shiftCount, &a );
1320         }
1321         else {
1322             a <<= shiftCount;
1323         }
1324         return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1325     }
1326 }
1327
1328 /*----------------------------------------------------------------------------
1329 | Returns the result of converting the 64-bit two's complement integer `a'
1330 | to the double-precision floating-point format.  The conversion is performed
1331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1332 *----------------------------------------------------------------------------*/
1333
1334 float64 int64_to_float64(int64_t a STATUS_PARAM)
1335 {
1336     flag zSign;
1337
1338     if ( a == 0 ) return float64_zero;
1339     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1340         return packFloat64( 1, 0x43E, 0 );
1341     }
1342     zSign = ( a < 0 );
1343     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1344
1345 }
1346
1347 float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1348 {
1349     int exp =  0x43C;
1350
1351     if (a == 0) {
1352         return float64_zero;
1353     }
1354     if ((int64_t)a < 0) {
1355         shift64RightJamming(a, 1, &a);
1356         exp += 1;
1357     }
1358     return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1359 }
1360
1361 /*----------------------------------------------------------------------------
1362 | Returns the result of converting the 64-bit two's complement integer `a'
1363 | to the extended double-precision floating-point format.  The conversion
1364 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1365 | Arithmetic.
1366 *----------------------------------------------------------------------------*/
1367
1368 floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1369 {
1370     flag zSign;
1371     uint64 absA;
1372     int8 shiftCount;
1373
1374     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1375     zSign = ( a < 0 );
1376     absA = zSign ? - a : a;
1377     shiftCount = countLeadingZeros64( absA );
1378     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1379
1380 }
1381
1382 /*----------------------------------------------------------------------------
1383 | Returns the result of converting the 64-bit two's complement integer `a' to
1384 | the quadruple-precision floating-point format.  The conversion is performed
1385 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1386 *----------------------------------------------------------------------------*/
1387
1388 float128 int64_to_float128(int64_t a STATUS_PARAM)
1389 {
1390     flag zSign;
1391     uint64 absA;
1392     int8 shiftCount;
1393     int32 zExp;
1394     uint64_t zSig0, zSig1;
1395
1396     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1397     zSign = ( a < 0 );
1398     absA = zSign ? - a : a;
1399     shiftCount = countLeadingZeros64( absA ) + 49;
1400     zExp = 0x406E - shiftCount;
1401     if ( 64 <= shiftCount ) {
1402         zSig1 = 0;
1403         zSig0 = absA;
1404         shiftCount -= 64;
1405     }
1406     else {
1407         zSig1 = absA;
1408         zSig0 = 0;
1409     }
1410     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1411     return packFloat128( zSign, zExp, zSig0, zSig1 );
1412
1413 }
1414
1415 float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1416 {
1417     if (a == 0) {
1418         return float128_zero;
1419     }
1420     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1421 }
1422
1423 /*----------------------------------------------------------------------------
1424 | Returns the result of converting the single-precision floating-point value
1425 | `a' to the 32-bit two's complement integer format.  The conversion is
1426 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1427 | Arithmetic---which means in particular that the conversion is rounded
1428 | according to the current rounding mode.  If `a' is a NaN, the largest
1429 | positive integer is returned.  Otherwise, if the conversion overflows, the
1430 | largest integer with the same sign as `a' is returned.
1431 *----------------------------------------------------------------------------*/
1432
1433 int32 float32_to_int32( float32 a STATUS_PARAM )
1434 {
1435     flag aSign;
1436     int_fast16_t aExp, shiftCount;
1437     uint32_t aSig;
1438     uint64_t aSig64;
1439
1440     a = float32_squash_input_denormal(a STATUS_VAR);
1441     aSig = extractFloat32Frac( a );
1442     aExp = extractFloat32Exp( a );
1443     aSign = extractFloat32Sign( a );
1444     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1445     if ( aExp ) aSig |= 0x00800000;
1446     shiftCount = 0xAF - aExp;
1447     aSig64 = aSig;
1448     aSig64 <<= 32;
1449     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1450     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1451
1452 }
1453
1454 /*----------------------------------------------------------------------------
1455 | Returns the result of converting the single-precision floating-point value
1456 | `a' to the 32-bit two's complement integer format.  The conversion is
1457 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1458 | Arithmetic, except that the conversion is always rounded toward zero.
1459 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1460 | the conversion overflows, the largest integer with the same sign as `a' is
1461 | returned.
1462 *----------------------------------------------------------------------------*/
1463
1464 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1465 {
1466     flag aSign;
1467     int_fast16_t aExp, shiftCount;
1468     uint32_t aSig;
1469     int32_t z;
1470     a = float32_squash_input_denormal(a STATUS_VAR);
1471
1472     aSig = extractFloat32Frac( a );
1473     aExp = extractFloat32Exp( a );
1474     aSign = extractFloat32Sign( a );
1475     shiftCount = aExp - 0x9E;
1476     if ( 0 <= shiftCount ) {
1477         if ( float32_val(a) != 0xCF000000 ) {
1478             float_raise( float_flag_invalid STATUS_VAR);
1479             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1480         }
1481         return (int32_t) 0x80000000;
1482     }
1483     else if ( aExp <= 0x7E ) {
1484         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1485         return 0;
1486     }
1487     aSig = ( aSig | 0x00800000 )<<8;
1488     z = aSig>>( - shiftCount );
1489     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1490         STATUS(float_exception_flags) |= float_flag_inexact;
1491     }
1492     if ( aSign ) z = - z;
1493     return z;
1494
1495 }
1496
1497 /*----------------------------------------------------------------------------
1498 | Returns the result of converting the single-precision floating-point value
1499 | `a' to the 16-bit two's complement integer format.  The conversion is
1500 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1501 | Arithmetic, except that the conversion is always rounded toward zero.
1502 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1503 | the conversion overflows, the largest integer with the same sign as `a' is
1504 | returned.
1505 *----------------------------------------------------------------------------*/
1506
1507 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1508 {
1509     flag aSign;
1510     int_fast16_t aExp, shiftCount;
1511     uint32_t aSig;
1512     int32 z;
1513
1514     aSig = extractFloat32Frac( a );
1515     aExp = extractFloat32Exp( a );
1516     aSign = extractFloat32Sign( a );
1517     shiftCount = aExp - 0x8E;
1518     if ( 0 <= shiftCount ) {
1519         if ( float32_val(a) != 0xC7000000 ) {
1520             float_raise( float_flag_invalid STATUS_VAR);
1521             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1522                 return 0x7FFF;
1523             }
1524         }
1525         return (int32_t) 0xffff8000;
1526     }
1527     else if ( aExp <= 0x7E ) {
1528         if ( aExp | aSig ) {
1529             STATUS(float_exception_flags) |= float_flag_inexact;
1530         }
1531         return 0;
1532     }
1533     shiftCount -= 0x10;
1534     aSig = ( aSig | 0x00800000 )<<8;
1535     z = aSig>>( - shiftCount );
1536     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1537         STATUS(float_exception_flags) |= float_flag_inexact;
1538     }
1539     if ( aSign ) {
1540         z = - z;
1541     }
1542     return z;
1543
1544 }
1545
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the 64-bit two's complement integer format.  The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1550 | Arithmetic---which means in particular that the conversion is rounded
1551 | according to the current rounding mode.  If `a' is a NaN, the largest
1552 | positive integer is returned.  Otherwise, if the conversion overflows, the
1553 | largest integer with the same sign as `a' is returned.
1554 *----------------------------------------------------------------------------*/
1555
1556 int64 float32_to_int64( float32 a STATUS_PARAM )
1557 {
1558     flag aSign;
1559     int_fast16_t aExp, shiftCount;
1560     uint32_t aSig;
1561     uint64_t aSig64, aSigExtra;
1562     a = float32_squash_input_denormal(a STATUS_VAR);
1563
1564     aSig = extractFloat32Frac( a );
1565     aExp = extractFloat32Exp( a );
1566     aSign = extractFloat32Sign( a );
1567     shiftCount = 0xBE - aExp;
1568     if ( shiftCount < 0 ) {
1569         float_raise( float_flag_invalid STATUS_VAR);
1570         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1571             return LIT64( 0x7FFFFFFFFFFFFFFF );
1572         }
1573         return (int64_t) LIT64( 0x8000000000000000 );
1574     }
1575     if ( aExp ) aSig |= 0x00800000;
1576     aSig64 = aSig;
1577     aSig64 <<= 40;
1578     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1579     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1580
1581 }
1582
1583 /*----------------------------------------------------------------------------
1584 | Returns the result of converting the single-precision floating-point value
1585 | `a' to the 64-bit unsigned integer format.  The conversion is
1586 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1587 | Arithmetic---which means in particular that the conversion is rounded
1588 | according to the current rounding mode.  If `a' is a NaN, the largest
1589 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1590 | largest unsigned integer is returned.  If the 'a' is negative, the result
1591 | is rounded and zero is returned; values that do not round to zero will
1592 | raise the inexact exception flag.
1593 *----------------------------------------------------------------------------*/
1594
1595 uint64 float32_to_uint64(float32 a STATUS_PARAM)
1596 {
1597     flag aSign;
1598     int_fast16_t aExp, shiftCount;
1599     uint32_t aSig;
1600     uint64_t aSig64, aSigExtra;
1601     a = float32_squash_input_denormal(a STATUS_VAR);
1602
1603     aSig = extractFloat32Frac(a);
1604     aExp = extractFloat32Exp(a);
1605     aSign = extractFloat32Sign(a);
1606     if ((aSign) && (aExp > 126)) {
1607         float_raise(float_flag_invalid STATUS_VAR);
1608         if (float32_is_any_nan(a)) {
1609             return LIT64(0xFFFFFFFFFFFFFFFF);
1610         } else {
1611             return 0;
1612         }
1613     }
1614     shiftCount = 0xBE - aExp;
1615     if (aExp) {
1616         aSig |= 0x00800000;
1617     }
1618     if (shiftCount < 0) {
1619         float_raise(float_flag_invalid STATUS_VAR);
1620         return LIT64(0xFFFFFFFFFFFFFFFF);
1621     }
1622
1623     aSig64 = aSig;
1624     aSig64 <<= 40;
1625     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1626     return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1627 }
1628
1629 /*----------------------------------------------------------------------------
1630 | Returns the result of converting the single-precision floating-point value
1631 | `a' to the 64-bit two's complement integer format.  The conversion is
1632 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1633 | Arithmetic, except that the conversion is always rounded toward zero.  If
1634 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1635 | conversion overflows, the largest integer with the same sign as `a' is
1636 | returned.
1637 *----------------------------------------------------------------------------*/
1638
1639 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1640 {
1641     flag aSign;
1642     int_fast16_t aExp, shiftCount;
1643     uint32_t aSig;
1644     uint64_t aSig64;
1645     int64 z;
1646     a = float32_squash_input_denormal(a STATUS_VAR);
1647
1648     aSig = extractFloat32Frac( a );
1649     aExp = extractFloat32Exp( a );
1650     aSign = extractFloat32Sign( a );
1651     shiftCount = aExp - 0xBE;
1652     if ( 0 <= shiftCount ) {
1653         if ( float32_val(a) != 0xDF000000 ) {
1654             float_raise( float_flag_invalid STATUS_VAR);
1655             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1656                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1657             }
1658         }
1659         return (int64_t) LIT64( 0x8000000000000000 );
1660     }
1661     else if ( aExp <= 0x7E ) {
1662         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1663         return 0;
1664     }
1665     aSig64 = aSig | 0x00800000;
1666     aSig64 <<= 40;
1667     z = aSig64>>( - shiftCount );
1668     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1669         STATUS(float_exception_flags) |= float_flag_inexact;
1670     }
1671     if ( aSign ) z = - z;
1672     return z;
1673
1674 }
1675
1676 /*----------------------------------------------------------------------------
1677 | Returns the result of converting the single-precision floating-point value
1678 | `a' to the double-precision floating-point format.  The conversion is
1679 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1680 | Arithmetic.
1681 *----------------------------------------------------------------------------*/
1682
1683 float64 float32_to_float64( float32 a STATUS_PARAM )
1684 {
1685     flag aSign;
1686     int_fast16_t aExp;
1687     uint32_t aSig;
1688     a = float32_squash_input_denormal(a STATUS_VAR);
1689
1690     aSig = extractFloat32Frac( a );
1691     aExp = extractFloat32Exp( a );
1692     aSign = extractFloat32Sign( a );
1693     if ( aExp == 0xFF ) {
1694         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1695         return packFloat64( aSign, 0x7FF, 0 );
1696     }
1697     if ( aExp == 0 ) {
1698         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1699         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1700         --aExp;
1701     }
1702     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1703
1704 }
1705
1706 /*----------------------------------------------------------------------------
1707 | Returns the result of converting the single-precision floating-point value
1708 | `a' to the extended double-precision floating-point format.  The conversion
1709 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1710 | Arithmetic.
1711 *----------------------------------------------------------------------------*/
1712
1713 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1714 {
1715     flag aSign;
1716     int_fast16_t aExp;
1717     uint32_t aSig;
1718
1719     a = float32_squash_input_denormal(a STATUS_VAR);
1720     aSig = extractFloat32Frac( a );
1721     aExp = extractFloat32Exp( a );
1722     aSign = extractFloat32Sign( a );
1723     if ( aExp == 0xFF ) {
1724         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1725         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1726     }
1727     if ( aExp == 0 ) {
1728         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1729         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1730     }
1731     aSig |= 0x00800000;
1732     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1733
1734 }
1735
1736 /*----------------------------------------------------------------------------
1737 | Returns the result of converting the single-precision floating-point value
1738 | `a' to the double-precision floating-point format.  The conversion is
1739 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1740 | Arithmetic.
1741 *----------------------------------------------------------------------------*/
1742
1743 float128 float32_to_float128( float32 a STATUS_PARAM )
1744 {
1745     flag aSign;
1746     int_fast16_t aExp;
1747     uint32_t aSig;
1748
1749     a = float32_squash_input_denormal(a STATUS_VAR);
1750     aSig = extractFloat32Frac( a );
1751     aExp = extractFloat32Exp( a );
1752     aSign = extractFloat32Sign( a );
1753     if ( aExp == 0xFF ) {
1754         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1755         return packFloat128( aSign, 0x7FFF, 0, 0 );
1756     }
1757     if ( aExp == 0 ) {
1758         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1759         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1760         --aExp;
1761     }
1762     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1763
1764 }
1765
1766 /*----------------------------------------------------------------------------
1767 | Rounds the single-precision floating-point value `a' to an integer, and
1768 | returns the result as a single-precision floating-point value.  The
1769 | operation is performed according to the IEC/IEEE Standard for Binary
1770 | Floating-Point Arithmetic.
1771 *----------------------------------------------------------------------------*/
1772
1773 float32 float32_round_to_int( float32 a STATUS_PARAM)
1774 {
1775     flag aSign;
1776     int_fast16_t aExp;
1777     uint32_t lastBitMask, roundBitsMask;
1778     uint32_t z;
1779     a = float32_squash_input_denormal(a STATUS_VAR);
1780
1781     aExp = extractFloat32Exp( a );
1782     if ( 0x96 <= aExp ) {
1783         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1784             return propagateFloat32NaN( a, a STATUS_VAR );
1785         }
1786         return a;
1787     }
1788     if ( aExp <= 0x7E ) {
1789         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1790         STATUS(float_exception_flags) |= float_flag_inexact;
1791         aSign = extractFloat32Sign( a );
1792         switch ( STATUS(float_rounding_mode) ) {
1793          case float_round_nearest_even:
1794             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1795                 return packFloat32( aSign, 0x7F, 0 );
1796             }
1797             break;
1798         case float_round_ties_away:
1799             if (aExp == 0x7E) {
1800                 return packFloat32(aSign, 0x7F, 0);
1801             }
1802             break;
1803          case float_round_down:
1804             return make_float32(aSign ? 0xBF800000 : 0);
1805          case float_round_up:
1806             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1807         }
1808         return packFloat32( aSign, 0, 0 );
1809     }
1810     lastBitMask = 1;
1811     lastBitMask <<= 0x96 - aExp;
1812     roundBitsMask = lastBitMask - 1;
1813     z = float32_val(a);
1814     switch (STATUS(float_rounding_mode)) {
1815     case float_round_nearest_even:
1816         z += lastBitMask>>1;
1817         if ((z & roundBitsMask) == 0) {
1818             z &= ~lastBitMask;
1819         }
1820         break;
1821     case float_round_ties_away:
1822         z += lastBitMask >> 1;
1823         break;
1824     case float_round_to_zero:
1825         break;
1826     case float_round_up:
1827         if (!extractFloat32Sign(make_float32(z))) {
1828             z += roundBitsMask;
1829         }
1830         break;
1831     case float_round_down:
1832         if (extractFloat32Sign(make_float32(z))) {
1833             z += roundBitsMask;
1834         }
1835         break;
1836     default:
1837         abort();
1838     }
1839     z &= ~ roundBitsMask;
1840     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1841     return make_float32(z);
1842
1843 }
1844
1845 /*----------------------------------------------------------------------------
1846 | Returns the result of adding the absolute values of the single-precision
1847 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1848 | before being returned.  `zSign' is ignored if the result is a NaN.
1849 | The addition is performed according to the IEC/IEEE Standard for Binary
1850 | Floating-Point Arithmetic.
1851 *----------------------------------------------------------------------------*/
1852
1853 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1854 {
1855     int_fast16_t aExp, bExp, zExp;
1856     uint32_t aSig, bSig, zSig;
1857     int_fast16_t expDiff;
1858
1859     aSig = extractFloat32Frac( a );
1860     aExp = extractFloat32Exp( a );
1861     bSig = extractFloat32Frac( b );
1862     bExp = extractFloat32Exp( b );
1863     expDiff = aExp - bExp;
1864     aSig <<= 6;
1865     bSig <<= 6;
1866     if ( 0 < expDiff ) {
1867         if ( aExp == 0xFF ) {
1868             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1869             return a;
1870         }
1871         if ( bExp == 0 ) {
1872             --expDiff;
1873         }
1874         else {
1875             bSig |= 0x20000000;
1876         }
1877         shift32RightJamming( bSig, expDiff, &bSig );
1878         zExp = aExp;
1879     }
1880     else if ( expDiff < 0 ) {
1881         if ( bExp == 0xFF ) {
1882             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1883             return packFloat32( zSign, 0xFF, 0 );
1884         }
1885         if ( aExp == 0 ) {
1886             ++expDiff;
1887         }
1888         else {
1889             aSig |= 0x20000000;
1890         }
1891         shift32RightJamming( aSig, - expDiff, &aSig );
1892         zExp = bExp;
1893     }
1894     else {
1895         if ( aExp == 0xFF ) {
1896             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1897             return a;
1898         }
1899         if ( aExp == 0 ) {
1900             if (STATUS(flush_to_zero)) {
1901                 if (aSig | bSig) {
1902                     float_raise(float_flag_output_denormal STATUS_VAR);
1903                 }
1904                 return packFloat32(zSign, 0, 0);
1905             }
1906             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1907         }
1908         zSig = 0x40000000 + aSig + bSig;
1909         zExp = aExp;
1910         goto roundAndPack;
1911     }
1912     aSig |= 0x20000000;
1913     zSig = ( aSig + bSig )<<1;
1914     --zExp;
1915     if ( (int32_t) zSig < 0 ) {
1916         zSig = aSig + bSig;
1917         ++zExp;
1918     }
1919  roundAndPack:
1920     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1921
1922 }
1923
1924 /*----------------------------------------------------------------------------
1925 | Returns the result of subtracting the absolute values of the single-
1926 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1927 | difference is negated before being returned.  `zSign' is ignored if the
1928 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1929 | Standard for Binary Floating-Point Arithmetic.
1930 *----------------------------------------------------------------------------*/
1931
1932 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1933 {
1934     int_fast16_t aExp, bExp, zExp;
1935     uint32_t aSig, bSig, zSig;
1936     int_fast16_t expDiff;
1937
1938     aSig = extractFloat32Frac( a );
1939     aExp = extractFloat32Exp( a );
1940     bSig = extractFloat32Frac( b );
1941     bExp = extractFloat32Exp( b );
1942     expDiff = aExp - bExp;
1943     aSig <<= 7;
1944     bSig <<= 7;
1945     if ( 0 < expDiff ) goto aExpBigger;
1946     if ( expDiff < 0 ) goto bExpBigger;
1947     if ( aExp == 0xFF ) {
1948         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1949         float_raise( float_flag_invalid STATUS_VAR);
1950         return float32_default_nan;
1951     }
1952     if ( aExp == 0 ) {
1953         aExp = 1;
1954         bExp = 1;
1955     }
1956     if ( bSig < aSig ) goto aBigger;
1957     if ( aSig < bSig ) goto bBigger;
1958     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1959  bExpBigger:
1960     if ( bExp == 0xFF ) {
1961         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1962         return packFloat32( zSign ^ 1, 0xFF, 0 );
1963     }
1964     if ( aExp == 0 ) {
1965         ++expDiff;
1966     }
1967     else {
1968         aSig |= 0x40000000;
1969     }
1970     shift32RightJamming( aSig, - expDiff, &aSig );
1971     bSig |= 0x40000000;
1972  bBigger:
1973     zSig = bSig - aSig;
1974     zExp = bExp;
1975     zSign ^= 1;
1976     goto normalizeRoundAndPack;
1977  aExpBigger:
1978     if ( aExp == 0xFF ) {
1979         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1980         return a;
1981     }
1982     if ( bExp == 0 ) {
1983         --expDiff;
1984     }
1985     else {
1986         bSig |= 0x40000000;
1987     }
1988     shift32RightJamming( bSig, expDiff, &bSig );
1989     aSig |= 0x40000000;
1990  aBigger:
1991     zSig = aSig - bSig;
1992     zExp = aExp;
1993  normalizeRoundAndPack:
1994     --zExp;
1995     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1996
1997 }
1998
1999 /*----------------------------------------------------------------------------
2000 | Returns the result of adding the single-precision floating-point values `a'
2001 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2002 | Binary Floating-Point Arithmetic.
2003 *----------------------------------------------------------------------------*/
2004
2005 float32 float32_add( float32 a, float32 b STATUS_PARAM )
2006 {
2007     flag aSign, bSign;
2008     a = float32_squash_input_denormal(a STATUS_VAR);
2009     b = float32_squash_input_denormal(b STATUS_VAR);
2010
2011     aSign = extractFloat32Sign( a );
2012     bSign = extractFloat32Sign( b );
2013     if ( aSign == bSign ) {
2014         return addFloat32Sigs( a, b, aSign STATUS_VAR);
2015     }
2016     else {
2017         return subFloat32Sigs( a, b, aSign STATUS_VAR );
2018     }
2019
2020 }
2021
2022 /*----------------------------------------------------------------------------
2023 | Returns the result of subtracting the single-precision floating-point values
2024 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2025 | for Binary Floating-Point Arithmetic.
2026 *----------------------------------------------------------------------------*/
2027
2028 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
2029 {
2030     flag aSign, bSign;
2031     a = float32_squash_input_denormal(a STATUS_VAR);
2032     b = float32_squash_input_denormal(b STATUS_VAR);
2033
2034     aSign = extractFloat32Sign( a );
2035     bSign = extractFloat32Sign( b );
2036     if ( aSign == bSign ) {
2037         return subFloat32Sigs( a, b, aSign STATUS_VAR );
2038     }
2039     else {
2040         return addFloat32Sigs( a, b, aSign STATUS_VAR );
2041     }
2042
2043 }
2044
2045 /*----------------------------------------------------------------------------
2046 | Returns the result of multiplying the single-precision floating-point values
2047 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2048 | for Binary Floating-Point Arithmetic.
2049 *----------------------------------------------------------------------------*/
2050
2051 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
2052 {
2053     flag aSign, bSign, zSign;
2054     int_fast16_t aExp, bExp, zExp;
2055     uint32_t aSig, bSig;
2056     uint64_t zSig64;
2057     uint32_t zSig;
2058
2059     a = float32_squash_input_denormal(a STATUS_VAR);
2060     b = float32_squash_input_denormal(b STATUS_VAR);
2061
2062     aSig = extractFloat32Frac( a );
2063     aExp = extractFloat32Exp( a );
2064     aSign = extractFloat32Sign( a );
2065     bSig = extractFloat32Frac( b );
2066     bExp = extractFloat32Exp( b );
2067     bSign = extractFloat32Sign( b );
2068     zSign = aSign ^ bSign;
2069     if ( aExp == 0xFF ) {
2070         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2071             return propagateFloat32NaN( a, b STATUS_VAR );
2072         }
2073         if ( ( bExp | bSig ) == 0 ) {
2074             float_raise( float_flag_invalid STATUS_VAR);
2075             return float32_default_nan;
2076         }
2077         return packFloat32( zSign, 0xFF, 0 );
2078     }
2079     if ( bExp == 0xFF ) {
2080         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2081         if ( ( aExp | aSig ) == 0 ) {
2082             float_raise( float_flag_invalid STATUS_VAR);
2083             return float32_default_nan;
2084         }
2085         return packFloat32( zSign, 0xFF, 0 );
2086     }
2087     if ( aExp == 0 ) {
2088         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2089         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2090     }
2091     if ( bExp == 0 ) {
2092         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2093         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2094     }
2095     zExp = aExp + bExp - 0x7F;
2096     aSig = ( aSig | 0x00800000 )<<7;
2097     bSig = ( bSig | 0x00800000 )<<8;
2098     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2099     zSig = zSig64;
2100     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2101         zSig <<= 1;
2102         --zExp;
2103     }
2104     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2105
2106 }
2107
2108 /*----------------------------------------------------------------------------
2109 | Returns the result of dividing the single-precision floating-point value `a'
2110 | by the corresponding value `b'.  The operation is performed according to the
2111 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2112 *----------------------------------------------------------------------------*/
2113
2114 float32 float32_div( float32 a, float32 b STATUS_PARAM )
2115 {
2116     flag aSign, bSign, zSign;
2117     int_fast16_t aExp, bExp, zExp;
2118     uint32_t aSig, bSig, zSig;
2119     a = float32_squash_input_denormal(a STATUS_VAR);
2120     b = float32_squash_input_denormal(b STATUS_VAR);
2121
2122     aSig = extractFloat32Frac( a );
2123     aExp = extractFloat32Exp( a );
2124     aSign = extractFloat32Sign( a );
2125     bSig = extractFloat32Frac( b );
2126     bExp = extractFloat32Exp( b );
2127     bSign = extractFloat32Sign( b );
2128     zSign = aSign ^ bSign;
2129     if ( aExp == 0xFF ) {
2130         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2131         if ( bExp == 0xFF ) {
2132             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2133             float_raise( float_flag_invalid STATUS_VAR);
2134             return float32_default_nan;
2135         }
2136         return packFloat32( zSign, 0xFF, 0 );
2137     }
2138     if ( bExp == 0xFF ) {
2139         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2140         return packFloat32( zSign, 0, 0 );
2141     }
2142     if ( bExp == 0 ) {
2143         if ( bSig == 0 ) {
2144             if ( ( aExp | aSig ) == 0 ) {
2145                 float_raise( float_flag_invalid STATUS_VAR);
2146                 return float32_default_nan;
2147             }
2148             float_raise( float_flag_divbyzero STATUS_VAR);
2149             return packFloat32( zSign, 0xFF, 0 );
2150         }
2151         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2152     }
2153     if ( aExp == 0 ) {
2154         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2155         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2156     }
2157     zExp = aExp - bExp + 0x7D;
2158     aSig = ( aSig | 0x00800000 )<<7;
2159     bSig = ( bSig | 0x00800000 )<<8;
2160     if ( bSig <= ( aSig + aSig ) ) {
2161         aSig >>= 1;
2162         ++zExp;
2163     }
2164     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2165     if ( ( zSig & 0x3F ) == 0 ) {
2166         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2167     }
2168     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2169
2170 }
2171
2172 /*----------------------------------------------------------------------------
2173 | Returns the remainder of the single-precision floating-point value `a'
2174 | with respect to the corresponding value `b'.  The operation is performed
2175 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2176 *----------------------------------------------------------------------------*/
2177
2178 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2179 {
2180     flag aSign, zSign;
2181     int_fast16_t aExp, bExp, expDiff;
2182     uint32_t aSig, bSig;
2183     uint32_t q;
2184     uint64_t aSig64, bSig64, q64;
2185     uint32_t alternateASig;
2186     int32_t sigMean;
2187     a = float32_squash_input_denormal(a STATUS_VAR);
2188     b = float32_squash_input_denormal(b STATUS_VAR);
2189
2190     aSig = extractFloat32Frac( a );
2191     aExp = extractFloat32Exp( a );
2192     aSign = extractFloat32Sign( a );
2193     bSig = extractFloat32Frac( b );
2194     bExp = extractFloat32Exp( b );
2195     if ( aExp == 0xFF ) {
2196         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2197             return propagateFloat32NaN( a, b STATUS_VAR );
2198         }
2199         float_raise( float_flag_invalid STATUS_VAR);
2200         return float32_default_nan;
2201     }
2202     if ( bExp == 0xFF ) {
2203         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2204         return a;
2205     }
2206     if ( bExp == 0 ) {
2207         if ( bSig == 0 ) {
2208             float_raise( float_flag_invalid STATUS_VAR);
2209             return float32_default_nan;
2210         }
2211         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2212     }
2213     if ( aExp == 0 ) {
2214         if ( aSig == 0 ) return a;
2215         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2216     }
2217     expDiff = aExp - bExp;
2218     aSig |= 0x00800000;
2219     bSig |= 0x00800000;
2220     if ( expDiff < 32 ) {
2221         aSig <<= 8;
2222         bSig <<= 8;
2223         if ( expDiff < 0 ) {
2224             if ( expDiff < -1 ) return a;
2225             aSig >>= 1;
2226         }
2227         q = ( bSig <= aSig );
2228         if ( q ) aSig -= bSig;
2229         if ( 0 < expDiff ) {
2230             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2231             q >>= 32 - expDiff;
2232             bSig >>= 2;
2233             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2234         }
2235         else {
2236             aSig >>= 2;
2237             bSig >>= 2;
2238         }
2239     }
2240     else {
2241         if ( bSig <= aSig ) aSig -= bSig;
2242         aSig64 = ( (uint64_t) aSig )<<40;
2243         bSig64 = ( (uint64_t) bSig )<<40;
2244         expDiff -= 64;
2245         while ( 0 < expDiff ) {
2246             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2247             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2248             aSig64 = - ( ( bSig * q64 )<<38 );
2249             expDiff -= 62;
2250         }
2251         expDiff += 64;
2252         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2253         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2254         q = q64>>( 64 - expDiff );
2255         bSig <<= 6;
2256         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2257     }
2258     do {
2259         alternateASig = aSig;
2260         ++q;
2261         aSig -= bSig;
2262     } while ( 0 <= (int32_t) aSig );
2263     sigMean = aSig + alternateASig;
2264     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2265         aSig = alternateASig;
2266     }
2267     zSign = ( (int32_t) aSig < 0 );
2268     if ( zSign ) aSig = - aSig;
2269     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2270
2271 }
2272
2273 /*----------------------------------------------------------------------------
2274 | Returns the result of multiplying the single-precision floating-point values
2275 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2276 | multiplication.  The operation is performed according to the IEC/IEEE
2277 | Standard for Binary Floating-Point Arithmetic 754-2008.
2278 | The flags argument allows the caller to select negation of the
2279 | addend, the intermediate product, or the final result. (The difference
2280 | between this and having the caller do a separate negation is that negating
2281 | externally will flip the sign bit on NaNs.)
2282 *----------------------------------------------------------------------------*/
2283
2284 float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2285 {
         /* Fused multiply-add: computes (a * b) + c.  The product is kept as a
          * 64-bit significand and every arithmetic path ends in exactly one
          * roundAndPackFloat32() call, so the product itself is never rounded.
          * `flags' is a bit mask; the bits tested below are
          * float_muladd_negate_c, float_muladd_negate_product,
          * float_muladd_negate_result and float_muladd_halve_result.
          *
          * Naming: a*, b*, c* are the decoded operands, p* describes the
          * intermediate product a * b, and z* describes the final result.
          */
2286     flag aSign, bSign, cSign, zSign;
2287     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2288     uint32_t aSig, bSig, cSig;
2289     flag pInf, pZero, pSign;
2290     uint64_t pSig64, cSig64, zSig64;
2291     uint32_t pSig;
2292     int shiftcount;
2293     flag signflip, infzero;
2294
         /* Each operand goes through the input-denormal squash helper before
          * its fields are decoded (helper is defined elsewhere in the file).
          */
2295     a = float32_squash_input_denormal(a STATUS_VAR);
2296     b = float32_squash_input_denormal(b STATUS_VAR);
2297     c = float32_squash_input_denormal(c STATUS_VAR);
2298     aSig = extractFloat32Frac(a);
2299     aExp = extractFloat32Exp(a);
2300     aSign = extractFloat32Sign(a);
2301     bSig = extractFloat32Frac(b);
2302     bExp = extractFloat32Exp(b);
2303     bSign = extractFloat32Sign(b);
2304     cSig = extractFloat32Frac(c);
2305     cExp = extractFloat32Exp(c);
2306     cSign = extractFloat32Sign(c);
2307
         /* infzero: the product is (zero * infinity) in either operand order */
2308     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2309                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2310
2311     /* It is implementation-defined whether the cases of (0,inf,qnan)
2312      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2313      * they return if they do), so we have to hand this information
2314      * off to the target-specific pick-a-NaN routine.
2315      */
2316     if (((aExp == 0xff) && aSig) ||
2317         ((bExp == 0xff) && bSig) ||
2318         ((cExp == 0xff) && cSig)) {
2319         return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2320     }
2321
         /* No operand is a NaN from here on.  zero * infinity is invalid. */
2322     if (infzero) {
2323         float_raise(float_flag_invalid STATUS_VAR);
2324         return float32_default_nan;
2325     }
2326
         /* negate_c flips the sign of the addend before anything else uses it */
2327     if (flags & float_muladd_negate_c) {
2328         cSign ^= 1;
2329     }
2330
         /* negate_result is remembered here and XORed into the sign of every
          * value returned below.
          */
2331     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2332
2333     /* Work out the sign and type of the product */
2334     pSign = aSign ^ bSign;
2335     if (flags & float_muladd_negate_product) {
2336         pSign ^= 1;
2337     }
         /* With NaNs excluded, an all-ones exponent means infinity */
2338     pInf = (aExp == 0xff) || (bExp == 0xff);
2339     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2340
         /* c is an infinity */
2341     if (cExp == 0xff) {
2342         if (pInf && (pSign ^ cSign)) {
2343             /* addition of opposite-signed infinities => InvalidOperation */
2344             float_raise(float_flag_invalid STATUS_VAR);
2345             return float32_default_nan;
2346         }
2347         /* Otherwise generate an infinity of the same sign */
2348         return packFloat32(cSign ^ signflip, 0xff, 0);
2349     }
2350
         /* Infinite product plus finite c: infinity with the product's sign */
2351     if (pInf) {
2352         return packFloat32(pSign ^ signflip, 0xff, 0);
2353     }
2354
2355     if (pZero) {
2356         if (cExp == 0) {
2357             if (cSig == 0) {
2358                 /* Adding two exact zeroes */
2359                 if (pSign == cSign) {
2360                     zSign = pSign;
2361                 } else if (STATUS(float_rounding_mode) == float_round_down) {
2362                     zSign = 1;
2363                 } else {
2364                     zSign = 0;
2365                 }
2366                 return packFloat32(zSign ^ signflip, 0, 0);
2367             }
2368             /* Exact zero plus a denorm */
2369             if (STATUS(flush_to_zero)) {
2370                 float_raise(float_flag_output_denormal STATUS_VAR);
2371                 return packFloat32(cSign ^ signflip, 0, 0);
2372             }
2373         }
2374         /* Zero plus something non-zero : just return the something */
2375         if (flags & float_muladd_halve_result) {
2376             if (cExp == 0) {
2377                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2378             }
2379             /* Subtract one to halve, and one again because roundAndPackFloat32
2380              * wants one less than the true exponent.
2381              */
2382             cExp -= 2;
2383             cSig = (cSig | 0x00800000) << 7;
2384             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig STATUS_VAR);
2385         }
2386         return packFloat32(cSign ^ signflip, cExp, cSig);
2387     }
2388
         /* Both factors are finite and non-zero here; a zero exponent field
          * therefore means a subnormal, which is normalized before the
          * implicit bit is ORed in below.
          */
2389     if (aExp == 0) {
2390         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2391     }
2392     if (bExp == 0) {
2393         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2394     }
2395
2396     /* Calculate the actual result a * b + c */
2397
2398     /* Multiply first; this is easy. */
2399     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2400      * because we want the true exponent, not the "one-less-than"
2401      * flavour that roundAndPackFloat32() takes.
2402      */
2403     pExp = aExp + bExp - 0x7e;
         /* Explicit bits land at bit 30 (a) and bit 31 (b), so the 64-bit
          * product has its leading one at bit 61 or bit 62.
          */
2404     aSig = (aSig | 0x00800000) << 7;
2405     bSig = (bSig | 0x00800000) << 8;
2406     pSig64 = (uint64_t)aSig * bSig;
         /* Bit 62 clear: shift up by one and compensate in the exponent */
2407     if ((int64_t)(pSig64 << 1) >= 0) {
2408         pSig64 <<= 1;
2409         pExp--;
2410     }
2411
         /* Start from the product's sign; the subtraction paths below flip
          * it when c has the larger magnitude.
          */
2412     zSign = pSign ^ signflip;
2413
2414     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2415      * position 62.
2416      */
2417     if (cExp == 0) {
2418         if (!cSig) {
2419             /* Throw out the special case of c being an exact zero now */
2420             shift64RightJamming(pSig64, 32, &pSig64);
2421             pSig = pSig64;
2422             if (flags & float_muladd_halve_result) {
2423                 pExp--;
2424             }
                 /* pExp - 1: convert to the one-less-than exponent convention */
2425             return roundAndPackFloat32(zSign, pExp - 1,
2426                                        pSig STATUS_VAR);
2427         }
2428         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2429     }
2430
         /* Widen c so that its explicit bit also sits at bit 62 */
2431     cSig64 = (uint64_t)cSig << (62 - 23);
2432     cSig64 |= LIT64(0x4000000000000000);
2433     expDiff = pExp - cExp;
2434
2435     if (pSign == cSign) {
2436         /* Addition */
2437         if (expDiff > 0) {
2438             /* scale c to match p */
2439             shift64RightJamming(cSig64, expDiff, &cSig64);
2440             zExp = pExp;
2441         } else if (expDiff < 0) {
2442             /* scale p to match c */
2443             shift64RightJamming(pSig64, -expDiff, &pSig64);
2444             zExp = cExp;
2445         } else {
2446             /* no scaling needed */
2447             zExp = cExp;
2448         }
2449         /* Add significands and make sure explicit bit ends up in posn 62 */
2450         zSig64 = pSig64 + cSig64;
             /* A carry into bit 63 is shifted back down, which cancels the
              * exponent decrement that the no-carry case applies.
              */
2451         if ((int64_t)zSig64 < 0) {
2452             shift64RightJamming(zSig64, 1, &zSig64);
2453         } else {
2454             zExp--;
2455         }
2456     } else {
2457         /* Subtraction */
             /* The smaller-exponent operand is aligned first, then the smaller
              * magnitude is subtracted from the larger so zSig64 never wraps.
              */
2458         if (expDiff > 0) {
2459             shift64RightJamming(cSig64, expDiff, &cSig64);
2460             zSig64 = pSig64 - cSig64;
2461             zExp = pExp;
2462         } else if (expDiff < 0) {
2463             shift64RightJamming(pSig64, -expDiff, &pSig64);
2464             zSig64 = cSig64 - pSig64;
2465             zExp = cExp;
2466             zSign ^= 1;
2467         } else {
2468             zExp = pExp;
2469             if (cSig64 < pSig64) {
2470                 zSig64 = pSig64 - cSig64;
2471             } else if (pSig64 < cSig64) {
2472                 zSig64 = cSig64 - pSig64;
2473                 zSign ^= 1;
2474             } else {
2475                 /* Exact zero */
                     /* Sign is signflip, inverted when rounding down */
2476                 zSign = signflip;
2477                 if (STATUS(float_rounding_mode) == float_round_down) {
2478                     zSign ^= 1;
2479                 }
2480                 return packFloat32(zSign, 0, 0);
2481             }
2482         }
2483         --zExp;
2484         /* Normalize to put the explicit bit back into bit 62. */
2485         shiftcount = countLeadingZeros64(zSig64) - 1;
2486         zSig64 <<= shiftcount;
2487         zExp -= shiftcount;
2488     }
         /* Halving the result is a plain exponent decrement before rounding */
2489     if (flags & float_muladd_halve_result) {
2490         zExp--;
2491     }
2492
         /* Reduce to a 32-bit significand for the single rounding step */
2493     shift64RightJamming(zSig64, 32, &zSig64);
2494     return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2495 }
2496
2497
2498 /*----------------------------------------------------------------------------
2499 | Returns the square root of the single-precision floating-point value `a'.
2500 | The operation is performed according to the IEC/IEEE Standard for Binary
2501 | Floating-Point Arithmetic.
2502 *----------------------------------------------------------------------------*/
2503
2504 float32 float32_sqrt( float32 a STATUS_PARAM )
2505 {
2506     flag aSign;
2507     int_fast16_t aExp, zExp;
2508     uint32_t aSig, zSig;
2509     uint64_t rem, term;
2510     a = float32_squash_input_denormal(a STATUS_VAR);
2511
2512     aSig = extractFloat32Frac( a );
2513     aExp = extractFloat32Exp( a );
2514     aSign = extractFloat32Sign( a );
2515     if ( aExp == 0xFF ) {
2516         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2517         if ( ! aSign ) return a;
2518         float_raise( float_flag_invalid STATUS_VAR);
2519         return float32_default_nan;
2520     }
2521     if ( aSign ) {
2522         if ( ( aExp | aSig ) == 0 ) return a;
2523         float_raise( float_flag_invalid STATUS_VAR);
2524         return float32_default_nan;
2525     }
2526     if ( aExp == 0 ) {
2527         if ( aSig == 0 ) return float32_zero;
2528         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2529     }
2530     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2531     aSig = ( aSig | 0x00800000 )<<8;
2532     zSig = estimateSqrt32( aExp, aSig ) + 2;
2533     if ( ( zSig & 0x7F ) <= 5 ) {
2534         if ( zSig < 2 ) {
2535             zSig = 0x7FFFFFFF;
2536             goto roundAndPack;
2537         }
2538         aSig >>= aExp & 1;
2539         term = ( (uint64_t) zSig ) * zSig;
2540         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2541         while ( (int64_t) rem < 0 ) {
2542             --zSig;
2543             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2544         }
2545         zSig |= ( rem != 0 );
2546     }
2547     shift32RightJamming( zSig, 1, &zSig );
2548  roundAndPack:
2549     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2550
2551 }
2552
2553 /*----------------------------------------------------------------------------
2554 | Returns the binary exponential of the single-precision floating-point value
2555 | `a'. The operation is performed according to the IEC/IEEE Standard for
2556 | Binary Floating-Point Arithmetic.
2557 |
2558 | Uses the following identities:
2559 |
2560 | 1. -------------------------------------------------------------------------
2561 |      x    x*ln(2)
2562 |     2  = e
2563 |
2564 | 2. -------------------------------------------------------------------------
2565 |                      2     3     4     5           n
2566 |      x        x     x     x     x     x           x
2567 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2568 |               1!    2!    3!    4!    5!          n!
2569 *----------------------------------------------------------------------------*/
2570
2571 static const float64 float32_exp2_coefficients[15] =
2572 {
2573     const_float64( 0x3ff0000000000000ll ), /*  1 */
2574     const_float64( 0x3fe0000000000000ll ), /*  2 */
2575     const_float64( 0x3fc5555555555555ll ), /*  3 */
2576     const_float64( 0x3fa5555555555555ll ), /*  4 */
2577     const_float64( 0x3f81111111111111ll ), /*  5 */
2578     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2579     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2580     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2581     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2582     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2583     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2584     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2585     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2586     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2587     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2588 };
2589
2590 float32 float32_exp2( float32 a STATUS_PARAM )
2591 {
2592     flag aSign;
2593     int_fast16_t aExp;
2594     uint32_t aSig;
2595     float64 r, x, xn;
2596     int i;
2597     a = float32_squash_input_denormal(a STATUS_VAR);
2598
2599     aSig = extractFloat32Frac( a );
2600     aExp = extractFloat32Exp( a );
2601     aSign = extractFloat32Sign( a );
2602
2603     if ( aExp == 0xFF) {
2604         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2605         return (aSign) ? float32_zero : a;
2606     }
2607     if (aExp == 0) {
2608         if (aSig == 0) return float32_one;
2609     }
2610
2611     float_raise( float_flag_inexact STATUS_VAR);
2612
2613     /* ******************************* */
2614     /* using float64 for approximation */
2615     /* ******************************* */
2616     x = float32_to_float64(a STATUS_VAR);
2617     x = float64_mul(x, float64_ln2 STATUS_VAR);
2618
2619     xn = x;
2620     r = float64_one;
2621     for (i = 0 ; i < 15 ; i++) {
2622         float64 f;
2623
2624         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2625         r = float64_add(r, f STATUS_VAR);
2626
2627         xn = float64_mul(xn, x STATUS_VAR);
2628     }
2629
2630     return float64_to_float32(r, status);
2631 }
2632
2633 /*----------------------------------------------------------------------------
2634 | Returns the binary log of the single-precision floating-point value `a'.
2635 | The operation is performed according to the IEC/IEEE Standard for Binary
2636 | Floating-Point Arithmetic.
2637 *----------------------------------------------------------------------------*/
2638 float32 float32_log2( float32 a STATUS_PARAM )
2639 {
2640     flag aSign, zSign;
2641     int_fast16_t aExp;
2642     uint32_t aSig, zSig, i;
2643
2644     a = float32_squash_input_denormal(a STATUS_VAR);
2645     aSig = extractFloat32Frac( a );
2646     aExp = extractFloat32Exp( a );
2647     aSign = extractFloat32Sign( a );
2648
2649     if ( aExp == 0 ) {
2650         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2651         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2652     }
2653     if ( aSign ) {
2654         float_raise( float_flag_invalid STATUS_VAR);
2655         return float32_default_nan;
2656     }
2657     if ( aExp == 0xFF ) {
2658         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2659         return a;
2660     }
2661
2662     aExp -= 0x7F;
2663     aSig |= 0x00800000;
2664     zSign = aExp < 0;
2665     zSig = aExp << 23;
2666
2667     for (i = 1 << 22; i > 0; i >>= 1) {
2668         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2669         if ( aSig & 0x01000000 ) {
2670             aSig >>= 1;
2671             zSig |= i;
2672         }
2673     }
2674
2675     if ( zSign )
2676         zSig = -zSig;
2677
2678     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2679 }
2680
2681 /*----------------------------------------------------------------------------
2682 | Returns 1 if the single-precision floating-point value `a' is equal to
2683 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2684 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2685 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2686 *----------------------------------------------------------------------------*/
2687
2688 int float32_eq( float32 a, float32 b STATUS_PARAM )
2689 {
2690     uint32_t av, bv;
2691     a = float32_squash_input_denormal(a STATUS_VAR);
2692     b = float32_squash_input_denormal(b STATUS_VAR);
2693
2694     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2695          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2696        ) {
2697         float_raise( float_flag_invalid STATUS_VAR);
2698         return 0;
2699     }
2700     av = float32_val(a);
2701     bv = float32_val(b);
2702     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2703 }
2704
2705 /*----------------------------------------------------------------------------
2706 | Returns 1 if the single-precision floating-point value `a' is less than
2707 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2708 | exception is raised if either operand is a NaN.  The comparison is performed
2709 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2710 *----------------------------------------------------------------------------*/
2711
2712 int float32_le( float32 a, float32 b STATUS_PARAM )
2713 {
2714     flag aSign, bSign;
2715     uint32_t av, bv;
2716     a = float32_squash_input_denormal(a STATUS_VAR);
2717     b = float32_squash_input_denormal(b STATUS_VAR);
2718
2719     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2720          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2721        ) {
2722         float_raise( float_flag_invalid STATUS_VAR);
2723         return 0;
2724     }
2725     aSign = extractFloat32Sign( a );
2726     bSign = extractFloat32Sign( b );
2727     av = float32_val(a);
2728     bv = float32_val(b);
2729     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2730     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2731
2732 }
2733
2734 /*----------------------------------------------------------------------------
2735 | Returns 1 if the single-precision floating-point value `a' is less than
2736 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2737 | raised if either operand is a NaN.  The comparison is performed according
2738 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2739 *----------------------------------------------------------------------------*/
2740
2741 int float32_lt( float32 a, float32 b STATUS_PARAM )
2742 {
2743     flag aSign, bSign;
2744     uint32_t av, bv;
2745     a = float32_squash_input_denormal(a STATUS_VAR);
2746     b = float32_squash_input_denormal(b STATUS_VAR);
2747
2748     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2749          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2750        ) {
2751         float_raise( float_flag_invalid STATUS_VAR);
2752         return 0;
2753     }
2754     aSign = extractFloat32Sign( a );
2755     bSign = extractFloat32Sign( b );
2756     av = float32_val(a);
2757     bv = float32_val(b);
2758     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2759     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2760
2761 }
2762
2763 /*----------------------------------------------------------------------------
2764 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2765 | be compared, and 0 otherwise.  The invalid exception is raised if either
2766 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2767 | Standard for Binary Floating-Point Arithmetic.
2768 *----------------------------------------------------------------------------*/
2769
2770 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2771 {
2772     a = float32_squash_input_denormal(a STATUS_VAR);
2773     b = float32_squash_input_denormal(b STATUS_VAR);
2774
2775     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2776          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2777        ) {
2778         float_raise( float_flag_invalid STATUS_VAR);
2779         return 1;
2780     }
2781     return 0;
2782 }
2783
2784 /*----------------------------------------------------------------------------
2785 | Returns 1 if the single-precision floating-point value `a' is equal to
2786 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2787 | exception.  The comparison is performed according to the IEC/IEEE Standard
2788 | for Binary Floating-Point Arithmetic.
2789 *----------------------------------------------------------------------------*/
2790
2791 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2792 {
2793     a = float32_squash_input_denormal(a STATUS_VAR);
2794     b = float32_squash_input_denormal(b STATUS_VAR);
2795
2796     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2797          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2798        ) {
2799         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2800             float_raise( float_flag_invalid STATUS_VAR);
2801         }
2802         return 0;
2803     }
2804     return ( float32_val(a) == float32_val(b) ) ||
2805             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2806 }
2807
2808 /*----------------------------------------------------------------------------
2809 | Returns 1 if the single-precision floating-point value `a' is less than or
2810 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2811 | cause an exception.  Otherwise, the comparison is performed according to the
2812 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2813 *----------------------------------------------------------------------------*/
2814
2815 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2816 {
2817     flag aSign, bSign;
2818     uint32_t av, bv;
2819     a = float32_squash_input_denormal(a STATUS_VAR);
2820     b = float32_squash_input_denormal(b STATUS_VAR);
2821
2822     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2823          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2824        ) {
2825         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2826             float_raise( float_flag_invalid STATUS_VAR);
2827         }
2828         return 0;
2829     }
2830     aSign = extractFloat32Sign( a );
2831     bSign = extractFloat32Sign( b );
2832     av = float32_val(a);
2833     bv = float32_val(b);
2834     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2835     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2836
2837 }
2838
2839 /*----------------------------------------------------------------------------
2840 | Returns 1 if the single-precision floating-point value `a' is less than
2841 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2842 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2843 | Standard for Binary Floating-Point Arithmetic.
2844 *----------------------------------------------------------------------------*/
2845
2846 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2847 {
2848     flag aSign, bSign;
2849     uint32_t av, bv;
2850     a = float32_squash_input_denormal(a STATUS_VAR);
2851     b = float32_squash_input_denormal(b STATUS_VAR);
2852
2853     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2854          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2855        ) {
2856         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2857             float_raise( float_flag_invalid STATUS_VAR);
2858         }
2859         return 0;
2860     }
2861     aSign = extractFloat32Sign( a );
2862     bSign = extractFloat32Sign( b );
2863     av = float32_val(a);
2864     bv = float32_val(b);
2865     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2866     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2867
2868 }
2869
2870 /*----------------------------------------------------------------------------
2871 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2872 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2873 | comparison is performed according to the IEC/IEEE Standard for Binary
2874 | Floating-Point Arithmetic.
2875 *----------------------------------------------------------------------------*/
2876
2877 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2878 {
2879     a = float32_squash_input_denormal(a STATUS_VAR);
2880     b = float32_squash_input_denormal(b STATUS_VAR);
2881
2882     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2883          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2884        ) {
2885         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2886             float_raise( float_flag_invalid STATUS_VAR);
2887         }
2888         return 1;
2889     }
2890     return 0;
2891 }
2892
2893 /*----------------------------------------------------------------------------
2894 | Returns the result of converting the double-precision floating-point value
2895 | `a' to the 32-bit two's complement integer format.  The conversion is
2896 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2897 | Arithmetic---which means in particular that the conversion is rounded
2898 | according to the current rounding mode.  If `a' is a NaN, the largest
2899 | positive integer is returned.  Otherwise, if the conversion overflows, the
2900 | largest integer with the same sign as `a' is returned.
2901 *----------------------------------------------------------------------------*/
2902
2903 int32 float64_to_int32( float64 a STATUS_PARAM )
2904 {
2905     flag aSign;
2906     int_fast16_t aExp, shiftCount;
2907     uint64_t aSig;
2908     a = float64_squash_input_denormal(a STATUS_VAR);
2909
2910     aSig = extractFloat64Frac( a );
2911     aExp = extractFloat64Exp( a );
2912     aSign = extractFloat64Sign( a );
2913     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2914     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2915     shiftCount = 0x42C - aExp;
2916     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2917     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2918
2919 }
2920
2921 /*----------------------------------------------------------------------------
2922 | Returns the result of converting the double-precision floating-point value
2923 | `a' to the 32-bit two's complement integer format.  The conversion is
2924 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2925 | Arithmetic, except that the conversion is always rounded toward zero.
2926 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2927 | the conversion overflows, the largest integer with the same sign as `a' is
2928 | returned.
2929 *----------------------------------------------------------------------------*/
2930
2931 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2932 {
2933     flag aSign;
2934     int_fast16_t aExp, shiftCount;
2935     uint64_t aSig, savedASig;
2936     int32_t z;
2937     a = float64_squash_input_denormal(a STATUS_VAR);
2938
2939     aSig = extractFloat64Frac( a );
2940     aExp = extractFloat64Exp( a );
2941     aSign = extractFloat64Sign( a );
2942     if ( 0x41E < aExp ) {
2943         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2944         goto invalid;
2945     }
2946     else if ( aExp < 0x3FF ) {
2947         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2948         return 0;
2949     }
2950     aSig |= LIT64( 0x0010000000000000 );
2951     shiftCount = 0x433 - aExp;
2952     savedASig = aSig;
2953     aSig >>= shiftCount;
2954     z = aSig;
2955     if ( aSign ) z = - z;
2956     if ( ( z < 0 ) ^ aSign ) {
2957  invalid:
2958         float_raise( float_flag_invalid STATUS_VAR);
2959         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2960     }
2961     if ( ( aSig<<shiftCount ) != savedASig ) {
2962         STATUS(float_exception_flags) |= float_flag_inexact;
2963     }
2964     return z;
2965
2966 }
2967
2968 /*----------------------------------------------------------------------------
2969 | Returns the result of converting the double-precision floating-point value
2970 | `a' to the 16-bit two's complement integer format.  The conversion is
2971 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2972 | Arithmetic, except that the conversion is always rounded toward zero.
2973 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2974 | the conversion overflows, the largest integer with the same sign as `a' is
2975 | returned.
2976 *----------------------------------------------------------------------------*/
2977
2978 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2979 {
2980     flag aSign;
2981     int_fast16_t aExp, shiftCount;
2982     uint64_t aSig, savedASig;
2983     int32 z;
2984
2985     aSig = extractFloat64Frac( a );
2986     aExp = extractFloat64Exp( a );
2987     aSign = extractFloat64Sign( a );
2988     if ( 0x40E < aExp ) {
2989         if ( ( aExp == 0x7FF ) && aSig ) {
2990             aSign = 0;
2991         }
2992         goto invalid;
2993     }
2994     else if ( aExp < 0x3FF ) {
2995         if ( aExp || aSig ) {
2996             STATUS(float_exception_flags) |= float_flag_inexact;
2997         }
2998         return 0;
2999     }
3000     aSig |= LIT64( 0x0010000000000000 );
3001     shiftCount = 0x433 - aExp;
3002     savedASig = aSig;
3003     aSig >>= shiftCount;
3004     z = aSig;
3005     if ( aSign ) {
3006         z = - z;
3007     }
3008     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3009  invalid:
3010         float_raise( float_flag_invalid STATUS_VAR);
3011         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3012     }
3013     if ( ( aSig<<shiftCount ) != savedASig ) {
3014         STATUS(float_exception_flags) |= float_flag_inexact;
3015     }
3016     return z;
3017 }
3018
3019 /*----------------------------------------------------------------------------
3020 | Returns the result of converting the double-precision floating-point value
3021 | `a' to the 64-bit two's complement integer format.  The conversion is
3022 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3023 | Arithmetic---which means in particular that the conversion is rounded
3024 | according to the current rounding mode.  If `a' is a NaN, the largest
3025 | positive integer is returned.  Otherwise, if the conversion overflows, the
3026 | largest integer with the same sign as `a' is returned.
3027 *----------------------------------------------------------------------------*/
3028
3029 int64 float64_to_int64( float64 a STATUS_PARAM )
3030 {
3031     flag aSign;
3032     int_fast16_t aExp, shiftCount;
3033     uint64_t aSig, aSigExtra;
3034     a = float64_squash_input_denormal(a STATUS_VAR);
3035
3036     aSig = extractFloat64Frac( a );
3037     aExp = extractFloat64Exp( a );
3038     aSign = extractFloat64Sign( a );
3039     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3040     shiftCount = 0x433 - aExp;
3041     if ( shiftCount <= 0 ) {
3042         if ( 0x43E < aExp ) {
3043             float_raise( float_flag_invalid STATUS_VAR);
3044             if (    ! aSign
3045                  || (    ( aExp == 0x7FF )
3046                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3047                ) {
3048                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3049             }
3050             return (int64_t) LIT64( 0x8000000000000000 );
3051         }
3052         aSigExtra = 0;
3053         aSig <<= - shiftCount;
3054     }
3055     else {
3056         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3057     }
3058     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3059
3060 }
3061
3062 /*----------------------------------------------------------------------------
3063 | Returns the result of converting the double-precision floating-point value
3064 | `a' to the 64-bit two's complement integer format.  The conversion is
3065 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3066 | Arithmetic, except that the conversion is always rounded toward zero.
3067 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3068 | the conversion overflows, the largest integer with the same sign as `a' is
3069 | returned.
3070 *----------------------------------------------------------------------------*/
3071
3072 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3073 {
3074     flag aSign;
3075     int_fast16_t aExp, shiftCount;
3076     uint64_t aSig;
3077     int64 z;
3078     a = float64_squash_input_denormal(a STATUS_VAR);
3079
3080     aSig = extractFloat64Frac( a );
3081     aExp = extractFloat64Exp( a );
3082     aSign = extractFloat64Sign( a );
3083     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3084     shiftCount = aExp - 0x433;
3085     if ( 0 <= shiftCount ) {
3086         if ( 0x43E <= aExp ) {
3087             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3088                 float_raise( float_flag_invalid STATUS_VAR);
3089                 if (    ! aSign
3090                      || (    ( aExp == 0x7FF )
3091                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3092                    ) {
3093                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3094                 }
3095             }
3096             return (int64_t) LIT64( 0x8000000000000000 );
3097         }
3098         z = aSig<<shiftCount;
3099     }
3100     else {
3101         if ( aExp < 0x3FE ) {
3102             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3103             return 0;
3104         }
3105         z = aSig>>( - shiftCount );
3106         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3107             STATUS(float_exception_flags) |= float_flag_inexact;
3108         }
3109     }
3110     if ( aSign ) z = - z;
3111     return z;
3112
3113 }
3114
3115 /*----------------------------------------------------------------------------
3116 | Returns the result of converting the double-precision floating-point value
3117 | `a' to the single-precision floating-point format.  The conversion is
3118 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3119 | Arithmetic.
3120 *----------------------------------------------------------------------------*/
3121
3122 float32 float64_to_float32( float64 a STATUS_PARAM )
3123 {
3124     flag aSign;
3125     int_fast16_t aExp;
3126     uint64_t aSig;
3127     uint32_t zSig;
3128     a = float64_squash_input_denormal(a STATUS_VAR);
3129
3130     aSig = extractFloat64Frac( a );
3131     aExp = extractFloat64Exp( a );
3132     aSign = extractFloat64Sign( a );
3133     if ( aExp == 0x7FF ) {
3134         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3135         return packFloat32( aSign, 0xFF, 0 );
3136     }
3137     shift64RightJamming( aSig, 22, &aSig );
3138     zSig = aSig;
3139     if ( aExp || zSig ) {
3140         zSig |= 0x40000000;
3141         aExp -= 0x381;
3142     }
3143     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3144
3145 }
3146
3147
3148 /*----------------------------------------------------------------------------
3149 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3150 | half-precision floating-point value, returning the result.  After being
3151 | shifted into the proper positions, the three fields are simply added
3152 | together to form the result.  This means that any integer portion of `zSig'
3153 | will be added into the exponent.  Since a properly normalized significand
3154 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3155 | than the desired result exponent whenever `zSig' is a complete, normalized
3156 | significand.
3157 *----------------------------------------------------------------------------*/
3158 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3159 {
3160     return make_float16(
3161         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3162 }
3163
3164 /*----------------------------------------------------------------------------
3165 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3166 | and significand `zSig', and returns the proper half-precision floating-
3167 | point value corresponding to the abstract input.  Ordinarily, the abstract
3168 | value is simply rounded and packed into the half-precision format, with
3169 | the inexact exception raised if the abstract input cannot be represented
3170 | exactly.  However, if the abstract value is too large, the overflow and
3171 | inexact exceptions are raised and an infinity or maximal finite value is
3172 | returned.  If the abstract value is too small, the input value is rounded to
3173 | a subnormal number, and the underflow and inexact exceptions are raised if
3174 | the abstract input cannot be represented exactly as a subnormal half-
3175 | precision floating-point number.
3176 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3177 | ARM-style "alternative representation", which omits the NaN and Inf
3178 | encodings in order to raise the maximum representable exponent by one.
3179 |     The input significand `zSig' has its binary point between bits 22
3180 | and 23, which is 13 bits to the left of the usual location.  This shifted
3181 | significand must be normalized or smaller.  If `zSig' is not normalized,
3182 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3183 | and it must not require rounding.  In the usual case that `zSig' is
3184 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3185 | Note the slightly odd position of the binary point in zSig compared with the
3186 | other roundAndPackFloat functions. This should probably be fixed if we
3187 | need to implement more float16 routines than just conversion.
3188 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3189 | Binary Floating-Point Arithmetic.
3190 *----------------------------------------------------------------------------*/
3191
3192 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3193                                    uint32_t zSig, flag ieee STATUS_PARAM)
3194 {
3195     int maxexp = ieee ? 29 : 30;
3196     uint32_t mask;
3197     uint32_t increment;
3198     bool rounding_bumps_exp;
3199     bool is_tiny = false;
3200
3201     /* Calculate the mask of bits of the mantissa which are not
3202      * representable in half-precision and will be lost.
3203      */
3204     if (zExp < 1) {
3205         /* Will be denormal in halfprec */
3206         mask = 0x00ffffff;
3207         if (zExp >= -11) {
3208             mask >>= 11 + zExp;
3209         }
3210     } else {
3211         /* Normal number in halfprec */
3212         mask = 0x00001fff;
3213     }
3214
3215     switch (STATUS(float_rounding_mode)) {
3216     case float_round_nearest_even:
3217         increment = (mask + 1) >> 1;
3218         if ((zSig & mask) == increment) {
3219             increment = zSig & (increment << 1);
3220         }
3221         break;
3222     case float_round_ties_away:
3223         increment = (mask + 1) >> 1;
3224         break;
3225     case float_round_up:
3226         increment = zSign ? 0 : mask;
3227         break;
3228     case float_round_down:
3229         increment = zSign ? mask : 0;
3230         break;
3231     default: /* round_to_zero */
3232         increment = 0;
3233         break;
3234     }
3235
3236     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3237
3238     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3239         if (ieee) {
3240             float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3241             return packFloat16(zSign, 0x1f, 0);
3242         } else {
3243             float_raise(float_flag_invalid STATUS_VAR);
3244             return packFloat16(zSign, 0x1f, 0x3ff);
3245         }
3246     }
3247
3248     if (zExp < 0) {
3249         /* Note that flush-to-zero does not affect half-precision results */
3250         is_tiny =
3251             (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3252             || (zExp < -1)
3253             || (!rounding_bumps_exp);
3254     }
3255     if (zSig & mask) {
3256         float_raise(float_flag_inexact STATUS_VAR);
3257         if (is_tiny) {
3258             float_raise(float_flag_underflow STATUS_VAR);
3259         }
3260     }
3261
3262     zSig += increment;
3263     if (rounding_bumps_exp) {
3264         zSig >>= 1;
3265         zExp++;
3266     }
3267
3268     if (zExp < -10) {
3269         return packFloat16(zSign, 0, 0);
3270     }
3271     if (zExp < 0) {
3272         zSig >>= -zExp;
3273         zExp = 0;
3274     }
3275     return packFloat16(zSign, zExp, zSig >> 13);
3276 }
3277
/*
 * Normalizes the subnormal half-precision significand `aSig'.  The
 * significand is shifted left until its leading 1 sits at bit 10 (that is,
 * until exactly 21 leading zeros remain in the 32-bit word); the shifted
 * value is stored through `zSigPtr' and the matching exponent, 1 minus the
 * shift distance, through `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t leftShift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - leftShift;
    *zSigPtr = aSig << leftShift;
}
3285
3286 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3287    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3288
3289 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3290 {
3291     flag aSign;
3292     int_fast16_t aExp;
3293     uint32_t aSig;
3294
3295     aSign = extractFloat16Sign(a);
3296     aExp = extractFloat16Exp(a);
3297     aSig = extractFloat16Frac(a);
3298
3299     if (aExp == 0x1f && ieee) {
3300         if (aSig) {
3301             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3302         }
3303         return packFloat32(aSign, 0xff, 0);
3304     }
3305     if (aExp == 0) {
3306         if (aSig == 0) {
3307             return packFloat32(aSign, 0, 0);
3308         }
3309
3310         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3311         aExp--;
3312     }
3313     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3314 }
3315
3316 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3317 {
3318     flag aSign;
3319     int_fast16_t aExp;
3320     uint32_t aSig;
3321
3322     a = float32_squash_input_denormal(a STATUS_VAR);
3323
3324     aSig = extractFloat32Frac( a );
3325     aExp = extractFloat32Exp( a );
3326     aSign = extractFloat32Sign( a );
3327     if ( aExp == 0xFF ) {
3328         if (aSig) {
3329             /* Input is a NaN */
3330             if (!ieee) {
3331                 float_raise(float_flag_invalid STATUS_VAR);
3332                 return packFloat16(aSign, 0, 0);
3333             }
3334             return commonNaNToFloat16(
3335                 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3336         }
3337         /* Infinity */
3338         if (!ieee) {
3339             float_raise(float_flag_invalid STATUS_VAR);
3340             return packFloat16(aSign, 0x1f, 0x3ff);
3341         }
3342         return packFloat16(aSign, 0x1f, 0);
3343     }
3344     if (aExp == 0 && aSig == 0) {
3345         return packFloat16(aSign, 0, 0);
3346     }
3347     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3348      * even if the input is denormal; however this is harmless because
3349      * the largest possible single-precision denormal is still smaller
3350      * than the smallest representable half-precision denormal, and so we
3351      * will end up ignoring aSig and returning via the "always return zero"
3352      * codepath.
3353      */
3354     aSig |= 0x00800000;
3355     aExp -= 0x71;
3356
3357     return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
3358 }
3359
3360 float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM)
3361 {
3362     flag aSign;
3363     int_fast16_t aExp;
3364     uint32_t aSig;
3365
3366     aSign = extractFloat16Sign(a);
3367     aExp = extractFloat16Exp(a);
3368     aSig = extractFloat16Frac(a);
3369
3370     if (aExp == 0x1f && ieee) {
3371         if (aSig) {
3372             return commonNaNToFloat64(
3373                 float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3374         }
3375         return packFloat64(aSign, 0x7ff, 0);
3376     }
3377     if (aExp == 0) {
3378         if (aSig == 0) {
3379             return packFloat64(aSign, 0, 0);
3380         }
3381
3382         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3383         aExp--;
3384     }
3385     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3386 }
3387
3388 float16 float64_to_float16(float64 a, flag ieee STATUS_PARAM)
3389 {
3390     flag aSign;
3391     int_fast16_t aExp;
3392     uint64_t aSig;
3393     uint32_t zSig;
3394
3395     a = float64_squash_input_denormal(a STATUS_VAR);
3396
3397     aSig = extractFloat64Frac(a);
3398     aExp = extractFloat64Exp(a);
3399     aSign = extractFloat64Sign(a);
3400     if (aExp == 0x7FF) {
3401         if (aSig) {
3402             /* Input is a NaN */
3403             if (!ieee) {
3404                 float_raise(float_flag_invalid STATUS_VAR);
3405                 return packFloat16(aSign, 0, 0);
3406             }
3407             return commonNaNToFloat16(
3408                 float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3409         }
3410         /* Infinity */
3411         if (!ieee) {
3412             float_raise(float_flag_invalid STATUS_VAR);
3413             return packFloat16(aSign, 0x1f, 0x3ff);
3414         }
3415         return packFloat16(aSign, 0x1f, 0);
3416     }
3417     shift64RightJamming(aSig, 29, &aSig);
3418     zSig = aSig;
3419     if (aExp == 0 && zSig == 0) {
3420         return packFloat16(aSign, 0, 0);
3421     }
3422     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3423      * even if the input is denormal; however this is harmless because
3424      * the largest possible single-precision denormal is still smaller
3425      * than the smallest representable half-precision denormal, and so we
3426      * will end up ignoring aSig and returning via the "always return zero"
3427      * codepath.
3428      */
3429     zSig |= 0x00800000;
3430     aExp -= 0x3F1;
3431
3432     return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR);
3433 }
3434
3435 /*----------------------------------------------------------------------------
3436 | Returns the result of converting the double-precision floating-point value
3437 | `a' to the extended double-precision floating-point format.  The conversion
3438 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3439 | Arithmetic.
3440 *----------------------------------------------------------------------------*/
3441
3442 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3443 {
3444     flag aSign;
3445     int_fast16_t aExp;
3446     uint64_t aSig;
3447
3448     a = float64_squash_input_denormal(a STATUS_VAR);
3449     aSig = extractFloat64Frac( a );
3450     aExp = extractFloat64Exp( a );
3451     aSign = extractFloat64Sign( a );
3452     if ( aExp == 0x7FF ) {
3453         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3454         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3455     }
3456     if ( aExp == 0 ) {
3457         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3458         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3459     }
3460     return
3461         packFloatx80(
3462             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3463
3464 }
3465
3466 /*----------------------------------------------------------------------------
3467 | Returns the result of converting the double-precision floating-point value
3468 | `a' to the quadruple-precision floating-point format.  The conversion is
3469 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3470 | Arithmetic.
3471 *----------------------------------------------------------------------------*/
3472
3473 float128 float64_to_float128( float64 a STATUS_PARAM )
3474 {
3475     flag aSign;
3476     int_fast16_t aExp;
3477     uint64_t aSig, zSig0, zSig1;
3478
3479     a = float64_squash_input_denormal(a STATUS_VAR);
3480     aSig = extractFloat64Frac( a );
3481     aExp = extractFloat64Exp( a );
3482     aSign = extractFloat64Sign( a );
3483     if ( aExp == 0x7FF ) {
3484         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3485         return packFloat128( aSign, 0x7FFF, 0, 0 );
3486     }
3487     if ( aExp == 0 ) {
3488         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3489         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3490         --aExp;
3491     }
3492     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3493     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3494
3495 }
3496
3497 /*----------------------------------------------------------------------------
3498 | Rounds the double-precision floating-point value `a' to an integer, and
3499 | returns the result as a double-precision floating-point value.  The
3500 | operation is performed according to the IEC/IEEE Standard for Binary
3501 | Floating-Point Arithmetic.
3502 *----------------------------------------------------------------------------*/
3503
3504 float64 float64_round_to_int( float64 a STATUS_PARAM )
3505 {
3506     flag aSign;
3507     int_fast16_t aExp;
3508     uint64_t lastBitMask, roundBitsMask;
3509     uint64_t z;
3510     a = float64_squash_input_denormal(a STATUS_VAR);
3511
3512     aExp = extractFloat64Exp( a );
3513     if ( 0x433 <= aExp ) {
3514         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3515             return propagateFloat64NaN( a, a STATUS_VAR );
3516         }
3517         return a;
3518     }
3519     if ( aExp < 0x3FF ) {
3520         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3521         STATUS(float_exception_flags) |= float_flag_inexact;
3522         aSign = extractFloat64Sign( a );
3523         switch ( STATUS(float_rounding_mode) ) {
3524          case float_round_nearest_even:
3525             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3526                 return packFloat64( aSign, 0x3FF, 0 );
3527             }
3528             break;
3529         case float_round_ties_away:
3530             if (aExp == 0x3FE) {
3531                 return packFloat64(aSign, 0x3ff, 0);
3532             }
3533             break;
3534          case float_round_down:
3535             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3536          case float_round_up:
3537             return make_float64(
3538             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3539         }
3540         return packFloat64( aSign, 0, 0 );
3541     }
3542     lastBitMask = 1;
3543     lastBitMask <<= 0x433 - aExp;
3544     roundBitsMask = lastBitMask - 1;
3545     z = float64_val(a);
3546     switch (STATUS(float_rounding_mode)) {
3547     case float_round_nearest_even:
3548         z += lastBitMask >> 1;
3549         if ((z & roundBitsMask) == 0) {
3550             z &= ~lastBitMask;
3551         }
3552         break;
3553     case float_round_ties_away:
3554         z += lastBitMask >> 1;
3555         break;
3556     case float_round_to_zero:
3557         break;
3558     case float_round_up:
3559         if (!extractFloat64Sign(make_float64(z))) {
3560             z += roundBitsMask;
3561         }
3562         break;
3563     case float_round_down:
3564         if (extractFloat64Sign(make_float64(z))) {
3565             z += roundBitsMask;
3566         }
3567         break;
3568     default:
3569         abort();
3570     }
3571     z &= ~ roundBitsMask;
3572     if ( z != float64_val(a) )
3573         STATUS(float_exception_flags) |= float_flag_inexact;
3574     return make_float64(z);
3575
3576 }
3577
3578 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3579 {
3580     int oldmode;
3581     float64 res;
3582     oldmode = STATUS(float_rounding_mode);
3583     STATUS(float_rounding_mode) = float_round_to_zero;
3584     res = float64_round_to_int(a STATUS_VAR);
3585     STATUS(float_rounding_mode) = oldmode;
3586     return res;
3587 }
3588
3589 /*----------------------------------------------------------------------------
3590 | Returns the result of adding the absolute values of the double-precision
3591 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3592 | before being returned.  `zSign' is ignored if the result is a NaN.
3593 | The addition is performed according to the IEC/IEEE Standard for Binary
3594 | Floating-Point Arithmetic.
3595 *----------------------------------------------------------------------------*/
3596
3597 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3598 {
3599     int_fast16_t aExp, bExp, zExp;
3600     uint64_t aSig, bSig, zSig;
3601     int_fast16_t expDiff;
3602
3603     aSig = extractFloat64Frac( a );
3604     aExp = extractFloat64Exp( a );
3605     bSig = extractFloat64Frac( b );
3606     bExp = extractFloat64Exp( b );
3607     expDiff = aExp - bExp;
3608     aSig <<= 9;
3609     bSig <<= 9;
3610     if ( 0 < expDiff ) {
3611         if ( aExp == 0x7FF ) {
3612             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3613             return a;
3614         }
3615         if ( bExp == 0 ) {
3616             --expDiff;
3617         }
3618         else {
3619             bSig |= LIT64( 0x2000000000000000 );
3620         }
3621         shift64RightJamming( bSig, expDiff, &bSig );
3622         zExp = aExp;
3623     }
3624     else if ( expDiff < 0 ) {
3625         if ( bExp == 0x7FF ) {
3626             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3627             return packFloat64( zSign, 0x7FF, 0 );
3628         }
3629         if ( aExp == 0 ) {
3630             ++expDiff;
3631         }
3632         else {
3633             aSig |= LIT64( 0x2000000000000000 );
3634         }
3635         shift64RightJamming( aSig, - expDiff, &aSig );
3636         zExp = bExp;
3637     }
3638     else {
3639         if ( aExp == 0x7FF ) {
3640             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3641             return a;
3642         }
3643         if ( aExp == 0 ) {
3644             if (STATUS(flush_to_zero)) {
3645                 if (aSig | bSig) {
3646                     float_raise(float_flag_output_denormal STATUS_VAR);
3647                 }
3648                 return packFloat64(zSign, 0, 0);
3649             }
3650             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3651         }
3652         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3653         zExp = aExp;
3654         goto roundAndPack;
3655     }
3656     aSig |= LIT64( 0x2000000000000000 );
3657     zSig = ( aSig + bSig )<<1;
3658     --zExp;
3659     if ( (int64_t) zSig < 0 ) {
3660         zSig = aSig + bSig;
3661         ++zExp;
3662     }
3663  roundAndPack:
3664     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3665
3666 }
3667
3668 /*----------------------------------------------------------------------------
3669 | Returns the result of subtracting the absolute values of the double-
3670 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3671 | difference is negated before being returned.  `zSign' is ignored if the
3672 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3673 | Standard for Binary Floating-Point Arithmetic.
3674 *----------------------------------------------------------------------------*/
3675
3676 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3677 {
3678     int_fast16_t aExp, bExp, zExp;
3679     uint64_t aSig, bSig, zSig;
3680     int_fast16_t expDiff;
3681
3682     aSig = extractFloat64Frac( a );
3683     aExp = extractFloat64Exp( a );
3684     bSig = extractFloat64Frac( b );
3685     bExp = extractFloat64Exp( b );
3686     expDiff = aExp - bExp;
3687     aSig <<= 10;
3688     bSig <<= 10;
3689     if ( 0 < expDiff ) goto aExpBigger;
3690     if ( expDiff < 0 ) goto bExpBigger;
3691     if ( aExp == 0x7FF ) {
3692         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3693         float_raise( float_flag_invalid STATUS_VAR);
3694         return float64_default_nan;
3695     }
3696     if ( aExp == 0 ) {
3697         aExp = 1;
3698         bExp = 1;
3699     }
3700     if ( bSig < aSig ) goto aBigger;
3701     if ( aSig < bSig ) goto bBigger;
3702     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3703  bExpBigger:
3704     if ( bExp == 0x7FF ) {
3705         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3706         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3707     }
3708     if ( aExp == 0 ) {
3709         ++expDiff;
3710     }
3711     else {
3712         aSig |= LIT64( 0x4000000000000000 );
3713     }
3714     shift64RightJamming( aSig, - expDiff, &aSig );
3715     bSig |= LIT64( 0x4000000000000000 );
3716  bBigger:
3717     zSig = bSig - aSig;
3718     zExp = bExp;
3719     zSign ^= 1;
3720     goto normalizeRoundAndPack;
3721  aExpBigger:
3722     if ( aExp == 0x7FF ) {
3723         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3724         return a;
3725     }
3726     if ( bExp == 0 ) {
3727         --expDiff;
3728     }
3729     else {
3730         bSig |= LIT64( 0x4000000000000000 );
3731     }
3732     shift64RightJamming( bSig, expDiff, &bSig );
3733     aSig |= LIT64( 0x4000000000000000 );
3734  aBigger:
3735     zSig = aSig - bSig;
3736     zExp = aExp;
3737  normalizeRoundAndPack:
3738     --zExp;
3739     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3740
3741 }
3742
3743 /*----------------------------------------------------------------------------
3744 | Returns the result of adding the double-precision floating-point values `a'
3745 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3746 | Binary Floating-Point Arithmetic.
3747 *----------------------------------------------------------------------------*/
3748
3749 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3750 {
3751     flag aSign, bSign;
3752     a = float64_squash_input_denormal(a STATUS_VAR);
3753     b = float64_squash_input_denormal(b STATUS_VAR);
3754
3755     aSign = extractFloat64Sign( a );
3756     bSign = extractFloat64Sign( b );
3757     if ( aSign == bSign ) {
3758         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3759     }
3760     else {
3761         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3762     }
3763
3764 }
3765
3766 /*----------------------------------------------------------------------------
3767 | Returns the result of subtracting the double-precision floating-point values
3768 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3769 | for Binary Floating-Point Arithmetic.
3770 *----------------------------------------------------------------------------*/
3771
3772 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3773 {
3774     flag aSign, bSign;
3775     a = float64_squash_input_denormal(a STATUS_VAR);
3776     b = float64_squash_input_denormal(b STATUS_VAR);
3777
3778     aSign = extractFloat64Sign( a );
3779     bSign = extractFloat64Sign( b );
3780     if ( aSign == bSign ) {
3781         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3782     }
3783     else {
3784         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3785     }
3786
3787 }
3788
3789 /*----------------------------------------------------------------------------
3790 | Returns the result of multiplying the double-precision floating-point values
3791 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3792 | for Binary Floating-Point Arithmetic.
3793 *----------------------------------------------------------------------------*/
3794
3795 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3796 {
3797     flag aSign, bSign, zSign;
3798     int_fast16_t aExp, bExp, zExp;
3799     uint64_t aSig, bSig, zSig0, zSig1;
3800
3801     a = float64_squash_input_denormal(a STATUS_VAR);
3802     b = float64_squash_input_denormal(b STATUS_VAR);
3803
3804     aSig = extractFloat64Frac( a );
3805     aExp = extractFloat64Exp( a );
3806     aSign = extractFloat64Sign( a );
3807     bSig = extractFloat64Frac( b );
3808     bExp = extractFloat64Exp( b );
3809     bSign = extractFloat64Sign( b );
3810     zSign = aSign ^ bSign;
3811     if ( aExp == 0x7FF ) {
3812         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3813             return propagateFloat64NaN( a, b STATUS_VAR );
3814         }
3815         if ( ( bExp | bSig ) == 0 ) {
3816             float_raise( float_flag_invalid STATUS_VAR);
3817             return float64_default_nan;
3818         }
3819         return packFloat64( zSign, 0x7FF, 0 );
3820     }
3821     if ( bExp == 0x7FF ) {
3822         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3823         if ( ( aExp | aSig ) == 0 ) {
3824             float_raise( float_flag_invalid STATUS_VAR);
3825             return float64_default_nan;
3826         }
3827         return packFloat64( zSign, 0x7FF, 0 );
3828     }
3829     if ( aExp == 0 ) {
3830         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3831         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3832     }
3833     if ( bExp == 0 ) {
3834         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3835         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3836     }
3837     zExp = aExp + bExp - 0x3FF;
3838     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3839     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3840     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3841     zSig0 |= ( zSig1 != 0 );
3842     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3843         zSig0 <<= 1;
3844         --zExp;
3845     }
3846     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3847
3848 }
3849
3850 /*----------------------------------------------------------------------------
3851 | Returns the result of dividing the double-precision floating-point value `a'
3852 | by the corresponding value `b'.  The operation is performed according to
3853 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3854 *----------------------------------------------------------------------------*/
3855
3856 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3857 {
3858     flag aSign, bSign, zSign;
3859     int_fast16_t aExp, bExp, zExp;
3860     uint64_t aSig, bSig, zSig;
3861     uint64_t rem0, rem1;
3862     uint64_t term0, term1;
3863     a = float64_squash_input_denormal(a STATUS_VAR);
3864     b = float64_squash_input_denormal(b STATUS_VAR);
3865
3866     aSig = extractFloat64Frac( a );
3867     aExp = extractFloat64Exp( a );
3868     aSign = extractFloat64Sign( a );
3869     bSig = extractFloat64Frac( b );
3870     bExp = extractFloat64Exp( b );
3871     bSign = extractFloat64Sign( b );
3872     zSign = aSign ^ bSign;
3873     if ( aExp == 0x7FF ) {
3874         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3875         if ( bExp == 0x7FF ) {
3876             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3877             float_raise( float_flag_invalid STATUS_VAR);
3878             return float64_default_nan;
3879         }
3880         return packFloat64( zSign, 0x7FF, 0 );
3881     }
3882     if ( bExp == 0x7FF ) {
3883         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3884         return packFloat64( zSign, 0, 0 );
3885     }
3886     if ( bExp == 0 ) {
3887         if ( bSig == 0 ) {
3888             if ( ( aExp | aSig ) == 0 ) {
3889                 float_raise( float_flag_invalid STATUS_VAR);
3890                 return float64_default_nan;
3891             }
3892             float_raise( float_flag_divbyzero STATUS_VAR);
3893             return packFloat64( zSign, 0x7FF, 0 );
3894         }
3895         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3896     }
3897     if ( aExp == 0 ) {
3898         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3899         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3900     }
3901     zExp = aExp - bExp + 0x3FD;
3902     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3903     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3904     if ( bSig <= ( aSig + aSig ) ) {
3905         aSig >>= 1;
3906         ++zExp;
3907     }
3908     zSig = estimateDiv128To64( aSig, 0, bSig );
3909     if ( ( zSig & 0x1FF ) <= 2 ) {
3910         mul64To128( bSig, zSig, &term0, &term1 );
3911         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3912         while ( (int64_t) rem0 < 0 ) {
3913             --zSig;
3914             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3915         }
3916         zSig |= ( rem1 != 0 );
3917     }
3918     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3919
3920 }
3921
3922 /*----------------------------------------------------------------------------
3923 | Returns the remainder of the double-precision floating-point value `a'
3924 | with respect to the corresponding value `b'.  The operation is performed
3925 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3926 *----------------------------------------------------------------------------*/
3927
3928 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3929 {
3930     flag aSign, zSign;
3931     int_fast16_t aExp, bExp, expDiff;
3932     uint64_t aSig, bSig;
3933     uint64_t q, alternateASig;
3934     int64_t sigMean;
3935
3936     a = float64_squash_input_denormal(a STATUS_VAR);
3937     b = float64_squash_input_denormal(b STATUS_VAR);
3938     aSig = extractFloat64Frac( a );
3939     aExp = extractFloat64Exp( a );
3940     aSign = extractFloat64Sign( a );
3941     bSig = extractFloat64Frac( b );
3942     bExp = extractFloat64Exp( b );
3943     if ( aExp == 0x7FF ) {
3944         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3945             return propagateFloat64NaN( a, b STATUS_VAR );
3946         }
3947         float_raise( float_flag_invalid STATUS_VAR);
3948         return float64_default_nan;
3949     }
3950     if ( bExp == 0x7FF ) {
3951         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3952         return a;
3953     }
3954     if ( bExp == 0 ) {
3955         if ( bSig == 0 ) {
3956             float_raise( float_flag_invalid STATUS_VAR);
3957             return float64_default_nan;
3958         }
3959         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3960     }
3961     if ( aExp == 0 ) {
3962         if ( aSig == 0 ) return a;
3963         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3964     }
3965     expDiff = aExp - bExp;
3966     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3967     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3968     if ( expDiff < 0 ) {
3969         if ( expDiff < -1 ) return a;
3970         aSig >>= 1;
3971     }
3972     q = ( bSig <= aSig );
3973     if ( q ) aSig -= bSig;
3974     expDiff -= 64;
3975     while ( 0 < expDiff ) {
3976         q = estimateDiv128To64( aSig, 0, bSig );
3977         q = ( 2 < q ) ? q - 2 : 0;
3978         aSig = - ( ( bSig>>2 ) * q );
3979         expDiff -= 62;
3980     }
3981     expDiff += 64;
3982     if ( 0 < expDiff ) {
3983         q = estimateDiv128To64( aSig, 0, bSig );
3984         q = ( 2 < q ) ? q - 2 : 0;
3985         q >>= 64 - expDiff;
3986         bSig >>= 2;
3987         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3988     }
3989     else {
3990         aSig >>= 2;
3991         bSig >>= 2;
3992     }
3993     do {
3994         alternateASig = aSig;
3995         ++q;
3996         aSig -= bSig;
3997     } while ( 0 <= (int64_t) aSig );
3998     sigMean = aSig + alternateASig;
3999     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4000         aSig = alternateASig;
4001     }
4002     zSign = ( (int64_t) aSig < 0 );
4003     if ( zSign ) aSig = - aSig;
4004     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
4005
4006 }
4007
4008 /*----------------------------------------------------------------------------
4009 | Returns the result of multiplying the double-precision floating-point values
4010 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4011 | multiplication.  The operation is performed according to the IEC/IEEE
4012 | Standard for Binary Floating-Point Arithmetic 754-2008.
4013 | The flags argument allows the caller to select negation of the
4014 | addend, the intermediate product, or the final result. (The difference
4015 | between this and having the caller do a separate negation is that negating
4016 | externally will flip the sign bit on NaNs.)
4017 *----------------------------------------------------------------------------*/
4018
4019 float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
4020 {
4021     flag aSign, bSign, cSign, zSign;
4022     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
4023     uint64_t aSig, bSig, cSig;
4024     flag pInf, pZero, pSign;
4025     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4026     int shiftcount;
4027     flag signflip, infzero;
4028
4029     a = float64_squash_input_denormal(a STATUS_VAR);
4030     b = float64_squash_input_denormal(b STATUS_VAR);
4031     c = float64_squash_input_denormal(c STATUS_VAR);
4032     aSig = extractFloat64Frac(a);
4033     aExp = extractFloat64Exp(a);
4034     aSign = extractFloat64Sign(a);
4035     bSig = extractFloat64Frac(b);
4036     bExp = extractFloat64Exp(b);
4037     bSign = extractFloat64Sign(b);
4038     cSig = extractFloat64Frac(c);
4039     cExp = extractFloat64Exp(c);
4040     cSign = extractFloat64Sign(c);
4041
4042     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4043                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4044
4045     /* It is implementation-defined whether the cases of (0,inf,qnan)
4046      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4047      * they return if they do), so we have to hand this information
4048      * off to the target-specific pick-a-NaN routine.
4049      */
4050     if (((aExp == 0x7ff) && aSig) ||
4051         ((bExp == 0x7ff) && bSig) ||
4052         ((cExp == 0x7ff) && cSig)) {
4053         return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
4054     }
4055
4056     if (infzero) {
4057         float_raise(float_flag_invalid STATUS_VAR);
4058         return float64_default_nan;
4059     }
4060
4061     if (flags & float_muladd_negate_c) {
4062         cSign ^= 1;
4063     }
4064
4065     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4066
4067     /* Work out the sign and type of the product */
4068     pSign = aSign ^ bSign;
4069     if (flags & float_muladd_negate_product) {
4070         pSign ^= 1;
4071     }
4072     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4073     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4074
4075     if (cExp == 0x7ff) {
4076         if (pInf && (pSign ^ cSign)) {
4077             /* addition of opposite-signed infinities => InvalidOperation */
4078             float_raise(float_flag_invalid STATUS_VAR);
4079             return float64_default_nan;
4080         }
4081         /* Otherwise generate an infinity of the same sign */
4082         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4083     }
4084
4085     if (pInf) {
4086         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4087     }
4088
4089     if (pZero) {
4090         if (cExp == 0) {
4091             if (cSig == 0) {
4092                 /* Adding two exact zeroes */
4093                 if (pSign == cSign) {
4094                     zSign = pSign;
4095                 } else if (STATUS(float_rounding_mode) == float_round_down) {
4096                     zSign = 1;
4097                 } else {
4098                     zSign = 0;
4099                 }
4100                 return packFloat64(zSign ^ signflip, 0, 0);
4101             }
4102             /* Exact zero plus a denorm */
4103             if (STATUS(flush_to_zero)) {
4104                 float_raise(float_flag_output_denormal STATUS_VAR);
4105                 return packFloat64(cSign ^ signflip, 0, 0);
4106             }
4107         }
4108         /* Zero plus something non-zero : just return the something */
4109         if (flags & float_muladd_halve_result) {
4110             if (cExp == 0) {
4111                 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4112             }
4113             /* Subtract one to halve, and one again because roundAndPackFloat64
4114              * wants one less than the true exponent.
4115              */
4116             cExp -= 2;
4117             cSig = (cSig | 0x0010000000000000ULL) << 10;
4118             return roundAndPackFloat64(cSign ^ signflip, cExp, cSig STATUS_VAR);
4119         }
4120         return packFloat64(cSign ^ signflip, cExp, cSig);
4121     }
4122
4123     if (aExp == 0) {
4124         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4125     }
4126     if (bExp == 0) {
4127         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4128     }
4129
4130     /* Calculate the actual result a * b + c */
4131
4132     /* Multiply first; this is easy. */
4133     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4134      * because we want the true exponent, not the "one-less-than"
4135      * flavour that roundAndPackFloat64() takes.
4136      */
4137     pExp = aExp + bExp - 0x3fe;
4138     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4139     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4140     mul64To128(aSig, bSig, &pSig0, &pSig1);
4141     if ((int64_t)(pSig0 << 1) >= 0) {
4142         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4143         pExp--;
4144     }
4145
4146     zSign = pSign ^ signflip;
4147
4148     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4149      * bit in position 126.
4150      */
4151     if (cExp == 0) {
4152         if (!cSig) {
4153             /* Throw out the special case of c being an exact zero now */
4154             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4155             if (flags & float_muladd_halve_result) {
4156                 pExp--;
4157             }
4158             return roundAndPackFloat64(zSign, pExp - 1,
4159                                        pSig1 STATUS_VAR);
4160         }
4161         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4162     }
4163
4164     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4165      * significand of the addend, with the explicit bit in position 126.
4166      */
4167     cSig0 = cSig << (126 - 64 - 52);
4168     cSig1 = 0;
4169     cSig0 |= LIT64(0x4000000000000000);
4170     expDiff = pExp - cExp;
4171
4172     if (pSign == cSign) {
4173         /* Addition */
4174         if (expDiff > 0) {
4175             /* scale c to match p */
4176             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4177             zExp = pExp;
4178         } else if (expDiff < 0) {
4179             /* scale p to match c */
4180             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4181             zExp = cExp;
4182         } else {
4183             /* no scaling needed */
4184             zExp = cExp;
4185         }
4186         /* Add significands and make sure explicit bit ends up in posn 126 */
4187         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4188         if ((int64_t)zSig0 < 0) {
4189             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4190         } else {
4191             zExp--;
4192         }
4193         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4194         if (flags & float_muladd_halve_result) {
4195             zExp--;
4196         }
4197         return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
4198     } else {
4199         /* Subtraction */
4200         if (expDiff > 0) {
4201             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4202             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4203             zExp = pExp;
4204         } else if (expDiff < 0) {
4205             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4206             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4207             zExp = cExp;
4208             zSign ^= 1;
4209         } else {
4210             zExp = pExp;
4211             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4212                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4213             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4214                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4215                 zSign ^= 1;
4216             } else {
4217                 /* Exact zero */
4218                 zSign = signflip;
4219                 if (STATUS(float_rounding_mode) == float_round_down) {
4220                     zSign ^= 1;
4221                 }
4222                 return packFloat64(zSign, 0, 0);
4223             }
4224         }
4225         --zExp;
4226         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4227          * starting with the significand in a pair of uint64_t.
4228          */
4229         if (zSig0) {
4230             shiftcount = countLeadingZeros64(zSig0) - 1;
4231             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4232             if (zSig1) {
4233                 zSig0 |= 1;
4234             }
4235             zExp -= shiftcount;
4236         } else {
4237             shiftcount = countLeadingZeros64(zSig1);
4238             if (shiftcount == 0) {
4239                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4240                 zExp -= 63;
4241             } else {
4242                 shiftcount--;
4243                 zSig0 = zSig1 << shiftcount;
4244                 zExp -= (shiftcount + 64);
4245             }
4246         }
4247         if (flags & float_muladd_halve_result) {
4248             zExp--;
4249         }
4250         return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4251     }
4252 }
4253
4254 /*----------------------------------------------------------------------------
4255 | Returns the square root of the double-precision floating-point value `a'.
4256 | The operation is performed according to the IEC/IEEE Standard for Binary
4257 | Floating-Point Arithmetic.
4258 *----------------------------------------------------------------------------*/
4259
4260 float64 float64_sqrt( float64 a STATUS_PARAM )
4261 {
4262     flag aSign;
4263     int_fast16_t aExp, zExp;
4264     uint64_t aSig, zSig, doubleZSig;
4265     uint64_t rem0, rem1, term0, term1;
4266     a = float64_squash_input_denormal(a STATUS_VAR);
4267
4268     aSig = extractFloat64Frac( a );
4269     aExp = extractFloat64Exp( a );
4270     aSign = extractFloat64Sign( a );
4271     if ( aExp == 0x7FF ) {
4272         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4273         if ( ! aSign ) return a;
4274         float_raise( float_flag_invalid STATUS_VAR);
4275         return float64_default_nan;
4276     }
4277     if ( aSign ) {
4278         if ( ( aExp | aSig ) == 0 ) return a;
4279         float_raise( float_flag_invalid STATUS_VAR);
4280         return float64_default_nan;
4281     }
4282     if ( aExp == 0 ) {
4283         if ( aSig == 0 ) return float64_zero;
4284         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4285     }
4286     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4287     aSig |= LIT64( 0x0010000000000000 );
4288     zSig = estimateSqrt32( aExp, aSig>>21 );
4289     aSig <<= 9 - ( aExp & 1 );
4290     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4291     if ( ( zSig & 0x1FF ) <= 5 ) {
4292         doubleZSig = zSig<<1;
4293         mul64To128( zSig, zSig, &term0, &term1 );
4294         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4295         while ( (int64_t) rem0 < 0 ) {
4296             --zSig;
4297             doubleZSig -= 2;
4298             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4299         }
4300         zSig |= ( ( rem0 | rem1 ) != 0 );
4301     }
4302     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4303
4304 }
4305
4306 /*----------------------------------------------------------------------------
4307 | Returns the binary log of the double-precision floating-point value `a'.
4308 | The operation is performed according to the IEC/IEEE Standard for Binary
4309 | Floating-Point Arithmetic.
4310 *----------------------------------------------------------------------------*/
4311 float64 float64_log2( float64 a STATUS_PARAM )
4312 {
4313     flag aSign, zSign;
4314     int_fast16_t aExp;
4315     uint64_t aSig, aSig0, aSig1, zSig, i;
4316     a = float64_squash_input_denormal(a STATUS_VAR);
4317
4318     aSig = extractFloat64Frac( a );
4319     aExp = extractFloat64Exp( a );
4320     aSign = extractFloat64Sign( a );
4321
4322     if ( aExp == 0 ) {
4323         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4324         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4325     }
4326     if ( aSign ) {
4327         float_raise( float_flag_invalid STATUS_VAR);
4328         return float64_default_nan;
4329     }
4330     if ( aExp == 0x7FF ) {
4331         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4332         return a;
4333     }
4334
4335     aExp -= 0x3FF;
4336     aSig |= LIT64( 0x0010000000000000 );
4337     zSign = aExp < 0;
4338     zSig = (uint64_t)aExp << 52;
4339     for (i = 1LL << 51; i > 0; i >>= 1) {
4340         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4341         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4342         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4343             aSig >>= 1;
4344             zSig |= i;
4345         }
4346     }
4347
4348     if ( zSign )
4349         zSig = -zSig;
4350     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4351 }
4352
4353 /*----------------------------------------------------------------------------
4354 | Returns 1 if the double-precision floating-point value `a' is equal to the
4355 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4356 | if either operand is a NaN.  Otherwise, the comparison is performed
4357 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4358 *----------------------------------------------------------------------------*/
4359
4360 int float64_eq( float64 a, float64 b STATUS_PARAM )
4361 {
4362     uint64_t av, bv;
4363     a = float64_squash_input_denormal(a STATUS_VAR);
4364     b = float64_squash_input_denormal(b STATUS_VAR);
4365
4366     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4367          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4368        ) {
4369         float_raise( float_flag_invalid STATUS_VAR);
4370         return 0;
4371     }
4372     av = float64_val(a);
4373     bv = float64_val(b);
4374     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4375
4376 }
4377
4378 /*----------------------------------------------------------------------------
4379 | Returns 1 if the double-precision floating-point value `a' is less than or
4380 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4381 | exception is raised if either operand is a NaN.  The comparison is performed
4382 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4383 *----------------------------------------------------------------------------*/
4384
4385 int float64_le( float64 a, float64 b STATUS_PARAM )
4386 {
4387     flag aSign, bSign;
4388     uint64_t av, bv;
4389     a = float64_squash_input_denormal(a STATUS_VAR);
4390     b = float64_squash_input_denormal(b STATUS_VAR);
4391
4392     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4393          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4394        ) {
4395         float_raise( float_flag_invalid STATUS_VAR);
4396         return 0;
4397     }
4398     aSign = extractFloat64Sign( a );
4399     bSign = extractFloat64Sign( b );
4400     av = float64_val(a);
4401     bv = float64_val(b);
4402     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4403     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4404
4405 }
4406
4407 /*----------------------------------------------------------------------------
4408 | Returns 1 if the double-precision floating-point value `a' is less than
4409 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4410 | raised if either operand is a NaN.  The comparison is performed according
4411 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4412 *----------------------------------------------------------------------------*/
4413
4414 int float64_lt( float64 a, float64 b STATUS_PARAM )
4415 {
4416     flag aSign, bSign;
4417     uint64_t av, bv;
4418
4419     a = float64_squash_input_denormal(a STATUS_VAR);
4420     b = float64_squash_input_denormal(b STATUS_VAR);
4421     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4422          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4423        ) {
4424         float_raise( float_flag_invalid STATUS_VAR);
4425         return 0;
4426     }
4427     aSign = extractFloat64Sign( a );
4428     bSign = extractFloat64Sign( b );
4429     av = float64_val(a);
4430     bv = float64_val(b);
4431     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4432     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4433
4434 }
4435
4436 /*----------------------------------------------------------------------------
4437 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4438 | be compared, and 0 otherwise.  The invalid exception is raised if either
4439 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4440 | Standard for Binary Floating-Point Arithmetic.
4441 *----------------------------------------------------------------------------*/
4442
4443 int float64_unordered( float64 a, float64 b STATUS_PARAM )
4444 {
4445     a = float64_squash_input_denormal(a STATUS_VAR);
4446     b = float64_squash_input_denormal(b STATUS_VAR);
4447
4448     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4449          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4450        ) {
4451         float_raise( float_flag_invalid STATUS_VAR);
4452         return 1;
4453     }
4454     return 0;
4455 }
4456
4457 /*----------------------------------------------------------------------------
4458 | Returns 1 if the double-precision floating-point value `a' is equal to the
4459 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4460 | exception.  The comparison is performed according to the IEC/IEEE Standard
4461 | for Binary Floating-Point Arithmetic.
4462 *----------------------------------------------------------------------------*/
4463
4464 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4465 {
4466     uint64_t av, bv;
4467     a = float64_squash_input_denormal(a STATUS_VAR);
4468     b = float64_squash_input_denormal(b STATUS_VAR);
4469
4470     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4471          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4472        ) {
4473         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4474             float_raise( float_flag_invalid STATUS_VAR);
4475         }
4476         return 0;
4477     }
4478     av = float64_val(a);
4479     bv = float64_val(b);
4480     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4481
4482 }
4483
4484 /*----------------------------------------------------------------------------
4485 | Returns 1 if the double-precision floating-point value `a' is less than or
4486 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4487 | cause an exception.  Otherwise, the comparison is performed according to the
4488 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4489 *----------------------------------------------------------------------------*/
4490
4491 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4492 {
4493     flag aSign, bSign;
4494     uint64_t av, bv;
4495     a = float64_squash_input_denormal(a STATUS_VAR);
4496     b = float64_squash_input_denormal(b STATUS_VAR);
4497
4498     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4499          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4500        ) {
4501         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4502             float_raise( float_flag_invalid STATUS_VAR);
4503         }
4504         return 0;
4505     }
4506     aSign = extractFloat64Sign( a );
4507     bSign = extractFloat64Sign( b );
4508     av = float64_val(a);
4509     bv = float64_val(b);
4510     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4511     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4512
4513 }
4514
4515 /*----------------------------------------------------------------------------
4516 | Returns 1 if the double-precision floating-point value `a' is less than
4517 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4518 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4519 | Standard for Binary Floating-Point Arithmetic.
4520 *----------------------------------------------------------------------------*/
4521
4522 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4523 {
4524     flag aSign, bSign;
4525     uint64_t av, bv;
4526     a = float64_squash_input_denormal(a STATUS_VAR);
4527     b = float64_squash_input_denormal(b STATUS_VAR);
4528
4529     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4530          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4531        ) {
4532         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4533             float_raise( float_flag_invalid STATUS_VAR);
4534         }
4535         return 0;
4536     }
4537     aSign = extractFloat64Sign( a );
4538     bSign = extractFloat64Sign( b );
4539     av = float64_val(a);
4540     bv = float64_val(b);
4541     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4542     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4543
4544 }
4545
4546 /*----------------------------------------------------------------------------
4547 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4548 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4549 | comparison is performed according to the IEC/IEEE Standard for Binary
4550 | Floating-Point Arithmetic.
4551 *----------------------------------------------------------------------------*/
4552
4553 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4554 {
4555     a = float64_squash_input_denormal(a STATUS_VAR);
4556     b = float64_squash_input_denormal(b STATUS_VAR);
4557
4558     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4559          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4560        ) {
4561         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4562             float_raise( float_flag_invalid STATUS_VAR);
4563         }
4564         return 1;
4565     }
4566     return 0;
4567 }
4568
4569 /*----------------------------------------------------------------------------
4570 | Returns the result of converting the extended double-precision floating-
4571 | point value `a' to the 32-bit two's complement integer format.  The
4572 | conversion is performed according to the IEC/IEEE Standard for Binary
4573 | Floating-Point Arithmetic---which means in particular that the conversion
4574 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4575 | largest positive integer is returned.  Otherwise, if the conversion
4576 | overflows, the largest integer with the same sign as `a' is returned.
4577 *----------------------------------------------------------------------------*/
4578
4579 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4580 {
4581     flag aSign;
4582     int32 aExp, shiftCount;
4583     uint64_t aSig;
4584
4585     aSig = extractFloatx80Frac( a );
4586     aExp = extractFloatx80Exp( a );
4587     aSign = extractFloatx80Sign( a );
4588     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4589     shiftCount = 0x4037 - aExp;
4590     if ( shiftCount <= 0 ) shiftCount = 1;
4591     shift64RightJamming( aSig, shiftCount, &aSig );
4592     return roundAndPackInt32( aSign, aSig STATUS_VAR );
4593
4594 }
4595
4596 /*----------------------------------------------------------------------------
4597 | Returns the result of converting the extended double-precision floating-
4598 | point value `a' to the 32-bit two's complement integer format.  The
4599 | conversion is performed according to the IEC/IEEE Standard for Binary
4600 | Floating-Point Arithmetic, except that the conversion is always rounded
4601 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4602 | Otherwise, if the conversion overflows, the largest integer with the same
4603 | sign as `a' is returned.
4604 *----------------------------------------------------------------------------*/
4605
4606 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4607 {
4608     flag aSign;
4609     int32 aExp, shiftCount;
4610     uint64_t aSig, savedASig;
4611     int32_t z;
4612
4613     aSig = extractFloatx80Frac( a );
4614     aExp = extractFloatx80Exp( a );
4615     aSign = extractFloatx80Sign( a );
4616     if ( 0x401E < aExp ) {
4617         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4618         goto invalid;
4619     }
4620     else if ( aExp < 0x3FFF ) {
4621         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4622         return 0;
4623     }
4624     shiftCount = 0x403E - aExp;
4625     savedASig = aSig;
4626     aSig >>= shiftCount;
4627     z = aSig;
4628     if ( aSign ) z = - z;
4629     if ( ( z < 0 ) ^ aSign ) {
4630  invalid:
4631         float_raise( float_flag_invalid STATUS_VAR);
4632         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4633     }
4634     if ( ( aSig<<shiftCount ) != savedASig ) {
4635         STATUS(float_exception_flags) |= float_flag_inexact;
4636     }
4637     return z;
4638
4639 }
4640
4641 /*----------------------------------------------------------------------------
4642 | Returns the result of converting the extended double-precision floating-
4643 | point value `a' to the 64-bit two's complement integer format.  The
4644 | conversion is performed according to the IEC/IEEE Standard for Binary
4645 | Floating-Point Arithmetic---which means in particular that the conversion
4646 | is rounded according to the current rounding mode.  If `a' is a NaN,
4647 | the largest positive integer is returned.  Otherwise, if the conversion
4648 | overflows, the largest integer with the same sign as `a' is returned.
4649 *----------------------------------------------------------------------------*/
4650
4651 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4652 {
4653     flag aSign;
4654     int32 aExp, shiftCount;
4655     uint64_t aSig, aSigExtra;
4656
4657     aSig = extractFloatx80Frac( a );
4658     aExp = extractFloatx80Exp( a );
4659     aSign = extractFloatx80Sign( a );
4660     shiftCount = 0x403E - aExp;
4661     if ( shiftCount <= 0 ) {
4662         if ( shiftCount ) {
4663             float_raise( float_flag_invalid STATUS_VAR);
4664             if (    ! aSign
4665                  || (    ( aExp == 0x7FFF )
4666                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4667                ) {
4668                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4669             }
4670             return (int64_t) LIT64( 0x8000000000000000 );
4671         }
4672         aSigExtra = 0;
4673     }
4674     else {
4675         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4676     }
4677     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4678
4679 }
4680
4681 /*----------------------------------------------------------------------------
4682 | Returns the result of converting the extended double-precision floating-
4683 | point value `a' to the 64-bit two's complement integer format.  The
4684 | conversion is performed according to the IEC/IEEE Standard for Binary
4685 | Floating-Point Arithmetic, except that the conversion is always rounded
4686 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4687 | Otherwise, if the conversion overflows, the largest integer with the same
4688 | sign as `a' is returned.
4689 *----------------------------------------------------------------------------*/
4690
4691 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4692 {
4693     flag aSign;
4694     int32 aExp, shiftCount;
4695     uint64_t aSig;
4696     int64 z;
4697
4698     aSig = extractFloatx80Frac( a );
4699     aExp = extractFloatx80Exp( a );
4700     aSign = extractFloatx80Sign( a );
4701     shiftCount = aExp - 0x403E;
4702     if ( 0 <= shiftCount ) {
4703         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4704         if ( ( a.high != 0xC03E ) || aSig ) {
4705             float_raise( float_flag_invalid STATUS_VAR);
4706             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4707                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4708             }
4709         }
4710         return (int64_t) LIT64( 0x8000000000000000 );
4711     }
4712     else if ( aExp < 0x3FFF ) {
4713         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4714         return 0;
4715     }
4716     z = aSig>>( - shiftCount );
4717     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4718         STATUS(float_exception_flags) |= float_flag_inexact;
4719     }
4720     if ( aSign ) z = - z;
4721     return z;
4722
4723 }
4724
4725 /*----------------------------------------------------------------------------
4726 | Returns the result of converting the extended double-precision floating-
4727 | point value `a' to the single-precision floating-point format.  The
4728 | conversion is performed according to the IEC/IEEE Standard for Binary
4729 | Floating-Point Arithmetic.
4730 *----------------------------------------------------------------------------*/
4731
4732 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4733 {
4734     flag aSign;
4735     int32 aExp;
4736     uint64_t aSig;
4737
4738     aSig = extractFloatx80Frac( a );
4739     aExp = extractFloatx80Exp( a );
4740     aSign = extractFloatx80Sign( a );
4741     if ( aExp == 0x7FFF ) {
4742         if ( (uint64_t) ( aSig<<1 ) ) {
4743             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4744         }
4745         return packFloat32( aSign, 0xFF, 0 );
4746     }
4747     shift64RightJamming( aSig, 33, &aSig );
4748     if ( aExp || aSig ) aExp -= 0x3F81;
4749     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4750
4751 }
4752
4753 /*----------------------------------------------------------------------------
4754 | Returns the result of converting the extended double-precision floating-
4755 | point value `a' to the double-precision floating-point format.  The
4756 | conversion is performed according to the IEC/IEEE Standard for Binary
4757 | Floating-Point Arithmetic.
4758 *----------------------------------------------------------------------------*/
4759
4760 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4761 {
4762     flag aSign;
4763     int32 aExp;
4764     uint64_t aSig, zSig;
4765
4766     aSig = extractFloatx80Frac( a );
4767     aExp = extractFloatx80Exp( a );
4768     aSign = extractFloatx80Sign( a );
4769     if ( aExp == 0x7FFF ) {
4770         if ( (uint64_t) ( aSig<<1 ) ) {
4771             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4772         }
4773         return packFloat64( aSign, 0x7FF, 0 );
4774     }
4775     shift64RightJamming( aSig, 1, &zSig );
4776     if ( aExp || aSig ) aExp -= 0x3C01;
4777     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4778
4779 }
4780
4781 /*----------------------------------------------------------------------------
4782 | Returns the result of converting the extended double-precision floating-
4783 | point value `a' to the quadruple-precision floating-point format.  The
4784 | conversion is performed according to the IEC/IEEE Standard for Binary
4785 | Floating-Point Arithmetic.
4786 *----------------------------------------------------------------------------*/
4787
4788 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4789 {
4790     flag aSign;
4791     int_fast16_t aExp;
4792     uint64_t aSig, zSig0, zSig1;
4793
4794     aSig = extractFloatx80Frac( a );
4795     aExp = extractFloatx80Exp( a );
4796     aSign = extractFloatx80Sign( a );
4797     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4798         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4799     }
4800     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4801     return packFloat128( aSign, aExp, zSig0, zSig1 );
4802
4803 }
4804
4805 /*----------------------------------------------------------------------------
4806 | Rounds the extended double-precision floating-point value `a' to an integer,
4807 | and returns the result as an extended quadruple-precision floating-point
4808 | value.  The operation is performed according to the IEC/IEEE Standard for
4809 | Binary Floating-Point Arithmetic.
4810 *----------------------------------------------------------------------------*/
4811
4812 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4813 {
4814     flag aSign;
4815     int32 aExp;
4816     uint64_t lastBitMask, roundBitsMask;
4817     floatx80 z;
4818
4819     aExp = extractFloatx80Exp( a );
4820     if ( 0x403E <= aExp ) {
4821         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4822             return propagateFloatx80NaN( a, a STATUS_VAR );
4823         }
4824         return a;
4825     }
4826     if ( aExp < 0x3FFF ) {
4827         if (    ( aExp == 0 )
4828              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4829             return a;
4830         }
4831         STATUS(float_exception_flags) |= float_flag_inexact;
4832         aSign = extractFloatx80Sign( a );
4833         switch ( STATUS(float_rounding_mode) ) {
4834          case float_round_nearest_even:
4835             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4836                ) {
4837                 return
4838                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4839             }
4840             break;
4841         case float_round_ties_away:
4842             if (aExp == 0x3FFE) {
4843                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4844             }
4845             break;
4846          case float_round_down:
4847             return
4848                   aSign ?
4849                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4850                 : packFloatx80( 0, 0, 0 );
4851          case float_round_up:
4852             return
4853                   aSign ? packFloatx80( 1, 0, 0 )
4854                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4855         }
4856         return packFloatx80( aSign, 0, 0 );
4857     }
4858     lastBitMask = 1;
4859     lastBitMask <<= 0x403E - aExp;
4860     roundBitsMask = lastBitMask - 1;
4861     z = a;
4862     switch (STATUS(float_rounding_mode)) {
4863     case float_round_nearest_even:
4864         z.low += lastBitMask>>1;
4865         if ((z.low & roundBitsMask) == 0) {
4866             z.low &= ~lastBitMask;
4867         }
4868         break;
4869     case float_round_ties_away:
4870         z.low += lastBitMask >> 1;
4871         break;
4872     case float_round_to_zero:
4873         break;
4874     case float_round_up:
4875         if (!extractFloatx80Sign(z)) {
4876             z.low += roundBitsMask;
4877         }
4878         break;
4879     case float_round_down:
4880         if (extractFloatx80Sign(z)) {
4881             z.low += roundBitsMask;
4882         }
4883         break;
4884     default:
4885         abort();
4886     }
4887     z.low &= ~ roundBitsMask;
4888     if ( z.low == 0 ) {
4889         ++z.high;
4890         z.low = LIT64( 0x8000000000000000 );
4891     }
4892     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4893     return z;
4894
4895 }
4896
4897 /*----------------------------------------------------------------------------
4898 | Returns the result of adding the absolute values of the extended double-
4899 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4900 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4901 | The addition is performed according to the IEC/IEEE Standard for Binary
4902 | Floating-Point Arithmetic.
4903 *----------------------------------------------------------------------------*/
4904
4905 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4906 {
4907     int32 aExp, bExp, zExp;
4908     uint64_t aSig, bSig, zSig0, zSig1;
4909     int32 expDiff;
4910
4911     aSig = extractFloatx80Frac( a );
4912     aExp = extractFloatx80Exp( a );
4913     bSig = extractFloatx80Frac( b );
4914     bExp = extractFloatx80Exp( b );
4915     expDiff = aExp - bExp;
4916     if ( 0 < expDiff ) {
4917         if ( aExp == 0x7FFF ) {
4918             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4919             return a;
4920         }
4921         if ( bExp == 0 ) --expDiff;
4922         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4923         zExp = aExp;
4924     }
4925     else if ( expDiff < 0 ) {
4926         if ( bExp == 0x7FFF ) {
4927             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4928             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4929         }
4930         if ( aExp == 0 ) ++expDiff;
4931         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4932         zExp = bExp;
4933     }
4934     else {
4935         if ( aExp == 0x7FFF ) {
4936             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4937                 return propagateFloatx80NaN( a, b STATUS_VAR );
4938             }
4939             return a;
4940         }
4941         zSig1 = 0;
4942         zSig0 = aSig + bSig;
4943         if ( aExp == 0 ) {
4944             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4945             goto roundAndPack;
4946         }
4947         zExp = aExp;
4948         goto shiftRight1;
4949     }
4950     zSig0 = aSig + bSig;
4951     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4952  shiftRight1:
4953     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4954     zSig0 |= LIT64( 0x8000000000000000 );
4955     ++zExp;
4956  roundAndPack:
4957     return
4958         roundAndPackFloatx80(
4959             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4960
4961 }
4962
4963 /*----------------------------------------------------------------------------
4964 | Returns the result of subtracting the absolute values of the extended
4965 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4966 | difference is negated before being returned.  `zSign' is ignored if the
4967 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4968 | Standard for Binary Floating-Point Arithmetic.
4969 *----------------------------------------------------------------------------*/
4970
4971 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4972 {
4973     int32 aExp, bExp, zExp;
4974     uint64_t aSig, bSig, zSig0, zSig1;
4975     int32 expDiff;
4976     floatx80 z;
4977
4978     aSig = extractFloatx80Frac( a );
4979     aExp = extractFloatx80Exp( a );
4980     bSig = extractFloatx80Frac( b );
4981     bExp = extractFloatx80Exp( b );
4982     expDiff = aExp - bExp;
4983     if ( 0 < expDiff ) goto aExpBigger;
4984     if ( expDiff < 0 ) goto bExpBigger;
4985     if ( aExp == 0x7FFF ) {
4986         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4987             return propagateFloatx80NaN( a, b STATUS_VAR );
4988         }
4989         float_raise( float_flag_invalid STATUS_VAR);
4990         z.low = floatx80_default_nan_low;
4991         z.high = floatx80_default_nan_high;
4992         return z;
4993     }
4994     if ( aExp == 0 ) {
4995         aExp = 1;
4996         bExp = 1;
4997     }
4998     zSig1 = 0;
4999     if ( bSig < aSig ) goto aBigger;
5000     if ( aSig < bSig ) goto bBigger;
5001     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
5002  bExpBigger:
5003     if ( bExp == 0x7FFF ) {
5004         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5005         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5006     }
5007     if ( aExp == 0 ) ++expDiff;
5008     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5009  bBigger:
5010     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5011     zExp = bExp;
5012     zSign ^= 1;
5013     goto normalizeRoundAndPack;
5014  aExpBigger:
5015     if ( aExp == 0x7FFF ) {
5016         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5017         return a;
5018     }
5019     if ( bExp == 0 ) --expDiff;
5020     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5021  aBigger:
5022     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5023     zExp = aExp;
5024  normalizeRoundAndPack:
5025     return
5026         normalizeRoundAndPackFloatx80(
5027             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5028
5029 }
5030
5031 /*----------------------------------------------------------------------------
5032 | Returns the result of adding the extended double-precision floating-point
5033 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5034 | Standard for Binary Floating-Point Arithmetic.
5035 *----------------------------------------------------------------------------*/
5036
5037 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
5038 {
5039     flag aSign, bSign;
5040
5041     aSign = extractFloatx80Sign( a );
5042     bSign = extractFloatx80Sign( b );
5043     if ( aSign == bSign ) {
5044         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5045     }
5046     else {
5047         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5048     }
5049
5050 }
5051
5052 /*----------------------------------------------------------------------------
5053 | Returns the result of subtracting the extended double-precision floating-
5054 | point values `a' and `b'.  The operation is performed according to the
5055 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5056 *----------------------------------------------------------------------------*/
5057
5058 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
5059 {
5060     flag aSign, bSign;
5061
5062     aSign = extractFloatx80Sign( a );
5063     bSign = extractFloatx80Sign( b );
5064     if ( aSign == bSign ) {
5065         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5066     }
5067     else {
5068         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5069     }
5070
5071 }
5072
5073 /*----------------------------------------------------------------------------
5074 | Returns the result of multiplying the extended double-precision floating-
5075 | point values `a' and `b'.  The operation is performed according to the
5076 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5077 *----------------------------------------------------------------------------*/
5078
5079 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
5080 {
5081     flag aSign, bSign, zSign;
5082     int32 aExp, bExp, zExp;
5083     uint64_t aSig, bSig, zSig0, zSig1;
5084     floatx80 z;
5085
5086     aSig = extractFloatx80Frac( a );
5087     aExp = extractFloatx80Exp( a );
5088     aSign = extractFloatx80Sign( a );
5089     bSig = extractFloatx80Frac( b );
5090     bExp = extractFloatx80Exp( b );
5091     bSign = extractFloatx80Sign( b );
5092     zSign = aSign ^ bSign;
5093     if ( aExp == 0x7FFF ) {
5094         if (    (uint64_t) ( aSig<<1 )
5095              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5096             return propagateFloatx80NaN( a, b STATUS_VAR );
5097         }
5098         if ( ( bExp | bSig ) == 0 ) goto invalid;
5099         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5100     }
5101     if ( bExp == 0x7FFF ) {
5102         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5103         if ( ( aExp | aSig ) == 0 ) {
5104  invalid:
5105             float_raise( float_flag_invalid STATUS_VAR);
5106             z.low = floatx80_default_nan_low;
5107             z.high = floatx80_default_nan_high;
5108             return z;
5109         }
5110         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5111     }
5112     if ( aExp == 0 ) {
5113         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5114         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5115     }
5116     if ( bExp == 0 ) {
5117         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5118         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5119     }
5120     zExp = aExp + bExp - 0x3FFE;
5121     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5122     if ( 0 < (int64_t) zSig0 ) {
5123         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5124         --zExp;
5125     }
5126     return
5127         roundAndPackFloatx80(
5128             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5129
5130 }
5131
5132 /*----------------------------------------------------------------------------
5133 | Returns the result of dividing the extended double-precision floating-point
5134 | value `a' by the corresponding value `b'.  The operation is performed
5135 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5136 *----------------------------------------------------------------------------*/
5137
5138 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
5139 {
5140     flag aSign, bSign, zSign;
5141     int32 aExp, bExp, zExp;
5142     uint64_t aSig, bSig, zSig0, zSig1;
5143     uint64_t rem0, rem1, rem2, term0, term1, term2;
5144     floatx80 z;
5145
5146     aSig = extractFloatx80Frac( a );
5147     aExp = extractFloatx80Exp( a );
5148     aSign = extractFloatx80Sign( a );
5149     bSig = extractFloatx80Frac( b );
5150     bExp = extractFloatx80Exp( b );
5151     bSign = extractFloatx80Sign( b );
5152     zSign = aSign ^ bSign;
5153     if ( aExp == 0x7FFF ) {
5154         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5155         if ( bExp == 0x7FFF ) {
5156             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5157             goto invalid;
5158         }
5159         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5160     }
5161     if ( bExp == 0x7FFF ) {
5162         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5163         return packFloatx80( zSign, 0, 0 );
5164     }
5165     if ( bExp == 0 ) {
5166         if ( bSig == 0 ) {
5167             if ( ( aExp | aSig ) == 0 ) {
5168  invalid:
5169                 float_raise( float_flag_invalid STATUS_VAR);
5170                 z.low = floatx80_default_nan_low;
5171                 z.high = floatx80_default_nan_high;
5172                 return z;
5173             }
5174             float_raise( float_flag_divbyzero STATUS_VAR);
5175             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5176         }
5177         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5178     }
5179     if ( aExp == 0 ) {
5180         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5181         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5182     }
5183     zExp = aExp - bExp + 0x3FFE;
5184     rem1 = 0;
5185     if ( bSig <= aSig ) {
5186         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5187         ++zExp;
5188     }
5189     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5190     mul64To128( bSig, zSig0, &term0, &term1 );
5191     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5192     while ( (int64_t) rem0 < 0 ) {
5193         --zSig0;
5194         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5195     }
5196     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5197     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5198         mul64To128( bSig, zSig1, &term1, &term2 );
5199         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5200         while ( (int64_t) rem1 < 0 ) {
5201             --zSig1;
5202             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5203         }
5204         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5205     }
5206     return
5207         roundAndPackFloatx80(
5208             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5209
5210 }
5211
5212 /*----------------------------------------------------------------------------
5213 | Returns the remainder of the extended double-precision floating-point value
5214 | `a' with respect to the corresponding value `b'.  The operation is performed
5215 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5216 *----------------------------------------------------------------------------*/
5217
5218 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
5219 {
5220     flag aSign, zSign;
5221     int32 aExp, bExp, expDiff;
5222     uint64_t aSig0, aSig1, bSig;
5223     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5224     floatx80 z;
5225
5226     aSig0 = extractFloatx80Frac( a );
5227     aExp = extractFloatx80Exp( a );
5228     aSign = extractFloatx80Sign( a );
5229     bSig = extractFloatx80Frac( b );
5230     bExp = extractFloatx80Exp( b );
5231     if ( aExp == 0x7FFF ) {
5232         if (    (uint64_t) ( aSig0<<1 )
5233              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5234             return propagateFloatx80NaN( a, b STATUS_VAR );
5235         }
5236         goto invalid;
5237     }
5238     if ( bExp == 0x7FFF ) {
5239         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5240         return a;
5241     }
5242     if ( bExp == 0 ) {
5243         if ( bSig == 0 ) {
5244  invalid:
5245             float_raise( float_flag_invalid STATUS_VAR);
5246             z.low = floatx80_default_nan_low;
5247             z.high = floatx80_default_nan_high;
5248             return z;
5249         }
5250         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5251     }
5252     if ( aExp == 0 ) {
5253         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5254         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5255     }
5256     bSig |= LIT64( 0x8000000000000000 );
5257     zSign = aSign;
5258     expDiff = aExp - bExp;
5259     aSig1 = 0;
5260     if ( expDiff < 0 ) {
5261         if ( expDiff < -1 ) return a;
5262         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5263         expDiff = 0;
5264     }
5265     q = ( bSig <= aSig0 );
5266     if ( q ) aSig0 -= bSig;
5267     expDiff -= 64;
5268     while ( 0 < expDiff ) {
5269         q = estimateDiv128To64( aSig0, aSig1, bSig );
5270         q = ( 2 < q ) ? q - 2 : 0;
5271         mul64To128( bSig, q, &term0, &term1 );
5272         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5273         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5274         expDiff -= 62;
5275     }
5276     expDiff += 64;
5277     if ( 0 < expDiff ) {
5278         q = estimateDiv128To64( aSig0, aSig1, bSig );
5279         q = ( 2 < q ) ? q - 2 : 0;
5280         q >>= 64 - expDiff;
5281         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5282         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5283         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5284         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5285             ++q;
5286             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5287         }
5288     }
5289     else {
5290         term1 = 0;
5291         term0 = bSig;
5292     }
5293     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5294     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5295          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5296               && ( q & 1 ) )
5297        ) {
5298         aSig0 = alternateASig0;
5299         aSig1 = alternateASig1;
5300         zSign = ! zSign;
5301     }
5302     return
5303         normalizeRoundAndPackFloatx80(
5304             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5305
5306 }
5307
5308 /*----------------------------------------------------------------------------
5309 | Returns the square root of the extended double-precision floating-point
5310 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5311 | for Binary Floating-Point Arithmetic.
5312 *----------------------------------------------------------------------------*/
5313
5314 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5315 {
5316     flag aSign;
5317     int32 aExp, zExp;
5318     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5319     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5320     floatx80 z;
5321
5322     aSig0 = extractFloatx80Frac( a );
5323     aExp = extractFloatx80Exp( a );
5324     aSign = extractFloatx80Sign( a );
5325     if ( aExp == 0x7FFF ) {
5326         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5327         if ( ! aSign ) return a;
5328         goto invalid;
5329     }
5330     if ( aSign ) {
5331         if ( ( aExp | aSig0 ) == 0 ) return a;
5332  invalid:
5333         float_raise( float_flag_invalid STATUS_VAR);
5334         z.low = floatx80_default_nan_low;
5335         z.high = floatx80_default_nan_high;
5336         return z;
5337     }
5338     if ( aExp == 0 ) {
5339         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5340         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5341     }
5342     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5343     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5344     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5345     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5346     doubleZSig0 = zSig0<<1;
5347     mul64To128( zSig0, zSig0, &term0, &term1 );
5348     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5349     while ( (int64_t) rem0 < 0 ) {
5350         --zSig0;
5351         doubleZSig0 -= 2;
5352         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5353     }
5354     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5355     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5356         if ( zSig1 == 0 ) zSig1 = 1;
5357         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5358         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5359         mul64To128( zSig1, zSig1, &term2, &term3 );
5360         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5361         while ( (int64_t) rem1 < 0 ) {
5362             --zSig1;
5363             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5364             term3 |= 1;
5365             term2 |= doubleZSig0;
5366             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5367         }
5368         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5369     }
5370     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5371     zSig0 |= doubleZSig0;
5372     return
5373         roundAndPackFloatx80(
5374             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5375
5376 }
5377
5378 /*----------------------------------------------------------------------------
5379 | Returns 1 if the extended double-precision floating-point value `a' is equal
5380 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5381 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5382 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5383 *----------------------------------------------------------------------------*/
5384
5385 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5386 {
5387
5388     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5389               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5390          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5391               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5392        ) {
5393         float_raise( float_flag_invalid STATUS_VAR);
5394         return 0;
5395     }
5396     return
5397            ( a.low == b.low )
5398         && (    ( a.high == b.high )
5399              || (    ( a.low == 0 )
5400                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5401            );
5402
5403 }
5404
5405 /*----------------------------------------------------------------------------
5406 | Returns 1 if the extended double-precision floating-point value `a' is
5407 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5408 | invalid exception is raised if either operand is a NaN.  The comparison is
5409 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5410 | Arithmetic.
5411 *----------------------------------------------------------------------------*/
5412
5413 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5414 {
5415     flag aSign, bSign;
5416
5417     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5418               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5419          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5420               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5421        ) {
5422         float_raise( float_flag_invalid STATUS_VAR);
5423         return 0;
5424     }
5425     aSign = extractFloatx80Sign( a );
5426     bSign = extractFloatx80Sign( b );
5427     if ( aSign != bSign ) {
5428         return
5429                aSign
5430             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5431                  == 0 );
5432     }
5433     return
5434           aSign ? le128( b.high, b.low, a.high, a.low )
5435         : le128( a.high, a.low, b.high, b.low );
5436
5437 }
5438
5439 /*----------------------------------------------------------------------------
5440 | Returns 1 if the extended double-precision floating-point value `a' is
5441 | less than the corresponding value `b', and 0 otherwise.  The invalid
5442 | exception is raised if either operand is a NaN.  The comparison is performed
5443 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5444 *----------------------------------------------------------------------------*/
5445
5446 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5447 {
5448     flag aSign, bSign;
5449
5450     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5451               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5452          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5453               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5454        ) {
5455         float_raise( float_flag_invalid STATUS_VAR);
5456         return 0;
5457     }
5458     aSign = extractFloatx80Sign( a );
5459     bSign = extractFloatx80Sign( b );
5460     if ( aSign != bSign ) {
5461         return
5462                aSign
5463             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5464                  != 0 );
5465     }
5466     return
5467           aSign ? lt128( b.high, b.low, a.high, a.low )
5468         : lt128( a.high, a.low, b.high, b.low );
5469
5470 }
5471
5472 /*----------------------------------------------------------------------------
5473 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5474 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5475 | either operand is a NaN.   The comparison is performed according to the
5476 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5477 *----------------------------------------------------------------------------*/
5478 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5479 {
5480     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5481               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5482          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5483               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5484        ) {
5485         float_raise( float_flag_invalid STATUS_VAR);
5486         return 1;
5487     }
5488     return 0;
5489 }
5490
5491 /*----------------------------------------------------------------------------
5492 | Returns 1 if the extended double-precision floating-point value `a' is
5493 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5494 | cause an exception.  The comparison is performed according to the IEC/IEEE
5495 | Standard for Binary Floating-Point Arithmetic.
5496 *----------------------------------------------------------------------------*/
5497
5498 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5499 {
5500
5501     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5502               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5503          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5504               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5505        ) {
5506         if (    floatx80_is_signaling_nan( a )
5507              || floatx80_is_signaling_nan( b ) ) {
5508             float_raise( float_flag_invalid STATUS_VAR);
5509         }
5510         return 0;
5511     }
5512     return
5513            ( a.low == b.low )
5514         && (    ( a.high == b.high )
5515              || (    ( a.low == 0 )
5516                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5517            );
5518
5519 }
5520
5521 /*----------------------------------------------------------------------------
5522 | Returns 1 if the extended double-precision floating-point value `a' is less
5523 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5524 | do not cause an exception.  Otherwise, the comparison is performed according
5525 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5526 *----------------------------------------------------------------------------*/
5527
5528 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5529 {
5530     flag aSign, bSign;
5531
5532     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5533               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5534          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5535               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5536        ) {
5537         if (    floatx80_is_signaling_nan( a )
5538              || floatx80_is_signaling_nan( b ) ) {
5539             float_raise( float_flag_invalid STATUS_VAR);
5540         }
5541         return 0;
5542     }
5543     aSign = extractFloatx80Sign( a );
5544     bSign = extractFloatx80Sign( b );
5545     if ( aSign != bSign ) {
5546         return
5547                aSign
5548             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5549                  == 0 );
5550     }
5551     return
5552           aSign ? le128( b.high, b.low, a.high, a.low )
5553         : le128( a.high, a.low, b.high, b.low );
5554
5555 }
5556
5557 /*----------------------------------------------------------------------------
5558 | Returns 1 if the extended double-precision floating-point value `a' is less
5559 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5560 | an exception.  Otherwise, the comparison is performed according to the
5561 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5562 *----------------------------------------------------------------------------*/
5563
5564 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5565 {
5566     flag aSign, bSign;
5567
5568     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5569               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5570          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5571               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5572        ) {
5573         if (    floatx80_is_signaling_nan( a )
5574              || floatx80_is_signaling_nan( b ) ) {
5575             float_raise( float_flag_invalid STATUS_VAR);
5576         }
5577         return 0;
5578     }
5579     aSign = extractFloatx80Sign( a );
5580     bSign = extractFloatx80Sign( b );
5581     if ( aSign != bSign ) {
5582         return
5583                aSign
5584             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5585                  != 0 );
5586     }
5587     return
5588           aSign ? lt128( b.high, b.low, a.high, a.low )
5589         : lt128( a.high, a.low, b.high, b.low );
5590
5591 }
5592
5593 /*----------------------------------------------------------------------------
5594 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5595 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5596 | The comparison is performed according to the IEC/IEEE Standard for Binary
5597 | Floating-Point Arithmetic.
5598 *----------------------------------------------------------------------------*/
5599 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5600 {
5601     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5602               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5603          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5604               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5605        ) {
5606         if (    floatx80_is_signaling_nan( a )
5607              || floatx80_is_signaling_nan( b ) ) {
5608             float_raise( float_flag_invalid STATUS_VAR);
5609         }
5610         return 1;
5611     }
5612     return 0;
5613 }
5614
5615 /*----------------------------------------------------------------------------
5616 | Returns the result of converting the quadruple-precision floating-point
5617 | value `a' to the 32-bit two's complement integer format.  The conversion
5618 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5619 | Arithmetic---which means in particular that the conversion is rounded
5620 | according to the current rounding mode.  If `a' is a NaN, the largest
5621 | positive integer is returned.  Otherwise, if the conversion overflows, the
5622 | largest integer with the same sign as `a' is returned.
5623 *----------------------------------------------------------------------------*/
5624
5625 int32 float128_to_int32( float128 a STATUS_PARAM )
5626 {
5627     flag aSign;
5628     int32 aExp, shiftCount;
5629     uint64_t aSig0, aSig1;
5630
5631     aSig1 = extractFloat128Frac1( a );
5632     aSig0 = extractFloat128Frac0( a );
5633     aExp = extractFloat128Exp( a );
5634     aSign = extractFloat128Sign( a );
5635     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5636     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5637     aSig0 |= ( aSig1 != 0 );
5638     shiftCount = 0x4028 - aExp;
5639     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5640     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5641
5642 }
5643
5644 /*----------------------------------------------------------------------------
5645 | Returns the result of converting the quadruple-precision floating-point
5646 | value `a' to the 32-bit two's complement integer format.  The conversion
5647 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5648 | Arithmetic, except that the conversion is always rounded toward zero.  If
5649 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5650 | conversion overflows, the largest integer with the same sign as `a' is
5651 | returned.
5652 *----------------------------------------------------------------------------*/
5653
5654 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5655 {
5656     flag aSign;
5657     int32 aExp, shiftCount;
5658     uint64_t aSig0, aSig1, savedASig;
5659     int32_t z;
5660
5661     aSig1 = extractFloat128Frac1( a );
5662     aSig0 = extractFloat128Frac0( a );
5663     aExp = extractFloat128Exp( a );
5664     aSign = extractFloat128Sign( a );
5665     aSig0 |= ( aSig1 != 0 );
5666     if ( 0x401E < aExp ) {
5667         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5668         goto invalid;
5669     }
5670     else if ( aExp < 0x3FFF ) {
5671         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5672         return 0;
5673     }
5674     aSig0 |= LIT64( 0x0001000000000000 );
5675     shiftCount = 0x402F - aExp;
5676     savedASig = aSig0;
5677     aSig0 >>= shiftCount;
5678     z = aSig0;
5679     if ( aSign ) z = - z;
5680     if ( ( z < 0 ) ^ aSign ) {
5681  invalid:
5682         float_raise( float_flag_invalid STATUS_VAR);
5683         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5684     }
5685     if ( ( aSig0<<shiftCount ) != savedASig ) {
5686         STATUS(float_exception_flags) |= float_flag_inexact;
5687     }
5688     return z;
5689
5690 }
5691
5692 /*----------------------------------------------------------------------------
5693 | Returns the result of converting the quadruple-precision floating-point
5694 | value `a' to the 64-bit two's complement integer format.  The conversion
5695 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5696 | Arithmetic---which means in particular that the conversion is rounded
5697 | according to the current rounding mode.  If `a' is a NaN, the largest
5698 | positive integer is returned.  Otherwise, if the conversion overflows, the
5699 | largest integer with the same sign as `a' is returned.
5700 *----------------------------------------------------------------------------*/
5701
5702 int64 float128_to_int64( float128 a STATUS_PARAM )
5703 {
5704     flag aSign;
5705     int32 aExp, shiftCount;
5706     uint64_t aSig0, aSig1;
5707
5708     aSig1 = extractFloat128Frac1( a );
5709     aSig0 = extractFloat128Frac0( a );
5710     aExp = extractFloat128Exp( a );
5711     aSign = extractFloat128Sign( a );
5712     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5713     shiftCount = 0x402F - aExp;
5714     if ( shiftCount <= 0 ) {
5715         if ( 0x403E < aExp ) {
5716             float_raise( float_flag_invalid STATUS_VAR);
5717             if (    ! aSign
5718                  || (    ( aExp == 0x7FFF )
5719                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5720                     )
5721                ) {
5722                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5723             }
5724             return (int64_t) LIT64( 0x8000000000000000 );
5725         }
5726         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5727     }
5728     else {
5729         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5730     }
5731     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5732
5733 }
5734
5735 /*----------------------------------------------------------------------------
5736 | Returns the result of converting the quadruple-precision floating-point
5737 | value `a' to the 64-bit two's complement integer format.  The conversion
5738 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5739 | Arithmetic, except that the conversion is always rounded toward zero.
5740 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5741 | the conversion overflows, the largest integer with the same sign as `a' is
5742 | returned.
5743 *----------------------------------------------------------------------------*/
5744
5745 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5746 {
5747     flag aSign;
5748     int32 aExp, shiftCount;
5749     uint64_t aSig0, aSig1;
5750     int64 z;
5751
5752     aSig1 = extractFloat128Frac1( a );
5753     aSig0 = extractFloat128Frac0( a );
5754     aExp = extractFloat128Exp( a );
5755     aSign = extractFloat128Sign( a );
5756     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5757     shiftCount = aExp - 0x402F;
5758     if ( 0 < shiftCount ) {
5759         if ( 0x403E <= aExp ) {
5760             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5761             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5762                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5763                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5764             }
5765             else {
5766                 float_raise( float_flag_invalid STATUS_VAR);
5767                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5768                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5769                 }
5770             }
5771             return (int64_t) LIT64( 0x8000000000000000 );
5772         }
5773         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5774         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5775             STATUS(float_exception_flags) |= float_flag_inexact;
5776         }
5777     }
5778     else {
5779         if ( aExp < 0x3FFF ) {
5780             if ( aExp | aSig0 | aSig1 ) {
5781                 STATUS(float_exception_flags) |= float_flag_inexact;
5782             }
5783             return 0;
5784         }
5785         z = aSig0>>( - shiftCount );
5786         if (    aSig1
5787              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5788             STATUS(float_exception_flags) |= float_flag_inexact;
5789         }
5790     }
5791     if ( aSign ) z = - z;
5792     return z;
5793
5794 }
5795
5796 /*----------------------------------------------------------------------------
5797 | Returns the result of converting the quadruple-precision floating-point
5798 | value `a' to the single-precision floating-point format.  The conversion
5799 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5800 | Arithmetic.
5801 *----------------------------------------------------------------------------*/
5802
5803 float32 float128_to_float32( float128 a STATUS_PARAM )
5804 {
5805     flag aSign;
5806     int32 aExp;
5807     uint64_t aSig0, aSig1;
5808     uint32_t zSig;
5809
5810     aSig1 = extractFloat128Frac1( a );
5811     aSig0 = extractFloat128Frac0( a );
5812     aExp = extractFloat128Exp( a );
5813     aSign = extractFloat128Sign( a );
5814     if ( aExp == 0x7FFF ) {
5815         if ( aSig0 | aSig1 ) {
5816             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5817         }
5818         return packFloat32( aSign, 0xFF, 0 );
5819     }
5820     aSig0 |= ( aSig1 != 0 );
5821     shift64RightJamming( aSig0, 18, &aSig0 );
5822     zSig = aSig0;
5823     if ( aExp || zSig ) {
5824         zSig |= 0x40000000;
5825         aExp -= 0x3F81;
5826     }
5827     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5828
5829 }
5830
5831 /*----------------------------------------------------------------------------
5832 | Returns the result of converting the quadruple-precision floating-point
5833 | value `a' to the double-precision floating-point format.  The conversion
5834 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5835 | Arithmetic.
5836 *----------------------------------------------------------------------------*/
5837
5838 float64 float128_to_float64( float128 a STATUS_PARAM )
5839 {
5840     flag aSign;
5841     int32 aExp;
5842     uint64_t aSig0, aSig1;
5843
5844     aSig1 = extractFloat128Frac1( a );
5845     aSig0 = extractFloat128Frac0( a );
5846     aExp = extractFloat128Exp( a );
5847     aSign = extractFloat128Sign( a );
5848     if ( aExp == 0x7FFF ) {
5849         if ( aSig0 | aSig1 ) {
5850             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5851         }
5852         return packFloat64( aSign, 0x7FF, 0 );
5853     }
5854     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5855     aSig0 |= ( aSig1 != 0 );
5856     if ( aExp || aSig0 ) {
5857         aSig0 |= LIT64( 0x4000000000000000 );
5858         aExp -= 0x3C01;
5859     }
5860     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5861
5862 }
5863
5864 /*----------------------------------------------------------------------------
5865 | Returns the result of converting the quadruple-precision floating-point
5866 | value `a' to the extended double-precision floating-point format.  The
5867 | conversion is performed according to the IEC/IEEE Standard for Binary
5868 | Floating-Point Arithmetic.
5869 *----------------------------------------------------------------------------*/
5870
5871 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5872 {
5873     flag aSign;
5874     int32 aExp;
5875     uint64_t aSig0, aSig1;
5876
5877     aSig1 = extractFloat128Frac1( a );
5878     aSig0 = extractFloat128Frac0( a );
5879     aExp = extractFloat128Exp( a );
5880     aSign = extractFloat128Sign( a );
5881     if ( aExp == 0x7FFF ) {
5882         if ( aSig0 | aSig1 ) {
5883             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5884         }
5885         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5886     }
5887     if ( aExp == 0 ) {
5888         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5889         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5890     }
5891     else {
5892         aSig0 |= LIT64( 0x0001000000000000 );
5893     }
5894     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5895     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5896
5897 }
5898
5899 /*----------------------------------------------------------------------------
5900 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5901 | returns the result as a quadruple-precision floating-point value.  The
5902 | operation is performed according to the IEC/IEEE Standard for Binary
5903 | Floating-Point Arithmetic.
5904 *----------------------------------------------------------------------------*/
5905
5906 float128 float128_round_to_int( float128 a STATUS_PARAM )
5907 {
5908     flag aSign;
5909     int32 aExp;
5910     uint64_t lastBitMask, roundBitsMask;
5911     float128 z;
5912
5913     aExp = extractFloat128Exp( a );
5914     if ( 0x402F <= aExp ) {
5915         if ( 0x406F <= aExp ) {
5916             if (    ( aExp == 0x7FFF )
5917                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5918                ) {
5919                 return propagateFloat128NaN( a, a STATUS_VAR );
5920             }
5921             return a;
5922         }
5923         lastBitMask = 1;
5924         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5925         roundBitsMask = lastBitMask - 1;
5926         z = a;
5927         switch (STATUS(float_rounding_mode)) {
5928         case float_round_nearest_even:
5929             if ( lastBitMask ) {
5930                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5931                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5932             }
5933             else {
5934                 if ( (int64_t) z.low < 0 ) {
5935                     ++z.high;
5936                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5937                 }
5938             }
5939             break;
5940         case float_round_ties_away:
5941             if (lastBitMask) {
5942                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5943             } else {
5944                 if ((int64_t) z.low < 0) {
5945                     ++z.high;
5946                 }
5947             }
5948             break;
5949         case float_round_to_zero:
5950             break;
5951         case float_round_up:
5952             if (!extractFloat128Sign(z)) {
5953                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5954             }
5955             break;
5956         case float_round_down:
5957             if (extractFloat128Sign(z)) {
5958                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5959             }
5960             break;
5961         default:
5962             abort();
5963         }
5964         z.low &= ~ roundBitsMask;
5965     }
5966     else {
5967         if ( aExp < 0x3FFF ) {
5968             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5969             STATUS(float_exception_flags) |= float_flag_inexact;
5970             aSign = extractFloat128Sign( a );
5971             switch ( STATUS(float_rounding_mode) ) {
5972              case float_round_nearest_even:
5973                 if (    ( aExp == 0x3FFE )
5974                      && (   extractFloat128Frac0( a )
5975                           | extractFloat128Frac1( a ) )
5976                    ) {
5977                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5978                 }
5979                 break;
5980             case float_round_ties_away:
5981                 if (aExp == 0x3FFE) {
5982                     return packFloat128(aSign, 0x3FFF, 0, 0);
5983                 }
5984                 break;
5985              case float_round_down:
5986                 return
5987                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5988                     : packFloat128( 0, 0, 0, 0 );
5989              case float_round_up:
5990                 return
5991                       aSign ? packFloat128( 1, 0, 0, 0 )
5992                     : packFloat128( 0, 0x3FFF, 0, 0 );
5993             }
5994             return packFloat128( aSign, 0, 0, 0 );
5995         }
5996         lastBitMask = 1;
5997         lastBitMask <<= 0x402F - aExp;
5998         roundBitsMask = lastBitMask - 1;
5999         z.low = 0;
6000         z.high = a.high;
6001         switch (STATUS(float_rounding_mode)) {
6002         case float_round_nearest_even:
6003             z.high += lastBitMask>>1;
6004             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6005                 z.high &= ~ lastBitMask;
6006             }
6007             break;
6008         case float_round_ties_away:
6009             z.high += lastBitMask>>1;
6010             break;
6011         case float_round_to_zero:
6012             break;
6013         case float_round_up:
6014             if (!extractFloat128Sign(z)) {
6015                 z.high |= ( a.low != 0 );
6016                 z.high += roundBitsMask;
6017             }
6018             break;
6019         case float_round_down:
6020             if (extractFloat128Sign(z)) {
6021                 z.high |= (a.low != 0);
6022                 z.high += roundBitsMask;
6023             }
6024             break;
6025         default:
6026             abort();
6027         }
6028         z.high &= ~ roundBitsMask;
6029     }
6030     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6031         STATUS(float_exception_flags) |= float_flag_inexact;
6032     }
6033     return z;
6034
6035 }
6036
6037 /*----------------------------------------------------------------------------
6038 | Returns the result of adding the absolute values of the quadruple-precision
6039 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6040 | before being returned.  `zSign' is ignored if the result is a NaN.
6041 | The addition is performed according to the IEC/IEEE Standard for Binary
6042 | Floating-Point Arithmetic.
6043 *----------------------------------------------------------------------------*/
6044
6045 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6046 {
6047     int32 aExp, bExp, zExp;
6048     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6049     int32 expDiff;
6050
6051     aSig1 = extractFloat128Frac1( a );
6052     aSig0 = extractFloat128Frac0( a );
6053     aExp = extractFloat128Exp( a );
6054     bSig1 = extractFloat128Frac1( b );
6055     bSig0 = extractFloat128Frac0( b );
6056     bExp = extractFloat128Exp( b );
6057     expDiff = aExp - bExp;
6058     if ( 0 < expDiff ) {
6059         if ( aExp == 0x7FFF ) {
6060             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6061             return a;
6062         }
6063         if ( bExp == 0 ) {
6064             --expDiff;
6065         }
6066         else {
6067             bSig0 |= LIT64( 0x0001000000000000 );
6068         }
6069         shift128ExtraRightJamming(
6070             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6071         zExp = aExp;
6072     }
6073     else if ( expDiff < 0 ) {
6074         if ( bExp == 0x7FFF ) {
6075             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6076             return packFloat128( zSign, 0x7FFF, 0, 0 );
6077         }
6078         if ( aExp == 0 ) {
6079             ++expDiff;
6080         }
6081         else {
6082             aSig0 |= LIT64( 0x0001000000000000 );
6083         }
6084         shift128ExtraRightJamming(
6085             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6086         zExp = bExp;
6087     }
6088     else {
6089         if ( aExp == 0x7FFF ) {
6090             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6091                 return propagateFloat128NaN( a, b STATUS_VAR );
6092             }
6093             return a;
6094         }
6095         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6096         if ( aExp == 0 ) {
6097             if (STATUS(flush_to_zero)) {
6098                 if (zSig0 | zSig1) {
6099                     float_raise(float_flag_output_denormal STATUS_VAR);
6100                 }
6101                 return packFloat128(zSign, 0, 0, 0);
6102             }
6103             return packFloat128( zSign, 0, zSig0, zSig1 );
6104         }
6105         zSig2 = 0;
6106         zSig0 |= LIT64( 0x0002000000000000 );
6107         zExp = aExp;
6108         goto shiftRight1;
6109     }
6110     aSig0 |= LIT64( 0x0001000000000000 );
6111     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6112     --zExp;
6113     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6114     ++zExp;
6115  shiftRight1:
6116     shift128ExtraRightJamming(
6117         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6118  roundAndPack:
6119     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6120
6121 }
6122
6123 /*----------------------------------------------------------------------------
6124 | Returns the result of subtracting the absolute values of the quadruple-
6125 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6126 | difference is negated before being returned.  `zSign' is ignored if the
6127 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6128 | Standard for Binary Floating-Point Arithmetic.
6129 *----------------------------------------------------------------------------*/
6130
6131 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6132 {
6133     int32 aExp, bExp, zExp;
6134     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6135     int32 expDiff;
6136     float128 z;
6137
6138     aSig1 = extractFloat128Frac1( a );
6139     aSig0 = extractFloat128Frac0( a );
6140     aExp = extractFloat128Exp( a );
6141     bSig1 = extractFloat128Frac1( b );
6142     bSig0 = extractFloat128Frac0( b );
6143     bExp = extractFloat128Exp( b );
6144     expDiff = aExp - bExp;
6145     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6146     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6147     if ( 0 < expDiff ) goto aExpBigger;
6148     if ( expDiff < 0 ) goto bExpBigger;
6149     if ( aExp == 0x7FFF ) {
6150         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6151             return propagateFloat128NaN( a, b STATUS_VAR );
6152         }
6153         float_raise( float_flag_invalid STATUS_VAR);
6154         z.low = float128_default_nan_low;
6155         z.high = float128_default_nan_high;
6156         return z;
6157     }
6158     if ( aExp == 0 ) {
6159         aExp = 1;
6160         bExp = 1;
6161     }
6162     if ( bSig0 < aSig0 ) goto aBigger;
6163     if ( aSig0 < bSig0 ) goto bBigger;
6164     if ( bSig1 < aSig1 ) goto aBigger;
6165     if ( aSig1 < bSig1 ) goto bBigger;
6166     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
6167  bExpBigger:
6168     if ( bExp == 0x7FFF ) {
6169         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6170         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6171     }
6172     if ( aExp == 0 ) {
6173         ++expDiff;
6174     }
6175     else {
6176         aSig0 |= LIT64( 0x4000000000000000 );
6177     }
6178     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6179     bSig0 |= LIT64( 0x4000000000000000 );
6180  bBigger:
6181     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6182     zExp = bExp;
6183     zSign ^= 1;
6184     goto normalizeRoundAndPack;
6185  aExpBigger:
6186     if ( aExp == 0x7FFF ) {
6187         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6188         return a;
6189     }
6190     if ( bExp == 0 ) {
6191         --expDiff;
6192     }
6193     else {
6194         bSig0 |= LIT64( 0x4000000000000000 );
6195     }
6196     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6197     aSig0 |= LIT64( 0x4000000000000000 );
6198  aBigger:
6199     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6200     zExp = aExp;
6201  normalizeRoundAndPack:
6202     --zExp;
6203     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
6204
6205 }
6206
6207 /*----------------------------------------------------------------------------
6208 | Returns the result of adding the quadruple-precision floating-point values
6209 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6210 | for Binary Floating-Point Arithmetic.
6211 *----------------------------------------------------------------------------*/
6212
6213 float128 float128_add( float128 a, float128 b STATUS_PARAM )
6214 {
6215     flag aSign, bSign;
6216
6217     aSign = extractFloat128Sign( a );
6218     bSign = extractFloat128Sign( b );
6219     if ( aSign == bSign ) {
6220         return addFloat128Sigs( a, b, aSign STATUS_VAR );
6221     }
6222     else {
6223         return subFloat128Sigs( a, b, aSign STATUS_VAR );
6224     }
6225
6226 }
6227
6228 /*----------------------------------------------------------------------------
6229 | Returns the result of subtracting the quadruple-precision floating-point
6230 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6231 | Standard for Binary Floating-Point Arithmetic.
6232 *----------------------------------------------------------------------------*/
6233
6234 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
6235 {
6236     flag aSign, bSign;
6237
6238     aSign = extractFloat128Sign( a );
6239     bSign = extractFloat128Sign( b );
6240     if ( aSign == bSign ) {
6241         return subFloat128Sigs( a, b, aSign STATUS_VAR );
6242     }
6243     else {
6244         return addFloat128Sigs( a, b, aSign STATUS_VAR );
6245     }
6246
6247 }
6248
6249 /*----------------------------------------------------------------------------
6250 | Returns the result of multiplying the quadruple-precision floating-point
6251 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6252 | Standard for Binary Floating-Point Arithmetic.
6253 *----------------------------------------------------------------------------*/
6254
6255 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
6256 {
6257     flag aSign, bSign, zSign;
6258     int32 aExp, bExp, zExp;
6259     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6260     float128 z;
6261
6262     aSig1 = extractFloat128Frac1( a );
6263     aSig0 = extractFloat128Frac0( a );
6264     aExp = extractFloat128Exp( a );
6265     aSign = extractFloat128Sign( a );
6266     bSig1 = extractFloat128Frac1( b );
6267     bSig0 = extractFloat128Frac0( b );
6268     bExp = extractFloat128Exp( b );
6269     bSign = extractFloat128Sign( b );
6270     zSign = aSign ^ bSign;
6271     if ( aExp == 0x7FFF ) {
6272         if (    ( aSig0 | aSig1 )
6273              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6274             return propagateFloat128NaN( a, b STATUS_VAR );
6275         }
6276         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6277         return packFloat128( zSign, 0x7FFF, 0, 0 );
6278     }
6279     if ( bExp == 0x7FFF ) {
6280         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6281         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6282  invalid:
6283             float_raise( float_flag_invalid STATUS_VAR);
6284             z.low = float128_default_nan_low;
6285             z.high = float128_default_nan_high;
6286             return z;
6287         }
6288         return packFloat128( zSign, 0x7FFF, 0, 0 );
6289     }
6290     if ( aExp == 0 ) {
6291         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6292         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6293     }
6294     if ( bExp == 0 ) {
6295         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6296         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6297     }
6298     zExp = aExp + bExp - 0x4000;
6299     aSig0 |= LIT64( 0x0001000000000000 );
6300     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6301     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6302     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6303     zSig2 |= ( zSig3 != 0 );
6304     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6305         shift128ExtraRightJamming(
6306             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6307         ++zExp;
6308     }
6309     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6310
6311 }
6312
6313 /*----------------------------------------------------------------------------
6314 | Returns the result of dividing the quadruple-precision floating-point value
6315 | `a' by the corresponding value `b'.  The operation is performed according to
6316 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6317 *----------------------------------------------------------------------------*/
6318
6319 float128 float128_div( float128 a, float128 b STATUS_PARAM )
6320 {
6321     flag aSign, bSign, zSign;
6322     int32 aExp, bExp, zExp;
6323     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6324     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6325     float128 z;
6326
6327     aSig1 = extractFloat128Frac1( a );
6328     aSig0 = extractFloat128Frac0( a );
6329     aExp = extractFloat128Exp( a );
6330     aSign = extractFloat128Sign( a );
6331     bSig1 = extractFloat128Frac1( b );
6332     bSig0 = extractFloat128Frac0( b );
6333     bExp = extractFloat128Exp( b );
6334     bSign = extractFloat128Sign( b );
6335     zSign = aSign ^ bSign;
6336     if ( aExp == 0x7FFF ) {
6337         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6338         if ( bExp == 0x7FFF ) {
6339             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6340             goto invalid;
6341         }
6342         return packFloat128( zSign, 0x7FFF, 0, 0 );
6343     }
6344     if ( bExp == 0x7FFF ) {
6345         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6346         return packFloat128( zSign, 0, 0, 0 );
6347     }
6348     if ( bExp == 0 ) {
6349         if ( ( bSig0 | bSig1 ) == 0 ) {
6350             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6351  invalid:
6352                 float_raise( float_flag_invalid STATUS_VAR);
6353                 z.low = float128_default_nan_low;
6354                 z.high = float128_default_nan_high;
6355                 return z;
6356             }
6357             float_raise( float_flag_divbyzero STATUS_VAR);
6358             return packFloat128( zSign, 0x7FFF, 0, 0 );
6359         }
6360         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6361     }
6362     if ( aExp == 0 ) {
6363         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6364         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6365     }
6366     zExp = aExp - bExp + 0x3FFD;
6367     shortShift128Left(
6368         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6369     shortShift128Left(
6370         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6371     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6372         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6373         ++zExp;
6374     }
6375     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6376     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6377     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6378     while ( (int64_t) rem0 < 0 ) {
6379         --zSig0;
6380         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6381     }
6382     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6383     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6384         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6385         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6386         while ( (int64_t) rem1 < 0 ) {
6387             --zSig1;
6388             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6389         }
6390         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6391     }
6392     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6393     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6394
6395 }
6396
6397 /*----------------------------------------------------------------------------
6398 | Returns the remainder of the quadruple-precision floating-point value `a'
6399 | with respect to the corresponding value `b'.  The operation is performed
6400 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6401 *----------------------------------------------------------------------------*/
6402
6403 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6404 {
6405     flag aSign, zSign;
6406     int32 aExp, bExp, expDiff;
6407     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6408     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6409     int64_t sigMean0;
6410     float128 z;
6411
6412     aSig1 = extractFloat128Frac1( a );
6413     aSig0 = extractFloat128Frac0( a );
6414     aExp = extractFloat128Exp( a );
6415     aSign = extractFloat128Sign( a );
6416     bSig1 = extractFloat128Frac1( b );
6417     bSig0 = extractFloat128Frac0( b );
6418     bExp = extractFloat128Exp( b );
6419     if ( aExp == 0x7FFF ) {
6420         if (    ( aSig0 | aSig1 )
6421              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6422             return propagateFloat128NaN( a, b STATUS_VAR );
6423         }
6424         goto invalid;
6425     }
6426     if ( bExp == 0x7FFF ) {
6427         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6428         return a;
6429     }
6430     if ( bExp == 0 ) {
6431         if ( ( bSig0 | bSig1 ) == 0 ) {
6432  invalid:
6433             float_raise( float_flag_invalid STATUS_VAR);
6434             z.low = float128_default_nan_low;
6435             z.high = float128_default_nan_high;
6436             return z;
6437         }
6438         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6439     }
6440     if ( aExp == 0 ) {
6441         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6442         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6443     }
6444     expDiff = aExp - bExp;
6445     if ( expDiff < -1 ) return a;
6446     shortShift128Left(
6447         aSig0 | LIT64( 0x0001000000000000 ),
6448         aSig1,
6449         15 - ( expDiff < 0 ),
6450         &aSig0,
6451         &aSig1
6452     );
6453     shortShift128Left(
6454         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6455     q = le128( bSig0, bSig1, aSig0, aSig1 );
6456     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6457     expDiff -= 64;
6458     while ( 0 < expDiff ) {
6459         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6460         q = ( 4 < q ) ? q - 4 : 0;
6461         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6462         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6463         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6464         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6465         expDiff -= 61;
6466     }
6467     if ( -64 < expDiff ) {
6468         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6469         q = ( 4 < q ) ? q - 4 : 0;
6470         q >>= - expDiff;
6471         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6472         expDiff += 52;
6473         if ( expDiff < 0 ) {
6474             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6475         }
6476         else {
6477             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6478         }
6479         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6480         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6481     }
6482     else {
6483         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6484         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6485     }
6486     do {
6487         alternateASig0 = aSig0;
6488         alternateASig1 = aSig1;
6489         ++q;
6490         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6491     } while ( 0 <= (int64_t) aSig0 );
6492     add128(
6493         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6494     if (    ( sigMean0 < 0 )
6495          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6496         aSig0 = alternateASig0;
6497         aSig1 = alternateASig1;
6498     }
6499     zSign = ( (int64_t) aSig0 < 0 );
6500     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6501     return
6502         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6503
6504 }
6505
6506 /*----------------------------------------------------------------------------
6507 | Returns the square root of the quadruple-precision floating-point value `a'.
6508 | The operation is performed according to the IEC/IEEE Standard for Binary
6509 | Floating-Point Arithmetic.
6510 *----------------------------------------------------------------------------*/
6511
6512 float128 float128_sqrt( float128 a STATUS_PARAM )
6513 {
6514     flag aSign;
6515     int32 aExp, zExp;
6516     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6517     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6518     float128 z;
6519
6520     aSig1 = extractFloat128Frac1( a );
6521     aSig0 = extractFloat128Frac0( a );
6522     aExp = extractFloat128Exp( a );
6523     aSign = extractFloat128Sign( a );
6524     if ( aExp == 0x7FFF ) {
6525         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6526         if ( ! aSign ) return a;
6527         goto invalid;
6528     }
6529     if ( aSign ) {
6530         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6531  invalid:
6532         float_raise( float_flag_invalid STATUS_VAR);
6533         z.low = float128_default_nan_low;
6534         z.high = float128_default_nan_high;
6535         return z;
6536     }
6537     if ( aExp == 0 ) {
6538         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6539         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6540     }
6541     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6542     aSig0 |= LIT64( 0x0001000000000000 );
6543     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6544     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6545     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6546     doubleZSig0 = zSig0<<1;
6547     mul64To128( zSig0, zSig0, &term0, &term1 );
6548     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6549     while ( (int64_t) rem0 < 0 ) {
6550         --zSig0;
6551         doubleZSig0 -= 2;
6552         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6553     }
6554     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6555     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6556         if ( zSig1 == 0 ) zSig1 = 1;
6557         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6558         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6559         mul64To128( zSig1, zSig1, &term2, &term3 );
6560         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6561         while ( (int64_t) rem1 < 0 ) {
6562             --zSig1;
6563             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6564             term3 |= 1;
6565             term2 |= doubleZSig0;
6566             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6567         }
6568         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6569     }
6570     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6571     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6572
6573 }
6574
6575 /*----------------------------------------------------------------------------
6576 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6577 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6578 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6579 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6580 *----------------------------------------------------------------------------*/
6581
6582 int float128_eq( float128 a, float128 b STATUS_PARAM )
6583 {
6584
6585     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6586               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6587          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6588               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6589        ) {
6590         float_raise( float_flag_invalid STATUS_VAR);
6591         return 0;
6592     }
6593     return
6594            ( a.low == b.low )
6595         && (    ( a.high == b.high )
6596              || (    ( a.low == 0 )
6597                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6598            );
6599
6600 }
6601
6602 /*----------------------------------------------------------------------------
6603 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6604 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6605 | exception is raised if either operand is a NaN.  The comparison is performed
6606 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6607 *----------------------------------------------------------------------------*/
6608
6609 int float128_le( float128 a, float128 b STATUS_PARAM )
6610 {
6611     flag aSign, bSign;
6612
6613     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6614               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6615          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6616               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6617        ) {
6618         float_raise( float_flag_invalid STATUS_VAR);
6619         return 0;
6620     }
6621     aSign = extractFloat128Sign( a );
6622     bSign = extractFloat128Sign( b );
6623     if ( aSign != bSign ) {
6624         return
6625                aSign
6626             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6627                  == 0 );
6628     }
6629     return
6630           aSign ? le128( b.high, b.low, a.high, a.low )
6631         : le128( a.high, a.low, b.high, b.low );
6632
6633 }
6634
6635 /*----------------------------------------------------------------------------
6636 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6637 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6638 | raised if either operand is a NaN.  The comparison is performed according
6639 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6640 *----------------------------------------------------------------------------*/
6641
6642 int float128_lt( float128 a, float128 b STATUS_PARAM )
6643 {
6644     flag aSign, bSign;
6645
6646     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6647               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6648          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6649               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6650        ) {
6651         float_raise( float_flag_invalid STATUS_VAR);
6652         return 0;
6653     }
6654     aSign = extractFloat128Sign( a );
6655     bSign = extractFloat128Sign( b );
6656     if ( aSign != bSign ) {
6657         return
6658                aSign
6659             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6660                  != 0 );
6661     }
6662     return
6663           aSign ? lt128( b.high, b.low, a.high, a.low )
6664         : lt128( a.high, a.low, b.high, b.low );
6665
6666 }
6667
6668 /*----------------------------------------------------------------------------
6669 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6670 | be compared, and 0 otherwise.  The invalid exception is raised if either
6671 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6672 | Standard for Binary Floating-Point Arithmetic.
6673 *----------------------------------------------------------------------------*/
6674
6675 int float128_unordered( float128 a, float128 b STATUS_PARAM )
6676 {
6677     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6678               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6679          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6680               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6681        ) {
6682         float_raise( float_flag_invalid STATUS_VAR);
6683         return 1;
6684     }
6685     return 0;
6686 }
6687
6688 /*----------------------------------------------------------------------------
6689 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6690 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6691 | exception.  The comparison is performed according to the IEC/IEEE Standard
6692 | for Binary Floating-Point Arithmetic.
6693 *----------------------------------------------------------------------------*/
6694
6695 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6696 {
6697
6698     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6699               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6700          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6701               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6702        ) {
6703         if (    float128_is_signaling_nan( a )
6704              || float128_is_signaling_nan( b ) ) {
6705             float_raise( float_flag_invalid STATUS_VAR);
6706         }
6707         return 0;
6708     }
6709     return
6710            ( a.low == b.low )
6711         && (    ( a.high == b.high )
6712              || (    ( a.low == 0 )
6713                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6714            );
6715
6716 }
6717
6718 /*----------------------------------------------------------------------------
6719 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6720 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6721 | cause an exception.  Otherwise, the comparison is performed according to the
6722 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6723 *----------------------------------------------------------------------------*/
6724
6725 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6726 {
6727     flag aSign, bSign;
6728
6729     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6730               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6731          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6732               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6733        ) {
6734         if (    float128_is_signaling_nan( a )
6735              || float128_is_signaling_nan( b ) ) {
6736             float_raise( float_flag_invalid STATUS_VAR);
6737         }
6738         return 0;
6739     }
6740     aSign = extractFloat128Sign( a );
6741     bSign = extractFloat128Sign( b );
6742     if ( aSign != bSign ) {
6743         return
6744                aSign
6745             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6746                  == 0 );
6747     }
6748     return
6749           aSign ? le128( b.high, b.low, a.high, a.low )
6750         : le128( a.high, a.low, b.high, b.low );
6751
6752 }
6753
6754 /*----------------------------------------------------------------------------
6755 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6756 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6757 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6758 | Standard for Binary Floating-Point Arithmetic.
6759 *----------------------------------------------------------------------------*/
6760
6761 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6762 {
6763     flag aSign, bSign;
6764
6765     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6766               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6767          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6768               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6769        ) {
6770         if (    float128_is_signaling_nan( a )
6771              || float128_is_signaling_nan( b ) ) {
6772             float_raise( float_flag_invalid STATUS_VAR);
6773         }
6774         return 0;
6775     }
6776     aSign = extractFloat128Sign( a );
6777     bSign = extractFloat128Sign( b );
6778     if ( aSign != bSign ) {
6779         return
6780                aSign
6781             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6782                  != 0 );
6783     }
6784     return
6785           aSign ? lt128( b.high, b.low, a.high, a.low )
6786         : lt128( a.high, a.low, b.high, b.low );
6787
6788 }
6789
6790 /*----------------------------------------------------------------------------
6791 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6792 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6793 | comparison is performed according to the IEC/IEEE Standard for Binary
6794 | Floating-Point Arithmetic.
6795 *----------------------------------------------------------------------------*/
6796
6797 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6798 {
6799     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6800               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6801          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6802               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6803        ) {
6804         if (    float128_is_signaling_nan( a )
6805              || float128_is_signaling_nan( b ) ) {
6806             float_raise( float_flag_invalid STATUS_VAR);
6807         }
6808         return 1;
6809     }
6810     return 0;
6811 }
6812
6813 /* misc functions */
6814 float32 uint32_to_float32(uint32_t a STATUS_PARAM)
6815 {
6816     return int64_to_float32(a STATUS_VAR);
6817 }
6818
6819 float64 uint32_to_float64(uint32_t a STATUS_PARAM)
6820 {
6821     return int64_to_float64(a STATUS_VAR);
6822 }
6823
6824 uint32 float32_to_uint32( float32 a STATUS_PARAM )
6825 {
6826     int64_t v;
6827     uint32 res;
6828     int old_exc_flags = get_float_exception_flags(status);
6829
6830     v = float32_to_int64(a STATUS_VAR);
6831     if (v < 0) {
6832         res = 0;
6833     } else if (v > 0xffffffff) {
6834         res = 0xffffffff;
6835     } else {
6836         return v;
6837     }
6838     set_float_exception_flags(old_exc_flags, status);
6839     float_raise(float_flag_invalid STATUS_VAR);
6840     return res;
6841 }
6842
6843 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6844 {
6845     int64_t v;
6846     uint32 res;
6847     int old_exc_flags = get_float_exception_flags(status);
6848
6849     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6850     if (v < 0) {
6851         res = 0;
6852     } else if (v > 0xffffffff) {
6853         res = 0xffffffff;
6854     } else {
6855         return v;
6856     }
6857     set_float_exception_flags(old_exc_flags, status);
6858     float_raise(float_flag_invalid STATUS_VAR);
6859     return res;
6860 }
6861
6862 int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6863 {
6864     int32_t v;
6865     int_fast16_t res;
6866     int old_exc_flags = get_float_exception_flags(status);
6867
6868     v = float32_to_int32(a STATUS_VAR);
6869     if (v < -0x8000) {
6870         res = -0x8000;
6871     } else if (v > 0x7fff) {
6872         res = 0x7fff;
6873     } else {
6874         return v;
6875     }
6876
6877     set_float_exception_flags(old_exc_flags, status);
6878     float_raise(float_flag_invalid STATUS_VAR);
6879     return res;
6880 }
6881
6882 uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6883 {
6884     int32_t v;
6885     uint_fast16_t res;
6886     int old_exc_flags = get_float_exception_flags(status);
6887
6888     v = float32_to_int32(a STATUS_VAR);
6889     if (v < 0) {
6890         res = 0;
6891     } else if (v > 0xffff) {
6892         res = 0xffff;
6893     } else {
6894         return v;
6895     }
6896
6897     set_float_exception_flags(old_exc_flags, status);
6898     float_raise(float_flag_invalid STATUS_VAR);
6899     return res;
6900 }
6901
6902 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6903 {
6904     int64_t v;
6905     uint_fast16_t res;
6906     int old_exc_flags = get_float_exception_flags(status);
6907
6908     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6909     if (v < 0) {
6910         res = 0;
6911     } else if (v > 0xffff) {
6912         res = 0xffff;
6913     } else {
6914         return v;
6915     }
6916     set_float_exception_flags(old_exc_flags, status);
6917     float_raise(float_flag_invalid STATUS_VAR);
6918     return res;
6919 }
6920
6921 uint32 float64_to_uint32( float64 a STATUS_PARAM )
6922 {
6923     uint64_t v;
6924     uint32 res;
6925     int old_exc_flags = get_float_exception_flags(status);
6926
6927     v = float64_to_uint64(a STATUS_VAR);
6928     if (v > 0xffffffff) {
6929         res = 0xffffffff;
6930     } else {
6931         return v;
6932     }
6933     set_float_exception_flags(old_exc_flags, status);
6934     float_raise(float_flag_invalid STATUS_VAR);
6935     return res;
6936 }
6937
6938 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6939 {
6940     uint64_t v;
6941     uint32 res;
6942     int old_exc_flags = get_float_exception_flags(status);
6943
6944     v = float64_to_uint64_round_to_zero(a STATUS_VAR);
6945     if (v > 0xffffffff) {
6946         res = 0xffffffff;
6947     } else {
6948         return v;
6949     }
6950     set_float_exception_flags(old_exc_flags, status);
6951     float_raise(float_flag_invalid STATUS_VAR);
6952     return res;
6953 }
6954
6955 int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
6956 {
6957     int64_t v;
6958     int_fast16_t res;
6959     int old_exc_flags = get_float_exception_flags(status);
6960
6961     v = float64_to_int32(a STATUS_VAR);
6962     if (v < -0x8000) {
6963         res = -0x8000;
6964     } else if (v > 0x7fff) {
6965         res = 0x7fff;
6966     } else {
6967         return v;
6968     }
6969
6970     set_float_exception_flags(old_exc_flags, status);
6971     float_raise(float_flag_invalid STATUS_VAR);
6972     return res;
6973 }
6974
6975 uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
6976 {
6977     int64_t v;
6978     uint_fast16_t res;
6979     int old_exc_flags = get_float_exception_flags(status);
6980
6981     v = float64_to_int32(a STATUS_VAR);
6982     if (v < 0) {
6983         res = 0;
6984     } else if (v > 0xffff) {
6985         res = 0xffff;
6986     } else {
6987         return v;
6988     }
6989
6990     set_float_exception_flags(old_exc_flags, status);
6991     float_raise(float_flag_invalid STATUS_VAR);
6992     return res;
6993 }
6994
6995 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6996 {
6997     int64_t v;
6998     uint_fast16_t res;
6999     int old_exc_flags = get_float_exception_flags(status);
7000
7001     v = float64_to_int64_round_to_zero(a STATUS_VAR);
7002     if (v < 0) {
7003         res = 0;
7004     } else if (v > 0xffff) {
7005         res = 0xffff;
7006     } else {
7007         return v;
7008     }
7009     set_float_exception_flags(old_exc_flags, status);
7010     float_raise(float_flag_invalid STATUS_VAR);
7011     return res;
7012 }
7013
7014 /*----------------------------------------------------------------------------
7015 | Returns the result of converting the double-precision floating-point value
7016 | `a' to the 64-bit unsigned integer format.  The conversion is
7017 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7018 | Arithmetic---which means in particular that the conversion is rounded
7019 | according to the current rounding mode.  If `a' is a NaN, the largest
7020 | positive integer is returned.  If the conversion overflows, the
7021 | largest unsigned integer is returned.  If 'a' is negative, the value is
7022 | rounded and zero is returned; negative values that do not round to zero
7023 | will raise the inexact exception.
7024 *----------------------------------------------------------------------------*/
7025
7026 uint64_t float64_to_uint64(float64 a STATUS_PARAM)
7027 {
7028     flag aSign;
7029     int_fast16_t aExp, shiftCount;
7030     uint64_t aSig, aSigExtra;
7031     a = float64_squash_input_denormal(a STATUS_VAR);
7032
7033     aSig = extractFloat64Frac(a);
7034     aExp = extractFloat64Exp(a);
7035     aSign = extractFloat64Sign(a);
7036     if (aSign && (aExp > 1022)) {
7037         float_raise(float_flag_invalid STATUS_VAR);
7038         if (float64_is_any_nan(a)) {
7039             return LIT64(0xFFFFFFFFFFFFFFFF);
7040         } else {
7041             return 0;
7042         }
7043     }
7044     if (aExp) {
7045         aSig |= LIT64(0x0010000000000000);
7046     }
7047     shiftCount = 0x433 - aExp;
7048     if (shiftCount <= 0) {
7049         if (0x43E < aExp) {
7050             float_raise(float_flag_invalid STATUS_VAR);
7051             return LIT64(0xFFFFFFFFFFFFFFFF);
7052         }
7053         aSigExtra = 0;
7054         aSig <<= -shiftCount;
7055     } else {
7056         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7057     }
7058     return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
7059 }
7060
7061 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
7062 {
7063     signed char current_rounding_mode = STATUS(float_rounding_mode);
7064     set_float_rounding_mode(float_round_to_zero STATUS_VAR);
7065     int64_t v = float64_to_uint64(a STATUS_VAR);
7066     set_float_rounding_mode(current_rounding_mode STATUS_VAR);
7067     return v;
7068 }
7069
7070 #define COMPARE(s, nan_exp)                                                  \
7071 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
7072                                       int is_quiet STATUS_PARAM )            \
7073 {                                                                            \
7074     flag aSign, bSign;                                                       \
7075     uint ## s ## _t av, bv;                                                  \
7076     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
7077     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
7078                                                                              \
7079     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7080          extractFloat ## s ## Frac( a ) ) ||                                 \
7081         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7082           extractFloat ## s ## Frac( b ) )) {                                \
7083         if (!is_quiet ||                                                     \
7084             float ## s ## _is_signaling_nan( a ) ||                          \
7085             float ## s ## _is_signaling_nan( b ) ) {                         \
7086             float_raise( float_flag_invalid STATUS_VAR);                     \
7087         }                                                                    \
7088         return float_relation_unordered;                                     \
7089     }                                                                        \
7090     aSign = extractFloat ## s ## Sign( a );                                  \
7091     bSign = extractFloat ## s ## Sign( b );                                  \
7092     av = float ## s ## _val(a);                                              \
7093     bv = float ## s ## _val(b);                                              \
7094     if ( aSign != bSign ) {                                                  \
7095         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7096             /* zero case */                                                  \
7097             return float_relation_equal;                                     \
7098         } else {                                                             \
7099             return 1 - (2 * aSign);                                          \
7100         }                                                                    \
7101     } else {                                                                 \
7102         if (av == bv) {                                                      \
7103             return float_relation_equal;                                     \
7104         } else {                                                             \
7105             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7106         }                                                                    \
7107     }                                                                        \
7108 }                                                                            \
7109                                                                              \
7110 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
7111 {                                                                            \
7112     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
7113 }                                                                            \
7114                                                                              \
7115 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
7116 {                                                                            \
7117     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
7118 }
7119
7120 COMPARE(32, 0xff)
7121 COMPARE(64, 0x7ff)
7122
7123 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
7124                                       int is_quiet STATUS_PARAM )
7125 {
7126     flag aSign, bSign;
7127
7128     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7129           ( extractFloatx80Frac( a )<<1 ) ) ||
7130         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7131           ( extractFloatx80Frac( b )<<1 ) )) {
7132         if (!is_quiet ||
7133             floatx80_is_signaling_nan( a ) ||
7134             floatx80_is_signaling_nan( b ) ) {
7135             float_raise( float_flag_invalid STATUS_VAR);
7136         }
7137         return float_relation_unordered;
7138     }
7139     aSign = extractFloatx80Sign( a );
7140     bSign = extractFloatx80Sign( b );
7141     if ( aSign != bSign ) {
7142
7143         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7144              ( ( a.low | b.low ) == 0 ) ) {
7145             /* zero case */
7146             return float_relation_equal;
7147         } else {
7148             return 1 - (2 * aSign);
7149         }
7150     } else {
7151         if (a.low == b.low && a.high == b.high) {
7152             return float_relation_equal;
7153         } else {
7154             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7155         }
7156     }
7157 }
7158
7159 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
7160 {
7161     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
7162 }
7163
7164 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
7165 {
7166     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
7167 }
7168
7169 INLINE int float128_compare_internal( float128 a, float128 b,
7170                                       int is_quiet STATUS_PARAM )
7171 {
7172     flag aSign, bSign;
7173
7174     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7175           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7176         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7177           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7178         if (!is_quiet ||
7179             float128_is_signaling_nan( a ) ||
7180             float128_is_signaling_nan( b ) ) {
7181             float_raise( float_flag_invalid STATUS_VAR);
7182         }
7183         return float_relation_unordered;
7184     }
7185     aSign = extractFloat128Sign( a );
7186     bSign = extractFloat128Sign( b );
7187     if ( aSign != bSign ) {
7188         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7189             /* zero case */
7190             return float_relation_equal;
7191         } else {
7192             return 1 - (2 * aSign);
7193         }
7194     } else {
7195         if (a.low == b.low && a.high == b.high) {
7196             return float_relation_equal;
7197         } else {
7198             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7199         }
7200     }
7201 }
7202
7203 int float128_compare( float128 a, float128 b STATUS_PARAM )
7204 {
7205     return float128_compare_internal(a, b, 0 STATUS_VAR);
7206 }
7207
7208 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
7209 {
7210     return float128_compare_internal(a, b, 1 STATUS_VAR);
7211 }
7212
7213 /* min() and max() functions. These can't be implemented as
7214  * 'compare and pick one input' because that would mishandle
7215  * NaNs and +0 vs -0.
7216  *
7217  * minnum() and maxnum() functions. These are similar to the min()
7218  * and max() functions but if one of the arguments is a QNaN and
7219  * the other is numerical then the numerical argument is returned.
7220  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7221  * and maxNum() operations. min() and max() are the typical min/max
7222  * semantics provided by many CPUs which predate that specification.
7223  */
7224 #define MINMAX(s)                                                       \
7225 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7226                                         int ismin, int isieee STATUS_PARAM) \
7227 {                                                                       \
7228     flag aSign, bSign;                                                  \
7229     uint ## s ## _t av, bv;                                             \
7230     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
7231     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
7232     if (float ## s ## _is_any_nan(a) ||                                 \
7233         float ## s ## _is_any_nan(b)) {                                 \
7234         if (isieee) {                                                   \
7235             if (float ## s ## _is_quiet_nan(a) &&                       \
7236                 !float ## s ##_is_any_nan(b)) {                         \
7237                 return b;                                               \
7238             } else if (float ## s ## _is_quiet_nan(b) &&                \
7239                        !float ## s ## _is_any_nan(a)) {                 \
7240                 return a;                                               \
7241             }                                                           \
7242         }                                                               \
7243         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
7244     }                                                                   \
7245     aSign = extractFloat ## s ## Sign(a);                               \
7246     bSign = extractFloat ## s ## Sign(b);                               \
7247     av = float ## s ## _val(a);                                         \
7248     bv = float ## s ## _val(b);                                         \
7249     if (aSign != bSign) {                                               \
7250         if (ismin) {                                                    \
7251             return aSign ? a : b;                                       \
7252         } else {                                                        \
7253             return aSign ? b : a;                                       \
7254         }                                                               \
7255     } else {                                                            \
7256         if (ismin) {                                                    \
7257             return (aSign ^ (av < bv)) ? a : b;                         \
7258         } else {                                                        \
7259             return (aSign ^ (av < bv)) ? b : a;                         \
7260         }                                                               \
7261     }                                                                   \
7262 }                                                                       \
7263                                                                         \
7264 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
7265 {                                                                       \
7266     return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR);                \
7267 }                                                                       \
7268                                                                         \
7269 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
7270 {                                                                       \
7271     return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR);                \
7272 }                                                                       \
7273                                                                         \
7274 float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7275 {                                                                       \
7276     return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR);                \
7277 }                                                                       \
7278                                                                         \
7279 float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7280 {                                                                       \
7281     return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR);                \
7282 }
7283
7284 MINMAX(32)
7285 MINMAX(64)
7286
7287
7288 /* Multiply A by 2 raised to the power N.  */
7289 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7290 {
7291     flag aSign;
7292     int16_t aExp;
7293     uint32_t aSig;
7294
7295     a = float32_squash_input_denormal(a STATUS_VAR);
7296     aSig = extractFloat32Frac( a );
7297     aExp = extractFloat32Exp( a );
7298     aSign = extractFloat32Sign( a );
7299
7300     if ( aExp == 0xFF ) {
7301         if ( aSig ) {
7302             return propagateFloat32NaN( a, a STATUS_VAR );
7303         }
7304         return a;
7305     }
7306     if (aExp != 0) {
7307         aSig |= 0x00800000;
7308     } else if (aSig == 0) {
7309         return a;
7310     } else {
7311         aExp++;
7312     }
7313
7314     if (n > 0x200) {
7315         n = 0x200;
7316     } else if (n < -0x200) {
7317         n = -0x200;
7318     }
7319
7320     aExp += n - 1;
7321     aSig <<= 7;
7322     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7323 }
7324
7325 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7326 {
7327     flag aSign;
7328     int16_t aExp;
7329     uint64_t aSig;
7330
7331     a = float64_squash_input_denormal(a STATUS_VAR);
7332     aSig = extractFloat64Frac( a );
7333     aExp = extractFloat64Exp( a );
7334     aSign = extractFloat64Sign( a );
7335
7336     if ( aExp == 0x7FF ) {
7337         if ( aSig ) {
7338             return propagateFloat64NaN( a, a STATUS_VAR );
7339         }
7340         return a;
7341     }
7342     if (aExp != 0) {
7343         aSig |= LIT64( 0x0010000000000000 );
7344     } else if (aSig == 0) {
7345         return a;
7346     } else {
7347         aExp++;
7348     }
7349
7350     if (n > 0x1000) {
7351         n = 0x1000;
7352     } else if (n < -0x1000) {
7353         n = -0x1000;
7354     }
7355
7356     aExp += n - 1;
7357     aSig <<= 10;
7358     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7359 }
7360
7361 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7362 {
7363     flag aSign;
7364     int32_t aExp;
7365     uint64_t aSig;
7366
7367     aSig = extractFloatx80Frac( a );
7368     aExp = extractFloatx80Exp( a );
7369     aSign = extractFloatx80Sign( a );
7370
7371     if ( aExp == 0x7FFF ) {
7372         if ( aSig<<1 ) {
7373             return propagateFloatx80NaN( a, a STATUS_VAR );
7374         }
7375         return a;
7376     }
7377
7378     if (aExp == 0) {
7379         if (aSig == 0) {
7380             return a;
7381         }
7382         aExp++;
7383     }
7384
7385     if (n > 0x10000) {
7386         n = 0x10000;
7387     } else if (n < -0x10000) {
7388         n = -0x10000;
7389     }
7390
7391     aExp += n;
7392     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7393                                           aSign, aExp, aSig, 0 STATUS_VAR );
7394 }
7395
7396 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7397 {
7398     flag aSign;
7399     int32_t aExp;
7400     uint64_t aSig0, aSig1;
7401
7402     aSig1 = extractFloat128Frac1( a );
7403     aSig0 = extractFloat128Frac0( a );
7404     aExp = extractFloat128Exp( a );
7405     aSign = extractFloat128Sign( a );
7406     if ( aExp == 0x7FFF ) {
7407         if ( aSig0 | aSig1 ) {
7408             return propagateFloat128NaN( a, a STATUS_VAR );
7409         }
7410         return a;
7411     }
7412     if (aExp != 0) {
7413         aSig0 |= LIT64( 0x0001000000000000 );
7414     } else if (aSig0 == 0 && aSig1 == 0) {
7415         return a;
7416     } else {
7417         aExp++;
7418     }
7419
7420     if (n > 0x10000) {
7421         n = 0x10000;
7422     } else if (n < -0x10000) {
7423         n = -0x10000;
7424     }
7425
7426     aExp += n - 1;
7427     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7428                                           STATUS_VAR );
7429
7430 }