fpu/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 #include "softfloat.h"
  34
  35 /*----------------------------------------------------------------------------
  36 | Primitive arithmetic functions, including multi-word arithmetic, and
  37 | division and square root approximations.  (Can be specialized to target if
  38 | desired.)
  39 *----------------------------------------------------------------------------*/
  40 #include "softfloat-macros.h"
  41
  42 /*----------------------------------------------------------------------------
  43 | Functions and definitions to determine:  (1) whether tininess for underflow
  44 | is detected before or after rounding by default, (2) what (if anything)
  45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  47 | are propagated from function inputs to output.  These details are target-
  48 | specific.
  49 *----------------------------------------------------------------------------*/
  50 #include "softfloat-specialize.h"
  51
  52 void set_float_rounding_mode(int val STATUS_PARAM)
  53 {
  54     STATUS(float_rounding_mode) = val;
  55 }
  56
  57 void set_float_exception_flags(int val STATUS_PARAM)
  58 {
  59     STATUS(float_exception_flags) = val;
  60 }
  61
  62 #ifdef FLOATX80
  63 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  64 {
  65     STATUS(floatx80_rounding_precision) = val;
  66 }
  67 #endif
  68
  69 /*----------------------------------------------------------------------------
  70 | Returns the fraction bits of the half-precision floating-point value `a'.
  71 *----------------------------------------------------------------------------*/
  72
  73 INLINE uint32_t extractFloat16Frac(float16 a)
  74 {
  75     return float16_val(a) & 0x3ff;
  76 }
  77
  78 /*----------------------------------------------------------------------------
  79 | Returns the exponent bits of the half-precision floating-point value `a'.
  80 *----------------------------------------------------------------------------*/
  81
  82 INLINE int16 extractFloat16Exp(float16 a)
  83 {
  84     return (float16_val(a) >> 10) & 0x1f;
  85 }
  86
  87 /*----------------------------------------------------------------------------
  88 | Returns the sign bit of the single-precision floating-point value `a'.
  89 *----------------------------------------------------------------------------*/
  90
  91 INLINE flag extractFloat16Sign(float16 a)
  92 {
  93     return float16_val(a)>>15;
  94 }
  95
  96 /*----------------------------------------------------------------------------
  97 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  98 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  99 | input.  If `zSign' is 1, the input is negated before being converted to an
 100 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 101 | is simply rounded to an integer, with the inexact exception raised if the
 102 | input cannot be represented exactly as an integer.  However, if the fixed-
 103 | point input is too large, the invalid exception is raised and the largest
 104 | positive or negative integer is returned.
 105 *----------------------------------------------------------------------------*/
 106
 107 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
 108 {
 109     int8 roundingMode;
 110     flag roundNearestEven;
 111     int8 roundIncrement, roundBits;
 112     int32 z;
 113
 114     roundingMode = STATUS(float_rounding_mode);
 115     roundNearestEven = ( roundingMode == float_round_nearest_even );
 116     roundIncrement = 0x40;
 117     if ( ! roundNearestEven ) {
 118         if ( roundingMode == float_round_to_zero ) {
 119             roundIncrement = 0;
 120         }
 121         else {
 122             roundIncrement = 0x7F;
 123             if ( zSign ) {
 124                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 125             }
 126             else {
 127                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 128             }
 129         }
 130     }
 131     roundBits = absZ & 0x7F;
 132     absZ = ( absZ + roundIncrement )>>7;
 133     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 134     z = absZ;
 135     if ( zSign ) z = - z;
 136     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 137         float_raise( float_flag_invalid STATUS_VAR);
 138         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 139     }
 140     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 141     return z;
 142
 143 }
 144
 145 /*----------------------------------------------------------------------------
 146 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 147 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 148 | and returns the properly rounded 64-bit integer corresponding to the input.
 149 | If `zSign' is 1, the input is negated before being converted to an integer.
 150 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 151 | the inexact exception raised if the input cannot be represented exactly as
 152 | an integer.  However, if the fixed-point input is too large, the invalid
 153 | exception is raised and the largest positive or negative integer is
 154 | returned.
 155 *----------------------------------------------------------------------------*/
 156
 157 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 158 {
 159     int8 roundingMode;
 160     flag roundNearestEven, increment;
 161     int64 z;
 162
 163     roundingMode = STATUS(float_rounding_mode);
 164     roundNearestEven = ( roundingMode == float_round_nearest_even );
 165     increment = ( (sbits64) absZ1 < 0 );
 166     if ( ! roundNearestEven ) {
 167         if ( roundingMode == float_round_to_zero ) {
 168             increment = 0;
 169         }
 170         else {
 171             if ( zSign ) {
 172                 increment = ( roundingMode == float_round_down ) && absZ1;
 173             }
 174             else {
 175                 increment = ( roundingMode == float_round_up ) && absZ1;
 176             }
 177         }
 178     }
 179     if ( increment ) {
 180         ++absZ0;
 181         if ( absZ0 == 0 ) goto overflow;
 182         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 183     }
 184     z = absZ0;
 185     if ( zSign ) z = - z;
 186     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 187  overflow:
 188         float_raise( float_flag_invalid STATUS_VAR);
 189         return
 190               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 191             : LIT64( 0x7FFFFFFFFFFFFFFF );
 192     }
 193     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 194     return z;
 195
 196 }
 197
 198 /*----------------------------------------------------------------------------
 199 | Returns the fraction bits of the single-precision floating-point value `a'.
 200 *----------------------------------------------------------------------------*/
 201
 202 INLINE bits32 extractFloat32Frac( float32 a )
 203 {
 204
 205     return float32_val(a) & 0x007FFFFF;
 206
 207 }
 208
 209 /*----------------------------------------------------------------------------
 210 | Returns the exponent bits of the single-precision floating-point value `a'.
 211 *----------------------------------------------------------------------------*/
 212
 213 INLINE int16 extractFloat32Exp( float32 a )
 214 {
 215
 216     return ( float32_val(a)>>23 ) & 0xFF;
 217
 218 }
 219
 220 /*----------------------------------------------------------------------------
 221 | Returns the sign bit of the single-precision floating-point value `a'.
 222 *----------------------------------------------------------------------------*/
 223
 224 INLINE flag extractFloat32Sign( float32 a )
 225 {
 226
 227     return float32_val(a)>>31;
 228
 229 }
 230
 231 /*----------------------------------------------------------------------------
 232 | If `a' is denormal and we are in flush-to-zero mode then set the
 233 | input-denormal exception and return zero. Otherwise just return the value.
 234 *----------------------------------------------------------------------------*/
 235 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 236 {
 237     if (STATUS(flush_inputs_to_zero)) {
 238         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 239             float_raise(float_flag_input_denormal STATUS_VAR);
 240             return make_float32(float32_val(a) & 0x80000000);
 241         }
 242     }
 243     return a;
 244 }
 245
 246 /*----------------------------------------------------------------------------
 247 | Normalizes the subnormal single-precision floating-point value represented
 248 | by the denormalized significand `aSig'.  The normalized exponent and
 249 | significand are stored at the locations pointed to by `zExpPtr' and
 250 | `zSigPtr', respectively.
 251 *----------------------------------------------------------------------------*/
 252
 253 static void
 254  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 255 {
 256     int8 shiftCount;
 257
 258     shiftCount = countLeadingZeros32( aSig ) - 8;
 259     *zSigPtr = aSig<<shiftCount;
 260     *zExpPtr = 1 - shiftCount;
 261
 262 }
 263
 264 /*----------------------------------------------------------------------------
 265 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 266 | single-precision floating-point value, returning the result.  After being
 267 | shifted into the proper positions, the three fields are simply added
 268 | together to form the result.  This means that any integer portion of `zSig'
 269 | will be added into the exponent.  Since a properly normalized significand
 270 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 271 | than the desired result exponent whenever `zSig' is a complete, normalized
 272 | significand.
 273 *----------------------------------------------------------------------------*/
 274
 275 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 276 {
 277
 278     return make_float32(
 279           ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
 280
 281 }
 282
 283 /*----------------------------------------------------------------------------
 284 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 285 | and significand `zSig', and returns the proper single-precision floating-
 286 | point value corresponding to the abstract input.  Ordinarily, the abstract
 287 | value is simply rounded and packed into the single-precision format, with
 288 | the inexact exception raised if the abstract input cannot be represented
 289 | exactly.  However, if the abstract value is too large, the overflow and
 290 | inexact exceptions are raised and an infinity or maximal finite value is
 291 | returned.  If the abstract value is too small, the input value is rounded to
 292 | a subnormal number, and the underflow and inexact exceptions are raised if
 293 | the abstract input cannot be represented exactly as a subnormal single-
 294 | precision floating-point number.
 295 |     The input significand `zSig' has its binary point between bits 30
 296 | and 29, which is 7 bits to the left of the usual location.  This shifted
 297 | significand must be normalized or smaller.  If `zSig' is not normalized,
 298 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 299 | and it must not require rounding.  In the usual case that `zSig' is
 300 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 301 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 302 | Binary Floating-Point Arithmetic.
 303 *----------------------------------------------------------------------------*/
 304
 305 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 306 {
 307     int8 roundingMode;
 308     flag roundNearestEven;
 309     int8 roundIncrement, roundBits;
 310     flag isTiny;
 311
 312     roundingMode = STATUS(float_rounding_mode);
 313     roundNearestEven = ( roundingMode == float_round_nearest_even );
 314     roundIncrement = 0x40;
 315     if ( ! roundNearestEven ) {
 316         if ( roundingMode == float_round_to_zero ) {
 317             roundIncrement = 0;
 318         }
 319         else {
 320             roundIncrement = 0x7F;
 321             if ( zSign ) {
 322                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 323             }
 324             else {
 325                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 326             }
 327         }
 328     }
 329     roundBits = zSig & 0x7F;
 330     if ( 0xFD <= (bits16) zExp ) {
 331         if (    ( 0xFD < zExp )
 332              || (    ( zExp == 0xFD )
 333                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 334            ) {
 335             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 336             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 337         }
 338         if ( zExp < 0 ) {
 339             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 340             isTiny =
 341                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 342                 || ( zExp < -1 )
 343                 || ( zSig + roundIncrement < 0x80000000 );
 344             shift32RightJamming( zSig, - zExp, &zSig );
 345             zExp = 0;
 346             roundBits = zSig & 0x7F;
 347             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 348         }
 349     }
 350     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 351     zSig = ( zSig + roundIncrement )>>7;
 352     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 353     if ( zSig == 0 ) zExp = 0;
 354     return packFloat32( zSign, zExp, zSig );
 355
 356 }
 357
 358 /*----------------------------------------------------------------------------
 359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 360 | and significand `zSig', and returns the proper single-precision floating-
 361 | point value corresponding to the abstract input.  This routine is just like
 362 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 363 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 364 | floating-point exponent.
 365 *----------------------------------------------------------------------------*/
 366
 367 static float32
 368  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 369 {
 370     int8 shiftCount;
 371
 372     shiftCount = countLeadingZeros32( zSig ) - 1;
 373     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 374
 375 }
 376
 377 /*----------------------------------------------------------------------------
 378 | Returns the fraction bits of the double-precision floating-point value `a'.
 379 *----------------------------------------------------------------------------*/
 380
 381 INLINE bits64 extractFloat64Frac( float64 a )
 382 {
 383
 384     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 385
 386 }
 387
 388 /*----------------------------------------------------------------------------
 389 | Returns the exponent bits of the double-precision floating-point value `a'.
 390 *----------------------------------------------------------------------------*/
 391
 392 INLINE int16 extractFloat64Exp( float64 a )
 393 {
 394
 395     return ( float64_val(a)>>52 ) & 0x7FF;
 396
 397 }
 398
 399 /*----------------------------------------------------------------------------
 400 | Returns the sign bit of the double-precision floating-point value `a'.
 401 *----------------------------------------------------------------------------*/
 402
 403 INLINE flag extractFloat64Sign( float64 a )
 404 {
 405
 406     return float64_val(a)>>63;
 407
 408 }
 409
 410 /*----------------------------------------------------------------------------
 411 | If `a' is denormal and we are in flush-to-zero mode then set the
 412 | input-denormal exception and return zero. Otherwise just return the value.
 413 *----------------------------------------------------------------------------*/
 414 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 415 {
 416     if (STATUS(flush_inputs_to_zero)) {
 417         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 418             float_raise(float_flag_input_denormal STATUS_VAR);
 419             return make_float64(float64_val(a) & (1ULL << 63));
 420         }
 421     }
 422     return a;
 423 }
 424
 425 /*----------------------------------------------------------------------------
 426 | Normalizes the subnormal double-precision floating-point value represented
 427 | by the denormalized significand `aSig'.  The normalized exponent and
 428 | significand are stored at the locations pointed to by `zExpPtr' and
 429 | `zSigPtr', respectively.
 430 *----------------------------------------------------------------------------*/
 431
 432 static void
 433  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 434 {
 435     int8 shiftCount;
 436
 437     shiftCount = countLeadingZeros64( aSig ) - 11;
 438     *zSigPtr = aSig<<shiftCount;
 439     *zExpPtr = 1 - shiftCount;
 440
 441 }
 442
 443 /*----------------------------------------------------------------------------
 444 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 445 | double-precision floating-point value, returning the result.  After being
 446 | shifted into the proper positions, the three fields are simply added
 447 | together to form the result.  This means that any integer portion of `zSig'
 448 | will be added into the exponent.  Since a properly normalized significand
 449 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 450 | than the desired result exponent whenever `zSig' is a complete, normalized
 451 | significand.
 452 *----------------------------------------------------------------------------*/
 453
 454 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 455 {
 456
 457     return make_float64(
 458         ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
 459
 460 }
 461
 462 /*----------------------------------------------------------------------------
 463 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 464 | and significand `zSig', and returns the proper double-precision floating-
 465 | point value corresponding to the abstract input.  Ordinarily, the abstract
 466 | value is simply rounded and packed into the double-precision format, with
 467 | the inexact exception raised if the abstract input cannot be represented
 468 | exactly.  However, if the abstract value is too large, the overflow and
 469 | inexact exceptions are raised and an infinity or maximal finite value is
 470 | returned.  If the abstract value is too small, the input value is rounded
 471 | to a subnormal number, and the underflow and inexact exceptions are raised
 472 | if the abstract input cannot be represented exactly as a subnormal double-
 473 | precision floating-point number.
 474 |     The input significand `zSig' has its binary point between bits 62
 475 | and 61, which is 10 bits to the left of the usual location.  This shifted
 476 | significand must be normalized or smaller.  If `zSig' is not normalized,
 477 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 478 | and it must not require rounding.  In the usual case that `zSig' is
 479 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 480 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 481 | Binary Floating-Point Arithmetic.
 482 *----------------------------------------------------------------------------*/
 483
 484 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 485 {
 486     int8 roundingMode;
 487     flag roundNearestEven;
 488     int16 roundIncrement, roundBits;
 489     flag isTiny;
 490
 491     roundingMode = STATUS(float_rounding_mode);
 492     roundNearestEven = ( roundingMode == float_round_nearest_even );
 493     roundIncrement = 0x200;
 494     if ( ! roundNearestEven ) {
 495         if ( roundingMode == float_round_to_zero ) {
 496             roundIncrement = 0;
 497         }
 498         else {
 499             roundIncrement = 0x3FF;
 500             if ( zSign ) {
 501                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 502             }
 503             else {
 504                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 505             }
 506         }
 507     }
 508     roundBits = zSig & 0x3FF;
 509     if ( 0x7FD <= (bits16) zExp ) {
 510         if (    ( 0x7FD < zExp )
 511              || (    ( zExp == 0x7FD )
 512                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 513            ) {
 514             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 515             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 516         }
 517         if ( zExp < 0 ) {
 518             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 519             isTiny =
 520                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 521                 || ( zExp < -1 )
 522                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 523             shift64RightJamming( zSig, - zExp, &zSig );
 524             zExp = 0;
 525             roundBits = zSig & 0x3FF;
 526             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 527         }
 528     }
 529     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 530     zSig = ( zSig + roundIncrement )>>10;
 531     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 532     if ( zSig == 0 ) zExp = 0;
 533     return packFloat64( zSign, zExp, zSig );
 534
 535 }
 536
 537 /*----------------------------------------------------------------------------
 538 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 539 | and significand `zSig', and returns the proper double-precision floating-
 540 | point value corresponding to the abstract input.  This routine is just like
 541 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 542 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 543 | floating-point exponent.
 544 *----------------------------------------------------------------------------*/
 545
 546 static float64
 547  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 548 {
 549     int8 shiftCount;
 550
 551     shiftCount = countLeadingZeros64( zSig ) - 1;
 552     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 553
 554 }
 555
 556 #ifdef FLOATX80
 557
 558 /*----------------------------------------------------------------------------
 559 | Returns the fraction bits of the extended double-precision floating-point
 560 | value `a'.
 561 *----------------------------------------------------------------------------*/
 562
 563 INLINE bits64 extractFloatx80Frac( floatx80 a )
 564 {
 565
 566     return a.low;
 567
 568 }
 569
 570 /*----------------------------------------------------------------------------
 571 | Returns the exponent bits of the extended double-precision floating-point
 572 | value `a'.
 573 *----------------------------------------------------------------------------*/
 574
 575 INLINE int32 extractFloatx80Exp( floatx80 a )
 576 {
 577
 578     return a.high & 0x7FFF;
 579
 580 }
 581
 582 /*----------------------------------------------------------------------------
 583 | Returns the sign bit of the extended double-precision floating-point value
 584 | `a'.
 585 *----------------------------------------------------------------------------*/
 586
 587 INLINE flag extractFloatx80Sign( floatx80 a )
 588 {
 589
 590     return a.high>>15;
 591
 592 }
 593
 594 /*----------------------------------------------------------------------------
 595 | Normalizes the subnormal extended double-precision floating-point value
 596 | represented by the denormalized significand `aSig'.  The normalized exponent
 597 | and significand are stored at the locations pointed to by `zExpPtr' and
 598 | `zSigPtr', respectively.
 599 *----------------------------------------------------------------------------*/
 600
 601 static void
 602  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 603 {
 604     int8 shiftCount;
 605
 606     shiftCount = countLeadingZeros64( aSig );
 607     *zSigPtr = aSig<<shiftCount;
 608     *zExpPtr = 1 - shiftCount;
 609
 610 }
 611
 612 /*----------------------------------------------------------------------------
 613 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 614 | extended double-precision floating-point value, returning the result.
 615 *----------------------------------------------------------------------------*/
 616
 617 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 618 {
 619     floatx80 z;
 620
 621     z.low = zSig;
 622     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 623     return z;
 624
 625 }
 626
 627 /*----------------------------------------------------------------------------
 628 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 629 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 630 | and returns the proper extended double-precision floating-point value
 631 | corresponding to the abstract input.  Ordinarily, the abstract value is
 632 | rounded and packed into the extended double-precision format, with the
 633 | inexact exception raised if the abstract input cannot be represented
 634 | exactly.  However, if the abstract value is too large, the overflow and
 635 | inexact exceptions are raised and an infinity or maximal finite value is
 636 | returned.  If the abstract value is too small, the input value is rounded to
 637 | a subnormal number, and the underflow and inexact exceptions are raised if
 638 | the abstract input cannot be represented exactly as a subnormal extended
 639 | double-precision floating-point number.
 640 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 641 | number of bits as single or double precision, respectively.  Otherwise, the
 642 | result is rounded to the full precision of the extended double-precision
 643 | format.
 644 |     The input significand must be normalized or smaller.  If the input
 645 | significand is not normalized, `zExp' must be 0; in that case, the result
 646 | returned is a subnormal number, and it must not require rounding.  The
 647 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 648 | Floating-Point Arithmetic.
 649 *----------------------------------------------------------------------------*/
 650
 651 static floatx80
 652  roundAndPackFloatx80(
 653      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 654  STATUS_PARAM)
 655 {
 656     int8 roundingMode;
 657     flag roundNearestEven, increment, isTiny;
 658     int64 roundIncrement, roundMask, roundBits;
 659
 660     roundingMode = STATUS(float_rounding_mode);
 661     roundNearestEven = ( roundingMode == float_round_nearest_even );
 662     if ( roundingPrecision == 80 ) goto precision80;
 663     if ( roundingPrecision == 64 ) {
 664         roundIncrement = LIT64( 0x0000000000000400 );
 665         roundMask = LIT64( 0x00000000000007FF );
 666     }
 667     else if ( roundingPrecision == 32 ) {
 668         roundIncrement = LIT64( 0x0000008000000000 );
 669         roundMask = LIT64( 0x000000FFFFFFFFFF );
 670     }
 671     else {
 672         goto precision80;
 673     }
 674     zSig0 |= ( zSig1 != 0 );
 675     if ( ! roundNearestEven ) {
 676         if ( roundingMode == float_round_to_zero ) {
 677             roundIncrement = 0;
 678         }
 679         else {
 680             roundIncrement = roundMask;
 681             if ( zSign ) {
 682                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 683             }
 684             else {
 685                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 686             }
 687         }
 688     }
 689     roundBits = zSig0 & roundMask;
 690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 691         if (    ( 0x7FFE < zExp )
 692              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 693            ) {
 694             goto overflow;
 695         }
 696         if ( zExp <= 0 ) {
 697             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
 698             isTiny =
 699                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 700                 || ( zExp < 0 )
 701                 || ( zSig0 <= zSig0 + roundIncrement );
 702             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 703             zExp = 0;
 704             roundBits = zSig0 & roundMask;
 705             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 706             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 707             zSig0 += roundIncrement;
 708             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 709             roundIncrement = roundMask + 1;
 710             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 711                 roundMask |= roundIncrement;
 712             }
 713             zSig0 &= ~ roundMask;
 714             return packFloatx80( zSign, zExp, zSig0 );
 715         }
 716     }
 717     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 718     zSig0 += roundIncrement;
 719     if ( zSig0 < roundIncrement ) {
 720         ++zExp;
 721         zSig0 = LIT64( 0x8000000000000000 );
 722     }
 723     roundIncrement = roundMask + 1;
 724     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 725         roundMask |= roundIncrement;
 726     }
 727     zSig0 &= ~ roundMask;
 728     if ( zSig0 == 0 ) zExp = 0;
 729     return packFloatx80( zSign, zExp, zSig0 );
 730  precision80:
 731     increment = ( (sbits64) zSig1 < 0 );
 732     if ( ! roundNearestEven ) {
 733         if ( roundingMode == float_round_to_zero ) {
 734             increment = 0;
 735         }
 736         else {
 737             if ( zSign ) {
 738                 increment = ( roundingMode == float_round_down ) && zSig1;
 739             }
 740             else {
 741                 increment = ( roundingMode == float_round_up ) && zSig1;
 742             }
 743         }
 744     }
 745     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 746         if (    ( 0x7FFE < zExp )
 747              || (    ( zExp == 0x7FFE )
 748                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 749                   && increment
 750                 )
 751            ) {
 752             roundMask = 0;
 753  overflow:
 754             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 755             if (    ( roundingMode == float_round_to_zero )
 756                  || ( zSign && ( roundingMode == float_round_up ) )
 757                  || ( ! zSign && ( roundingMode == float_round_down ) )
 758                ) {
 759                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 760             }
 761             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 762         }
 763         if ( zExp <= 0 ) {
 764             isTiny =
 765                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 766                 || ( zExp < 0 )
 767                 || ! increment
 768                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 769             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 770             zExp = 0;
 771             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 772             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 773             if ( roundNearestEven ) {
 774                 increment = ( (sbits64) zSig1 < 0 );
 775             }
 776             else {
 777                 if ( zSign ) {
 778                     increment = ( roundingMode == float_round_down ) && zSig1;
 779                 }
 780                 else {
 781                     increment = ( roundingMode == float_round_up ) && zSig1;
 782                 }
 783             }
 784             if ( increment ) {
 785                 ++zSig0;
 786                 zSig0 &=
 787                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 788                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 789             }
 790             return packFloatx80( zSign, zExp, zSig0 );
 791         }
 792     }
 793     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 794     if ( increment ) {
 795         ++zSig0;
 796         if ( zSig0 == 0 ) {
 797             ++zExp;
 798             zSig0 = LIT64( 0x8000000000000000 );
 799         }
 800         else {
 801             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 802         }
 803     }
 804     else {
 805         if ( zSig0 == 0 ) zExp = 0;
 806     }
 807     return packFloatx80( zSign, zExp, zSig0 );
 808
 809 }
 810
 811 /*----------------------------------------------------------------------------
 812 | Takes an abstract floating-point value having sign `zSign', exponent
 813 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 814 | and returns the proper extended double-precision floating-point value
 815 | corresponding to the abstract input.  This routine is just like
 816 | `roundAndPackFloatx80' except that the input significand does not have to be
 817 | normalized.
 818 *----------------------------------------------------------------------------*/
 819
 820 static floatx80
 821  normalizeRoundAndPackFloatx80(
 822      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 823  STATUS_PARAM)
 824 {
 825     int8 shiftCount;
 826
 827     if ( zSig0 == 0 ) {
 828         zSig0 = zSig1;
 829         zSig1 = 0;
 830         zExp -= 64;
 831     }
 832     shiftCount = countLeadingZeros64( zSig0 );
 833     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 834     zExp -= shiftCount;
 835     return
 836         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 837
 838 }
 839
 840 #endif
 841
 842 #ifdef FLOAT128
 843
 844 /*----------------------------------------------------------------------------
 845 | Returns the least-significant 64 fraction bits of the quadruple-precision
 846 | floating-point value `a'.
 847 *----------------------------------------------------------------------------*/
 848
 849 INLINE bits64 extractFloat128Frac1( float128 a )
 850 {
 851
 852     return a.low;
 853
 854 }
 855
 856 /*----------------------------------------------------------------------------
 857 | Returns the most-significant 48 fraction bits of the quadruple-precision
 858 | floating-point value `a'.
 859 *----------------------------------------------------------------------------*/
 860
 861 INLINE bits64 extractFloat128Frac0( float128 a )
 862 {
 863
 864     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 865
 866 }
 867
 868 /*----------------------------------------------------------------------------
 869 | Returns the exponent bits of the quadruple-precision floating-point value
 870 | `a'.
 871 *----------------------------------------------------------------------------*/
 872
 873 INLINE int32 extractFloat128Exp( float128 a )
 874 {
 875
 876     return ( a.high>>48 ) & 0x7FFF;
 877
 878 }
 879
 880 /*----------------------------------------------------------------------------
 881 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 882 *----------------------------------------------------------------------------*/
 883
 884 INLINE flag extractFloat128Sign( float128 a )
 885 {
 886
 887     return a.high>>63;
 888
 889 }
 890
 891 /*----------------------------------------------------------------------------
 892 | Normalizes the subnormal quadruple-precision floating-point value
 893 | represented by the denormalized significand formed by the concatenation of
 894 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 895 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 896 | significand are stored at the location pointed to by `zSig0Ptr', and the
 897 | least significant 64 bits of the normalized significand are stored at the
 898 | location pointed to by `zSig1Ptr'.
 899 *----------------------------------------------------------------------------*/
 900
 901 static void
 902  normalizeFloat128Subnormal(
 903      bits64 aSig0,
 904      bits64 aSig1,
 905      int32 *zExpPtr,
 906      bits64 *zSig0Ptr,
 907      bits64 *zSig1Ptr
 908  )
 909 {
 910     int8 shiftCount;
 911
 912     if ( aSig0 == 0 ) {
 913         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 914         if ( shiftCount < 0 ) {
 915             *zSig0Ptr = aSig1>>( - shiftCount );
 916             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 917         }
 918         else {
 919             *zSig0Ptr = aSig1<<shiftCount;
 920             *zSig1Ptr = 0;
 921         }
 922         *zExpPtr = - shiftCount - 63;
 923     }
 924     else {
 925         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 926         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 927         *zExpPtr = 1 - shiftCount;
 928     }
 929
 930 }
 931
 932 /*----------------------------------------------------------------------------
 933 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 934 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 935 | floating-point value, returning the result.  After being shifted into the
 936 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 937 | added together to form the most significant 32 bits of the result.  This
 938 | means that any integer portion of `zSig0' will be added into the exponent.
 939 | Since a properly normalized significand will have an integer portion equal
 940 | to 1, the `zExp' input should be 1 less than the desired result exponent
 941 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 942 | significand.
 943 *----------------------------------------------------------------------------*/
 944
 945 INLINE float128
 946  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 947 {
 948     float128 z;
 949
 950     z.low = zSig1;
 951     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 952     return z;
 953
 954 }
 955
 956 /*----------------------------------------------------------------------------
 957 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 958 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 959 | and `zSig2', and returns the proper quadruple-precision floating-point value
 960 | corresponding to the abstract input.  Ordinarily, the abstract value is
 961 | simply rounded and packed into the quadruple-precision format, with the
 962 | inexact exception raised if the abstract input cannot be represented
 963 | exactly.  However, if the abstract value is too large, the overflow and
 964 | inexact exceptions are raised and an infinity or maximal finite value is
 965 | returned.  If the abstract value is too small, the input value is rounded to
 966 | a subnormal number, and the underflow and inexact exceptions are raised if
 967 | the abstract input cannot be represented exactly as a subnormal quadruple-
 968 | precision floating-point number.
 969 |     The input significand must be normalized or smaller.  If the input
 970 | significand is not normalized, `zExp' must be 0; in that case, the result
 971 | returned is a subnormal number, and it must not require rounding.  In the
 972 | usual case that the input significand is normalized, `zExp' must be 1 less
 973 | than the ``true'' floating-point exponent.  The handling of underflow and
 974 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 975 *----------------------------------------------------------------------------*/
 976
 977 static float128
 978  roundAndPackFloat128(
 979      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 980 {
 981     int8 roundingMode;
 982     flag roundNearestEven, increment, isTiny;
 983
 984     roundingMode = STATUS(float_rounding_mode);
 985     roundNearestEven = ( roundingMode == float_round_nearest_even );
 986     increment = ( (sbits64) zSig2 < 0 );
 987     if ( ! roundNearestEven ) {
 988         if ( roundingMode == float_round_to_zero ) {
 989             increment = 0;
 990         }
 991         else {
 992             if ( zSign ) {
 993                 increment = ( roundingMode == float_round_down ) && zSig2;
 994             }
 995             else {
 996                 increment = ( roundingMode == float_round_up ) && zSig2;
 997             }
 998         }
 999     }
1000     if ( 0x7FFD <= (bits32) zExp ) {
1001         if (    ( 0x7FFD < zExp )
1002              || (    ( zExp == 0x7FFD )
1003                   && eq128(
1004                          LIT64( 0x0001FFFFFFFFFFFF ),
1005                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1006                          zSig0,
1007                          zSig1
1008                      )
1009                   && increment
1010                 )
1011            ) {
1012             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1013             if (    ( roundingMode == float_round_to_zero )
1014                  || ( zSign && ( roundingMode == float_round_up ) )
1015                  || ( ! zSign && ( roundingMode == float_round_down ) )
1016                ) {
1017                 return
1018                     packFloat128(
1019                         zSign,
1020                         0x7FFE,
1021                         LIT64( 0x0000FFFFFFFFFFFF ),
1022                         LIT64( 0xFFFFFFFFFFFFFFFF )
1023                     );
1024             }
1025             return packFloat128( zSign, 0x7FFF, 0, 0 );
1026         }
1027         if ( zExp < 0 ) {
1028             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1029             isTiny =
1030                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1031                 || ( zExp < -1 )
1032                 || ! increment
1033                 || lt128(
1034                        zSig0,
1035                        zSig1,
1036                        LIT64( 0x0001FFFFFFFFFFFF ),
1037                        LIT64( 0xFFFFFFFFFFFFFFFF )
1038                    );
1039             shift128ExtraRightJamming(
1040                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1041             zExp = 0;
1042             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1043             if ( roundNearestEven ) {
1044                 increment = ( (sbits64) zSig2 < 0 );
1045             }
1046             else {
1047                 if ( zSign ) {
1048                     increment = ( roundingMode == float_round_down ) && zSig2;
1049                 }
1050                 else {
1051                     increment = ( roundingMode == float_round_up ) && zSig2;
1052                 }
1053             }
1054         }
1055     }
1056     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1057     if ( increment ) {
1058         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1059         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1060     }
1061     else {
1062         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1063     }
1064     return packFloat128( zSign, zExp, zSig0, zSig1 );
1065
1066 }
1067
1068 /*----------------------------------------------------------------------------
1069 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1070 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1071 | returns the proper quadruple-precision floating-point value corresponding
1072 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1073 | except that the input significand has fewer bits and does not have to be
1074 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1075 | point exponent.
1076 *----------------------------------------------------------------------------*/
1077
1078 static float128
1079  normalizeRoundAndPackFloat128(
1080      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1081 {
1082     int8 shiftCount;
1083     bits64 zSig2;
1084
1085     if ( zSig0 == 0 ) {
1086         zSig0 = zSig1;
1087         zSig1 = 0;
1088         zExp -= 64;
1089     }
1090     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1091     if ( 0 <= shiftCount ) {
1092         zSig2 = 0;
1093         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1094     }
1095     else {
1096         shift128ExtraRightJamming(
1097             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1098     }
1099     zExp -= shiftCount;
1100     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1101
1102 }
1103
1104 #endif
1105
1106 /*----------------------------------------------------------------------------
1107 | Returns the result of converting the 32-bit two's complement integer `a'
1108 | to the single-precision floating-point format.  The conversion is performed
1109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 *----------------------------------------------------------------------------*/
1111
1112 float32 int32_to_float32( int32 a STATUS_PARAM )
1113 {
1114     flag zSign;
1115
1116     if ( a == 0 ) return float32_zero;
1117     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1118     zSign = ( a < 0 );
1119     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1120
1121 }
1122
1123 /*----------------------------------------------------------------------------
1124 | Returns the result of converting the 32-bit two's complement integer `a'
1125 | to the double-precision floating-point format.  The conversion is performed
1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1128
1129 float64 int32_to_float64( int32 a STATUS_PARAM )
1130 {
1131     flag zSign;
1132     uint32 absA;
1133     int8 shiftCount;
1134     bits64 zSig;
1135
1136     if ( a == 0 ) return float64_zero;
1137     zSign = ( a < 0 );
1138     absA = zSign ? - a : a;
1139     shiftCount = countLeadingZeros32( absA ) + 21;
1140     zSig = absA;
1141     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1142
1143 }
1144
1145 #ifdef FLOATX80
1146
1147 /*----------------------------------------------------------------------------
1148 | Returns the result of converting the 32-bit two's complement integer `a'
1149 | to the extended double-precision floating-point format.  The conversion
1150 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1151 | Arithmetic.
1152 *----------------------------------------------------------------------------*/
1153
1154 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1155 {
1156     flag zSign;
1157     uint32 absA;
1158     int8 shiftCount;
1159     bits64 zSig;
1160
1161     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1162     zSign = ( a < 0 );
1163     absA = zSign ? - a : a;
1164     shiftCount = countLeadingZeros32( absA ) + 32;
1165     zSig = absA;
1166     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1167
1168 }
1169
1170 #endif
1171
1172 #ifdef FLOAT128
1173
1174 /*----------------------------------------------------------------------------
1175 | Returns the result of converting the 32-bit two's complement integer `a' to
1176 | the quadruple-precision floating-point format.  The conversion is performed
1177 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1178 *----------------------------------------------------------------------------*/
1179
1180 float128 int32_to_float128( int32 a STATUS_PARAM )
1181 {
1182     flag zSign;
1183     uint32 absA;
1184     int8 shiftCount;
1185     bits64 zSig0;
1186
1187     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1188     zSign = ( a < 0 );
1189     absA = zSign ? - a : a;
1190     shiftCount = countLeadingZeros32( absA ) + 17;
1191     zSig0 = absA;
1192     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1193
1194 }
1195
1196 #endif
1197
1198 /*----------------------------------------------------------------------------
1199 | Returns the result of converting the 64-bit two's complement integer `a'
1200 | to the single-precision floating-point format.  The conversion is performed
1201 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1202 *----------------------------------------------------------------------------*/
1203
1204 float32 int64_to_float32( int64 a STATUS_PARAM )
1205 {
1206     flag zSign;
1207     uint64 absA;
1208     int8 shiftCount;
1209
1210     if ( a == 0 ) return float32_zero;
1211     zSign = ( a < 0 );
1212     absA = zSign ? - a : a;
1213     shiftCount = countLeadingZeros64( absA ) - 40;
1214     if ( 0 <= shiftCount ) {
1215         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1216     }
1217     else {
1218         shiftCount += 7;
1219         if ( shiftCount < 0 ) {
1220             shift64RightJamming( absA, - shiftCount, &absA );
1221         }
1222         else {
1223             absA <<= shiftCount;
1224         }
1225         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1226     }
1227
1228 }
1229
1230 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1231 {
1232     int8 shiftCount;
1233
1234     if ( a == 0 ) return float32_zero;
1235     shiftCount = countLeadingZeros64( a ) - 40;
1236     if ( 0 <= shiftCount ) {
1237         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1238     }
1239     else {
1240         shiftCount += 7;
1241         if ( shiftCount < 0 ) {
1242             shift64RightJamming( a, - shiftCount, &a );
1243         }
1244         else {
1245             a <<= shiftCount;
1246         }
1247         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1248     }
1249 }
1250
1251 /*----------------------------------------------------------------------------
1252 | Returns the result of converting the 64-bit two's complement integer `a'
1253 | to the double-precision floating-point format.  The conversion is performed
1254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255 *----------------------------------------------------------------------------*/
1256
1257 float64 int64_to_float64( int64 a STATUS_PARAM )
1258 {
1259     flag zSign;
1260
1261     if ( a == 0 ) return float64_zero;
1262     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1263         return packFloat64( 1, 0x43E, 0 );
1264     }
1265     zSign = ( a < 0 );
1266     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1267
1268 }
1269
1270 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1271 {
1272     if ( a == 0 ) return float64_zero;
1273     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1274
1275 }
1276
1277 #ifdef FLOATX80
1278
1279 /*----------------------------------------------------------------------------
1280 | Returns the result of converting the 64-bit two's complement integer `a'
1281 | to the extended double-precision floating-point format.  The conversion
1282 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1283 | Arithmetic.
1284 *----------------------------------------------------------------------------*/
1285
1286 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1287 {
1288     flag zSign;
1289     uint64 absA;
1290     int8 shiftCount;
1291
1292     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1293     zSign = ( a < 0 );
1294     absA = zSign ? - a : a;
1295     shiftCount = countLeadingZeros64( absA );
1296     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1297
1298 }
1299
1300 #endif
1301
1302 #ifdef FLOAT128
1303
1304 /*----------------------------------------------------------------------------
1305 | Returns the result of converting the 64-bit two's complement integer `a' to
1306 | the quadruple-precision floating-point format.  The conversion is performed
1307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1308 *----------------------------------------------------------------------------*/
1309
1310 float128 int64_to_float128( int64 a STATUS_PARAM )
1311 {
1312     flag zSign;
1313     uint64 absA;
1314     int8 shiftCount;
1315     int32 zExp;
1316     bits64 zSig0, zSig1;
1317
1318     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1319     zSign = ( a < 0 );
1320     absA = zSign ? - a : a;
1321     shiftCount = countLeadingZeros64( absA ) + 49;
1322     zExp = 0x406E - shiftCount;
1323     if ( 64 <= shiftCount ) {
1324         zSig1 = 0;
1325         zSig0 = absA;
1326         shiftCount -= 64;
1327     }
1328     else {
1329         zSig1 = absA;
1330         zSig0 = 0;
1331     }
1332     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1333     return packFloat128( zSign, zExp, zSig0, zSig1 );
1334
1335 }
1336
1337 #endif
1338
1339 /*----------------------------------------------------------------------------
1340 | Returns the result of converting the single-precision floating-point value
1341 | `a' to the 32-bit two's complement integer format.  The conversion is
1342 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1343 | Arithmetic---which means in particular that the conversion is rounded
1344 | according to the current rounding mode.  If `a' is a NaN, the largest
1345 | positive integer is returned.  Otherwise, if the conversion overflows, the
1346 | largest integer with the same sign as `a' is returned.
1347 *----------------------------------------------------------------------------*/
1348
1349 int32 float32_to_int32( float32 a STATUS_PARAM )
1350 {
1351     flag aSign;
1352     int16 aExp, shiftCount;
1353     bits32 aSig;
1354     bits64 aSig64;
1355
1356     a = float32_squash_input_denormal(a STATUS_VAR);
1357     aSig = extractFloat32Frac( a );
1358     aExp = extractFloat32Exp( a );
1359     aSign = extractFloat32Sign( a );
1360     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1361     if ( aExp ) aSig |= 0x00800000;
1362     shiftCount = 0xAF - aExp;
1363     aSig64 = aSig;
1364     aSig64 <<= 32;
1365     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1366     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1367
1368 }
1369
1370 /*----------------------------------------------------------------------------
1371 | Returns the result of converting the single-precision floating-point value
1372 | `a' to the 32-bit two's complement integer format.  The conversion is
1373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1374 | Arithmetic, except that the conversion is always rounded toward zero.
1375 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1376 | the conversion overflows, the largest integer with the same sign as `a' is
1377 | returned.
1378 *----------------------------------------------------------------------------*/
1379
1380 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1381 {
1382     flag aSign;
1383     int16 aExp, shiftCount;
1384     bits32 aSig;
1385     int32 z;
1386     a = float32_squash_input_denormal(a STATUS_VAR);
1387
1388     aSig = extractFloat32Frac( a );
1389     aExp = extractFloat32Exp( a );
1390     aSign = extractFloat32Sign( a );
1391     shiftCount = aExp - 0x9E;
1392     if ( 0 <= shiftCount ) {
1393         if ( float32_val(a) != 0xCF000000 ) {
1394             float_raise( float_flag_invalid STATUS_VAR);
1395             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1396         }
1397         return (sbits32) 0x80000000;
1398     }
1399     else if ( aExp <= 0x7E ) {
1400         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1401         return 0;
1402     }
1403     aSig = ( aSig | 0x00800000 )<<8;
1404     z = aSig>>( - shiftCount );
1405     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1406         STATUS(float_exception_flags) |= float_flag_inexact;
1407     }
1408     if ( aSign ) z = - z;
1409     return z;
1410
1411 }
1412
1413 /*----------------------------------------------------------------------------
1414 | Returns the result of converting the single-precision floating-point value
1415 | `a' to the 16-bit two's complement integer format.  The conversion is
1416 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 | Arithmetic, except that the conversion is always rounded toward zero.
1418 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1419 | the conversion overflows, the largest integer with the same sign as `a' is
1420 | returned.
1421 *----------------------------------------------------------------------------*/
1422
1423 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1424 {
1425     flag aSign;
1426     int16 aExp, shiftCount;
1427     bits32 aSig;
1428     int32 z;
1429
1430     aSig = extractFloat32Frac( a );
1431     aExp = extractFloat32Exp( a );
1432     aSign = extractFloat32Sign( a );
1433     shiftCount = aExp - 0x8E;
1434     if ( 0 <= shiftCount ) {
1435         if ( float32_val(a) != 0xC7000000 ) {
1436             float_raise( float_flag_invalid STATUS_VAR);
1437             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1438                 return 0x7FFF;
1439             }
1440         }
1441         return (sbits32) 0xffff8000;
1442     }
1443     else if ( aExp <= 0x7E ) {
1444         if ( aExp | aSig ) {
1445             STATUS(float_exception_flags) |= float_flag_inexact;
1446         }
1447         return 0;
1448     }
1449     shiftCount -= 0x10;
1450     aSig = ( aSig | 0x00800000 )<<8;
1451     z = aSig>>( - shiftCount );
1452     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1453         STATUS(float_exception_flags) |= float_flag_inexact;
1454     }
1455     if ( aSign ) {
1456         z = - z;
1457     }
1458     return z;
1459
1460 }
1461
1462 /*----------------------------------------------------------------------------
1463 | Returns the result of converting the single-precision floating-point value
1464 | `a' to the 64-bit two's complement integer format.  The conversion is
1465 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1466 | Arithmetic---which means in particular that the conversion is rounded
1467 | according to the current rounding mode.  If `a' is a NaN, the largest
1468 | positive integer is returned.  Otherwise, if the conversion overflows, the
1469 | largest integer with the same sign as `a' is returned.
1470 *----------------------------------------------------------------------------*/
1471
1472 int64 float32_to_int64( float32 a STATUS_PARAM )
1473 {
1474     flag aSign;
1475     int16 aExp, shiftCount;
1476     bits32 aSig;
1477     bits64 aSig64, aSigExtra;
1478     a = float32_squash_input_denormal(a STATUS_VAR);
1479
1480     aSig = extractFloat32Frac( a );
1481     aExp = extractFloat32Exp( a );
1482     aSign = extractFloat32Sign( a );
1483     shiftCount = 0xBE - aExp;
1484     if ( shiftCount < 0 ) {
1485         float_raise( float_flag_invalid STATUS_VAR);
1486         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1487             return LIT64( 0x7FFFFFFFFFFFFFFF );
1488         }
1489         return (sbits64) LIT64( 0x8000000000000000 );
1490     }
1491     if ( aExp ) aSig |= 0x00800000;
1492     aSig64 = aSig;
1493     aSig64 <<= 40;
1494     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1495     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1496
1497 }
1498
1499 /*----------------------------------------------------------------------------
1500 | Returns the result of converting the single-precision floating-point value
1501 | `a' to the 64-bit two's complement integer format.  The conversion is
1502 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1503 | Arithmetic, except that the conversion is always rounded toward zero.  If
1504 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1505 | conversion overflows, the largest integer with the same sign as `a' is
1506 | returned.
1507 *----------------------------------------------------------------------------*/
1508
1509 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1510 {
1511     flag aSign;
1512     int16 aExp, shiftCount;
1513     bits32 aSig;
1514     bits64 aSig64;
1515     int64 z;
1516     a = float32_squash_input_denormal(a STATUS_VAR);
1517
1518     aSig = extractFloat32Frac( a );
1519     aExp = extractFloat32Exp( a );
1520     aSign = extractFloat32Sign( a );
1521     shiftCount = aExp - 0xBE;
1522     if ( 0 <= shiftCount ) {
1523         if ( float32_val(a) != 0xDF000000 ) {
1524             float_raise( float_flag_invalid STATUS_VAR);
1525             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1526                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1527             }
1528         }
1529         return (sbits64) LIT64( 0x8000000000000000 );
1530     }
1531     else if ( aExp <= 0x7E ) {
1532         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1533         return 0;
1534     }
1535     aSig64 = aSig | 0x00800000;
1536     aSig64 <<= 40;
1537     z = aSig64>>( - shiftCount );
1538     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1539         STATUS(float_exception_flags) |= float_flag_inexact;
1540     }
1541     if ( aSign ) z = - z;
1542     return z;
1543
1544 }
1545
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the double-precision floating-point format.  The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1550 | Arithmetic.
1551 *----------------------------------------------------------------------------*/
1552
1553 float64 float32_to_float64( float32 a STATUS_PARAM )
1554 {
1555     flag aSign;
1556     int16 aExp;
1557     bits32 aSig;
1558     a = float32_squash_input_denormal(a STATUS_VAR);
1559
1560     aSig = extractFloat32Frac( a );
1561     aExp = extractFloat32Exp( a );
1562     aSign = extractFloat32Sign( a );
1563     if ( aExp == 0xFF ) {
1564         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1565         return packFloat64( aSign, 0x7FF, 0 );
1566     }
1567     if ( aExp == 0 ) {
1568         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1569         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1570         --aExp;
1571     }
1572     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1573
1574 }
1575
1576 #ifdef FLOATX80
1577
1578 /*----------------------------------------------------------------------------
1579 | Returns the result of converting the single-precision floating-point value
1580 | `a' to the extended double-precision floating-point format.  The conversion
1581 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1582 | Arithmetic.
1583 *----------------------------------------------------------------------------*/
1584
1585 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1586 {
1587     flag aSign;
1588     int16 aExp;
1589     bits32 aSig;
1590
1591     a = float32_squash_input_denormal(a STATUS_VAR);
1592     aSig = extractFloat32Frac( a );
1593     aExp = extractFloat32Exp( a );
1594     aSign = extractFloat32Sign( a );
1595     if ( aExp == 0xFF ) {
1596         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1597         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1598     }
1599     if ( aExp == 0 ) {
1600         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1601         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1602     }
1603     aSig |= 0x00800000;
1604     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1605
1606 }
1607
1608 #endif
1609
1610 #ifdef FLOAT128
1611
1612 /*----------------------------------------------------------------------------
1613 | Returns the result of converting the single-precision floating-point value
1614 | `a' to the double-precision floating-point format.  The conversion is
1615 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1616 | Arithmetic.
1617 *----------------------------------------------------------------------------*/
1618
1619 float128 float32_to_float128( float32 a STATUS_PARAM )
1620 {
1621     flag aSign;
1622     int16 aExp;
1623     bits32 aSig;
1624
1625     a = float32_squash_input_denormal(a STATUS_VAR);
1626     aSig = extractFloat32Frac( a );
1627     aExp = extractFloat32Exp( a );
1628     aSign = extractFloat32Sign( a );
1629     if ( aExp == 0xFF ) {
1630         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1631         return packFloat128( aSign, 0x7FFF, 0, 0 );
1632     }
1633     if ( aExp == 0 ) {
1634         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1635         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1636         --aExp;
1637     }
1638     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1639
1640 }
1641
1642 #endif
1643
1644 /*----------------------------------------------------------------------------
1645 | Rounds the single-precision floating-point value `a' to an integer, and
1646 | returns the result as a single-precision floating-point value.  The
1647 | operation is performed according to the IEC/IEEE Standard for Binary
1648 | Floating-Point Arithmetic.
1649 *----------------------------------------------------------------------------*/
1650
1651 float32 float32_round_to_int( float32 a STATUS_PARAM)
1652 {
1653     flag aSign;
1654     int16 aExp;
1655     bits32 lastBitMask, roundBitsMask;
1656     int8 roundingMode;
1657     bits32 z;
1658     a = float32_squash_input_denormal(a STATUS_VAR);
1659
1660     aExp = extractFloat32Exp( a );
1661     if ( 0x96 <= aExp ) {
1662         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1663             return propagateFloat32NaN( a, a STATUS_VAR );
1664         }
1665         return a;
1666     }
1667     if ( aExp <= 0x7E ) {
1668         if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1669         STATUS(float_exception_flags) |= float_flag_inexact;
1670         aSign = extractFloat32Sign( a );
1671         switch ( STATUS(float_rounding_mode) ) {
1672          case float_round_nearest_even:
1673             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1674                 return packFloat32( aSign, 0x7F, 0 );
1675             }
1676             break;
1677          case float_round_down:
1678             return make_float32(aSign ? 0xBF800000 : 0);
1679          case float_round_up:
1680             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1681         }
1682         return packFloat32( aSign, 0, 0 );
1683     }
1684     lastBitMask = 1;
1685     lastBitMask <<= 0x96 - aExp;
1686     roundBitsMask = lastBitMask - 1;
1687     z = float32_val(a);
1688     roundingMode = STATUS(float_rounding_mode);
1689     if ( roundingMode == float_round_nearest_even ) {
1690         z += lastBitMask>>1;
1691         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1692     }
1693     else if ( roundingMode != float_round_to_zero ) {
1694         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1695             z += roundBitsMask;
1696         }
1697     }
1698     z &= ~ roundBitsMask;
1699     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1700     return make_float32(z);
1701
1702 }
1703
1704 /*----------------------------------------------------------------------------
1705 | Returns the result of adding the absolute values of the single-precision
1706 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1707 | before being returned.  `zSign' is ignored if the result is a NaN.
1708 | The addition is performed according to the IEC/IEEE Standard for Binary
1709 | Floating-Point Arithmetic.
1710 *----------------------------------------------------------------------------*/
1711
1712 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1713 {
1714     int16 aExp, bExp, zExp;
1715     bits32 aSig, bSig, zSig;
1716     int16 expDiff;
1717
1718     aSig = extractFloat32Frac( a );
1719     aExp = extractFloat32Exp( a );
1720     bSig = extractFloat32Frac( b );
1721     bExp = extractFloat32Exp( b );
1722     expDiff = aExp - bExp;
1723     aSig <<= 6;
1724     bSig <<= 6;
1725     if ( 0 < expDiff ) {
1726         if ( aExp == 0xFF ) {
1727             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1728             return a;
1729         }
1730         if ( bExp == 0 ) {
1731             --expDiff;
1732         }
1733         else {
1734             bSig |= 0x20000000;
1735         }
1736         shift32RightJamming( bSig, expDiff, &bSig );
1737         zExp = aExp;
1738     }
1739     else if ( expDiff < 0 ) {
1740         if ( bExp == 0xFF ) {
1741             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1742             return packFloat32( zSign, 0xFF, 0 );
1743         }
1744         if ( aExp == 0 ) {
1745             ++expDiff;
1746         }
1747         else {
1748             aSig |= 0x20000000;
1749         }
1750         shift32RightJamming( aSig, - expDiff, &aSig );
1751         zExp = bExp;
1752     }
1753     else {
1754         if ( aExp == 0xFF ) {
1755             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1756             return a;
1757         }
1758         if ( aExp == 0 ) {
1759             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1760             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1761         }
1762         zSig = 0x40000000 + aSig + bSig;
1763         zExp = aExp;
1764         goto roundAndPack;
1765     }
1766     aSig |= 0x20000000;
1767     zSig = ( aSig + bSig )<<1;
1768     --zExp;
1769     if ( (sbits32) zSig < 0 ) {
1770         zSig = aSig + bSig;
1771         ++zExp;
1772     }
1773  roundAndPack:
1774     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1775
1776 }
1777
1778 /*----------------------------------------------------------------------------
1779 | Returns the result of subtracting the absolute values of the single-
1780 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1781 | difference is negated before being returned.  `zSign' is ignored if the
1782 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1783 | Standard for Binary Floating-Point Arithmetic.
1784 *----------------------------------------------------------------------------*/
1785
1786 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1787 {
1788     int16 aExp, bExp, zExp;
1789     bits32 aSig, bSig, zSig;
1790     int16 expDiff;
1791
1792     aSig = extractFloat32Frac( a );
1793     aExp = extractFloat32Exp( a );
1794     bSig = extractFloat32Frac( b );
1795     bExp = extractFloat32Exp( b );
1796     expDiff = aExp - bExp;
1797     aSig <<= 7;
1798     bSig <<= 7;
1799     if ( 0 < expDiff ) goto aExpBigger;
1800     if ( expDiff < 0 ) goto bExpBigger;
1801     if ( aExp == 0xFF ) {
1802         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1803         float_raise( float_flag_invalid STATUS_VAR);
1804         return float32_default_nan;
1805     }
1806     if ( aExp == 0 ) {
1807         aExp = 1;
1808         bExp = 1;
1809     }
1810     if ( bSig < aSig ) goto aBigger;
1811     if ( aSig < bSig ) goto bBigger;
1812     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1813  bExpBigger:
1814     if ( bExp == 0xFF ) {
1815         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816         return packFloat32( zSign ^ 1, 0xFF, 0 );
1817     }
1818     if ( aExp == 0 ) {
1819         ++expDiff;
1820     }
1821     else {
1822         aSig |= 0x40000000;
1823     }
1824     shift32RightJamming( aSig, - expDiff, &aSig );
1825     bSig |= 0x40000000;
1826  bBigger:
1827     zSig = bSig - aSig;
1828     zExp = bExp;
1829     zSign ^= 1;
1830     goto normalizeRoundAndPack;
1831  aExpBigger:
1832     if ( aExp == 0xFF ) {
1833         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834         return a;
1835     }
1836     if ( bExp == 0 ) {
1837         --expDiff;
1838     }
1839     else {
1840         bSig |= 0x40000000;
1841     }
1842     shift32RightJamming( bSig, expDiff, &bSig );
1843     aSig |= 0x40000000;
1844  aBigger:
1845     zSig = aSig - bSig;
1846     zExp = aExp;
1847  normalizeRoundAndPack:
1848     --zExp;
1849     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1850
1851 }
1852
1853 /*----------------------------------------------------------------------------
1854 | Returns the result of adding the single-precision floating-point values `a'
1855 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1856 | Binary Floating-Point Arithmetic.
1857 *----------------------------------------------------------------------------*/
1858
1859 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1860 {
1861     flag aSign, bSign;
1862     a = float32_squash_input_denormal(a STATUS_VAR);
1863     b = float32_squash_input_denormal(b STATUS_VAR);
1864
1865     aSign = extractFloat32Sign( a );
1866     bSign = extractFloat32Sign( b );
1867     if ( aSign == bSign ) {
1868         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1869     }
1870     else {
1871         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1872     }
1873
1874 }
1875
1876 /*----------------------------------------------------------------------------
1877 | Returns the result of subtracting the single-precision floating-point values
1878 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1879 | for Binary Floating-Point Arithmetic.
1880 *----------------------------------------------------------------------------*/
1881
1882 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1883 {
1884     flag aSign, bSign;
1885     a = float32_squash_input_denormal(a STATUS_VAR);
1886     b = float32_squash_input_denormal(b STATUS_VAR);
1887
1888     aSign = extractFloat32Sign( a );
1889     bSign = extractFloat32Sign( b );
1890     if ( aSign == bSign ) {
1891         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1892     }
1893     else {
1894         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1895     }
1896
1897 }
1898
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of multiplying the single-precision floating-point values
1901 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1902 | for Binary Floating-Point Arithmetic.
1903 *----------------------------------------------------------------------------*/
1904
1905 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1906 {
1907     flag aSign, bSign, zSign;
1908     int16 aExp, bExp, zExp;
1909     bits32 aSig, bSig;
1910     bits64 zSig64;
1911     bits32 zSig;
1912
1913     a = float32_squash_input_denormal(a STATUS_VAR);
1914     b = float32_squash_input_denormal(b STATUS_VAR);
1915
1916     aSig = extractFloat32Frac( a );
1917     aExp = extractFloat32Exp( a );
1918     aSign = extractFloat32Sign( a );
1919     bSig = extractFloat32Frac( b );
1920     bExp = extractFloat32Exp( b );
1921     bSign = extractFloat32Sign( b );
1922     zSign = aSign ^ bSign;
1923     if ( aExp == 0xFF ) {
1924         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1925             return propagateFloat32NaN( a, b STATUS_VAR );
1926         }
1927         if ( ( bExp | bSig ) == 0 ) {
1928             float_raise( float_flag_invalid STATUS_VAR);
1929             return float32_default_nan;
1930         }
1931         return packFloat32( zSign, 0xFF, 0 );
1932     }
1933     if ( bExp == 0xFF ) {
1934         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1935         if ( ( aExp | aSig ) == 0 ) {
1936             float_raise( float_flag_invalid STATUS_VAR);
1937             return float32_default_nan;
1938         }
1939         return packFloat32( zSign, 0xFF, 0 );
1940     }
1941     if ( aExp == 0 ) {
1942         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1943         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1944     }
1945     if ( bExp == 0 ) {
1946         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1947         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1948     }
1949     zExp = aExp + bExp - 0x7F;
1950     aSig = ( aSig | 0x00800000 )<<7;
1951     bSig = ( bSig | 0x00800000 )<<8;
1952     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1953     zSig = zSig64;
1954     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1955         zSig <<= 1;
1956         --zExp;
1957     }
1958     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1959
1960 }
1961
1962 /*----------------------------------------------------------------------------
1963 | Returns the result of dividing the single-precision floating-point value `a'
1964 | by the corresponding value `b'.  The operation is performed according to the
1965 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1966 *----------------------------------------------------------------------------*/
1967
1968 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1969 {
1970     flag aSign, bSign, zSign;
1971     int16 aExp, bExp, zExp;
1972     bits32 aSig, bSig, zSig;
1973     a = float32_squash_input_denormal(a STATUS_VAR);
1974     b = float32_squash_input_denormal(b STATUS_VAR);
1975
1976     aSig = extractFloat32Frac( a );
1977     aExp = extractFloat32Exp( a );
1978     aSign = extractFloat32Sign( a );
1979     bSig = extractFloat32Frac( b );
1980     bExp = extractFloat32Exp( b );
1981     bSign = extractFloat32Sign( b );
1982     zSign = aSign ^ bSign;
1983     if ( aExp == 0xFF ) {
1984         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1985         if ( bExp == 0xFF ) {
1986             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1987             float_raise( float_flag_invalid STATUS_VAR);
1988             return float32_default_nan;
1989         }
1990         return packFloat32( zSign, 0xFF, 0 );
1991     }
1992     if ( bExp == 0xFF ) {
1993         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1994         return packFloat32( zSign, 0, 0 );
1995     }
1996     if ( bExp == 0 ) {
1997         if ( bSig == 0 ) {
1998             if ( ( aExp | aSig ) == 0 ) {
1999                 float_raise( float_flag_invalid STATUS_VAR);
2000                 return float32_default_nan;
2001             }
2002             float_raise( float_flag_divbyzero STATUS_VAR);
2003             return packFloat32( zSign, 0xFF, 0 );
2004         }
2005         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2006     }
2007     if ( aExp == 0 ) {
2008         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2009         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2010     }
2011     zExp = aExp - bExp + 0x7D;
2012     aSig = ( aSig | 0x00800000 )<<7;
2013     bSig = ( bSig | 0x00800000 )<<8;
2014     if ( bSig <= ( aSig + aSig ) ) {
2015         aSig >>= 1;
2016         ++zExp;
2017     }
2018     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2019     if ( ( zSig & 0x3F ) == 0 ) {
2020         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2021     }
2022     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2023
2024 }
2025
2026 /*----------------------------------------------------------------------------
2027 | Returns the remainder of the single-precision floating-point value `a'
2028 | with respect to the corresponding value `b'.  The operation is performed
2029 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2030 *----------------------------------------------------------------------------*/
2031
2032 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2033 {
2034     flag aSign, zSign;
2035     int16 aExp, bExp, expDiff;
2036     bits32 aSig, bSig;
2037     bits32 q;
2038     bits64 aSig64, bSig64, q64;
2039     bits32 alternateASig;
2040     sbits32 sigMean;
2041     a = float32_squash_input_denormal(a STATUS_VAR);
2042     b = float32_squash_input_denormal(b STATUS_VAR);
2043
2044     aSig = extractFloat32Frac( a );
2045     aExp = extractFloat32Exp( a );
2046     aSign = extractFloat32Sign( a );
2047     bSig = extractFloat32Frac( b );
2048     bExp = extractFloat32Exp( b );
2049     if ( aExp == 0xFF ) {
2050         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2051             return propagateFloat32NaN( a, b STATUS_VAR );
2052         }
2053         float_raise( float_flag_invalid STATUS_VAR);
2054         return float32_default_nan;
2055     }
2056     if ( bExp == 0xFF ) {
2057         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2058         return a;
2059     }
2060     if ( bExp == 0 ) {
2061         if ( bSig == 0 ) {
2062             float_raise( float_flag_invalid STATUS_VAR);
2063             return float32_default_nan;
2064         }
2065         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2066     }
2067     if ( aExp == 0 ) {
2068         if ( aSig == 0 ) return a;
2069         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2070     }
2071     expDiff = aExp - bExp;
2072     aSig |= 0x00800000;
2073     bSig |= 0x00800000;
2074     if ( expDiff < 32 ) {
2075         aSig <<= 8;
2076         bSig <<= 8;
2077         if ( expDiff < 0 ) {
2078             if ( expDiff < -1 ) return a;
2079             aSig >>= 1;
2080         }
2081         q = ( bSig <= aSig );
2082         if ( q ) aSig -= bSig;
2083         if ( 0 < expDiff ) {
2084             q = ( ( (bits64) aSig )<<32 ) / bSig;
2085             q >>= 32 - expDiff;
2086             bSig >>= 2;
2087             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2088         }
2089         else {
2090             aSig >>= 2;
2091             bSig >>= 2;
2092         }
2093     }
2094     else {
2095         if ( bSig <= aSig ) aSig -= bSig;
2096         aSig64 = ( (bits64) aSig )<<40;
2097         bSig64 = ( (bits64) bSig )<<40;
2098         expDiff -= 64;
2099         while ( 0 < expDiff ) {
2100             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2101             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2102             aSig64 = - ( ( bSig * q64 )<<38 );
2103             expDiff -= 62;
2104         }
2105         expDiff += 64;
2106         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2107         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2108         q = q64>>( 64 - expDiff );
2109         bSig <<= 6;
2110         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2111     }
2112     do {
2113         alternateASig = aSig;
2114         ++q;
2115         aSig -= bSig;
2116     } while ( 0 <= (sbits32) aSig );
2117     sigMean = aSig + alternateASig;
2118     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2119         aSig = alternateASig;
2120     }
2121     zSign = ( (sbits32) aSig < 0 );
2122     if ( zSign ) aSig = - aSig;
2123     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2124
2125 }
2126
2127 /*----------------------------------------------------------------------------
2128 | Returns the square root of the single-precision floating-point value `a'.
2129 | The operation is performed according to the IEC/IEEE Standard for Binary
2130 | Floating-Point Arithmetic.
2131 *----------------------------------------------------------------------------*/
2132
2133 float32 float32_sqrt( float32 a STATUS_PARAM )
2134 {
2135     flag aSign;
2136     int16 aExp, zExp;
2137     bits32 aSig, zSig;
2138     bits64 rem, term;
2139     a = float32_squash_input_denormal(a STATUS_VAR);
2140
2141     aSig = extractFloat32Frac( a );
2142     aExp = extractFloat32Exp( a );
2143     aSign = extractFloat32Sign( a );
2144     if ( aExp == 0xFF ) {
2145         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2146         if ( ! aSign ) return a;
2147         float_raise( float_flag_invalid STATUS_VAR);
2148         return float32_default_nan;
2149     }
2150     if ( aSign ) {
2151         if ( ( aExp | aSig ) == 0 ) return a;
2152         float_raise( float_flag_invalid STATUS_VAR);
2153         return float32_default_nan;
2154     }
2155     if ( aExp == 0 ) {
2156         if ( aSig == 0 ) return float32_zero;
2157         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2158     }
2159     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2160     aSig = ( aSig | 0x00800000 )<<8;
2161     zSig = estimateSqrt32( aExp, aSig ) + 2;
2162     if ( ( zSig & 0x7F ) <= 5 ) {
2163         if ( zSig < 2 ) {
2164             zSig = 0x7FFFFFFF;
2165             goto roundAndPack;
2166         }
2167         aSig >>= aExp & 1;
2168         term = ( (bits64) zSig ) * zSig;
2169         rem = ( ( (bits64) aSig )<<32 ) - term;
2170         while ( (sbits64) rem < 0 ) {
2171             --zSig;
2172             rem += ( ( (bits64) zSig )<<1 ) | 1;
2173         }
2174         zSig |= ( rem != 0 );
2175     }
2176     shift32RightJamming( zSig, 1, &zSig );
2177  roundAndPack:
2178     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2179
2180 }
2181
2182 /*----------------------------------------------------------------------------
2183 | Returns the binary exponential of the single-precision floating-point value
2184 | `a'. The operation is performed according to the IEC/IEEE Standard for
2185 | Binary Floating-Point Arithmetic.
2186 |
2187 | Uses the following identities:
2188 |
2189 | 1. -------------------------------------------------------------------------
2190 |      x    x*ln(2)
2191 |     2  = e
2192 |
2193 | 2. -------------------------------------------------------------------------
2194 |                      2     3     4     5           n
2195 |      x        x     x     x     x     x           x
2196 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2197 |               1!    2!    3!    4!    5!          n!
2198 *----------------------------------------------------------------------------*/
2199
2200 static const float64 float32_exp2_coefficients[15] =
2201 {
2202     make_float64( 0x3ff0000000000000ll ), /*  1 */
2203     make_float64( 0x3fe0000000000000ll ), /*  2 */
2204     make_float64( 0x3fc5555555555555ll ), /*  3 */
2205     make_float64( 0x3fa5555555555555ll ), /*  4 */
2206     make_float64( 0x3f81111111111111ll ), /*  5 */
2207     make_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2208     make_float64( 0x3f2a01a01a01a01all ), /*  7 */
2209     make_float64( 0x3efa01a01a01a01all ), /*  8 */
2210     make_float64( 0x3ec71de3a556c734ll ), /*  9 */
2211     make_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2212     make_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2213     make_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2214     make_float64( 0x3de6124613a86d09ll ), /* 13 */
2215     make_float64( 0x3da93974a8c07c9dll ), /* 14 */
2216     make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2217 };
2218
2219 float32 float32_exp2( float32 a STATUS_PARAM )
2220 {
2221     flag aSign;
2222     int16 aExp;
2223     bits32 aSig;
2224     float64 r, x, xn;
2225     int i;
2226     a = float32_squash_input_denormal(a STATUS_VAR);
2227
2228     aSig = extractFloat32Frac( a );
2229     aExp = extractFloat32Exp( a );
2230     aSign = extractFloat32Sign( a );
2231
2232     if ( aExp == 0xFF) {
2233         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2234         return (aSign) ? float32_zero : a;
2235     }
2236     if (aExp == 0) {
2237         if (aSig == 0) return float32_one;
2238     }
2239
2240     float_raise( float_flag_inexact STATUS_VAR);
2241
2242     /* ******************************* */
2243     /* using float64 for approximation */
2244     /* ******************************* */
2245     x = float32_to_float64(a STATUS_VAR);
2246     x = float64_mul(x, float64_ln2 STATUS_VAR);
2247
2248     xn = x;
2249     r = float64_one;
2250     for (i = 0 ; i < 15 ; i++) {
2251         float64 f;
2252
2253         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2254         r = float64_add(r, f STATUS_VAR);
2255
2256         xn = float64_mul(xn, x STATUS_VAR);
2257     }
2258
2259     return float64_to_float32(r, status);
2260 }
2261
2262 /*----------------------------------------------------------------------------
2263 | Returns the binary log of the single-precision floating-point value `a'.
2264 | The operation is performed according to the IEC/IEEE Standard for Binary
2265 | Floating-Point Arithmetic.
2266 *----------------------------------------------------------------------------*/
2267 float32 float32_log2( float32 a STATUS_PARAM )
2268 {
2269     flag aSign, zSign;
2270     int16 aExp;
2271     bits32 aSig, zSig, i;
2272
2273     a = float32_squash_input_denormal(a STATUS_VAR);
2274     aSig = extractFloat32Frac( a );
2275     aExp = extractFloat32Exp( a );
2276     aSign = extractFloat32Sign( a );
2277
2278     if ( aExp == 0 ) {
2279         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2280         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2281     }
2282     if ( aSign ) {
2283         float_raise( float_flag_invalid STATUS_VAR);
2284         return float32_default_nan;
2285     }
2286     if ( aExp == 0xFF ) {
2287         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2288         return a;
2289     }
2290
2291     aExp -= 0x7F;
2292     aSig |= 0x00800000;
2293     zSign = aExp < 0;
2294     zSig = aExp << 23;
2295
2296     for (i = 1 << 22; i > 0; i >>= 1) {
2297         aSig = ( (bits64)aSig * aSig ) >> 23;
2298         if ( aSig & 0x01000000 ) {
2299             aSig >>= 1;
2300             zSig |= i;
2301         }
2302     }
2303
2304     if ( zSign )
2305         zSig = -zSig;
2306
2307     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2308 }
2309
2310 /*----------------------------------------------------------------------------
2311 | Returns 1 if the single-precision floating-point value `a' is equal to
2312 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2313 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2314 *----------------------------------------------------------------------------*/
2315
2316 int float32_eq( float32 a, float32 b STATUS_PARAM )
2317 {
2318     a = float32_squash_input_denormal(a STATUS_VAR);
2319     b = float32_squash_input_denormal(b STATUS_VAR);
2320
2321     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2322          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2323        ) {
2324         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2325             float_raise( float_flag_invalid STATUS_VAR);
2326         }
2327         return 0;
2328     }
2329     return ( float32_val(a) == float32_val(b) ) ||
2330             ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2331
2332 }
2333
2334 /*----------------------------------------------------------------------------
2335 | Returns 1 if the single-precision floating-point value `a' is less than
2336 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2338 | Arithmetic.
2339 *----------------------------------------------------------------------------*/
2340
2341 int float32_le( float32 a, float32 b STATUS_PARAM )
2342 {
2343     flag aSign, bSign;
2344     bits32 av, bv;
2345     a = float32_squash_input_denormal(a STATUS_VAR);
2346     b = float32_squash_input_denormal(b STATUS_VAR);
2347
2348     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2349          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2350        ) {
2351         float_raise( float_flag_invalid STATUS_VAR);
2352         return 0;
2353     }
2354     aSign = extractFloat32Sign( a );
2355     bSign = extractFloat32Sign( b );
2356     av = float32_val(a);
2357     bv = float32_val(b);
2358     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2359     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2360
2361 }
2362
2363 /*----------------------------------------------------------------------------
2364 | Returns 1 if the single-precision floating-point value `a' is less than
2365 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2367 *----------------------------------------------------------------------------*/
2368
2369 int float32_lt( float32 a, float32 b STATUS_PARAM )
2370 {
2371     flag aSign, bSign;
2372     bits32 av, bv;
2373     a = float32_squash_input_denormal(a STATUS_VAR);
2374     b = float32_squash_input_denormal(b STATUS_VAR);
2375
2376     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2377          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2378        ) {
2379         float_raise( float_flag_invalid STATUS_VAR);
2380         return 0;
2381     }
2382     aSign = extractFloat32Sign( a );
2383     bSign = extractFloat32Sign( b );
2384     av = float32_val(a);
2385     bv = float32_val(b);
2386     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2387     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2388
2389 }
2390
2391 /*----------------------------------------------------------------------------
2392 | Returns 1 if the single-precision floating-point value `a' is equal to
2393 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2394 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2396 *----------------------------------------------------------------------------*/
2397
2398 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2399 {
2400     bits32 av, bv;
2401     a = float32_squash_input_denormal(a STATUS_VAR);
2402     b = float32_squash_input_denormal(b STATUS_VAR);
2403
2404     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2405          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2406        ) {
2407         float_raise( float_flag_invalid STATUS_VAR);
2408         return 0;
2409     }
2410     av = float32_val(a);
2411     bv = float32_val(b);
2412     return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2413
2414 }
2415
2416 /*----------------------------------------------------------------------------
2417 | Returns 1 if the single-precision floating-point value `a' is less than or
2418 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2419 | cause an exception.  Otherwise, the comparison is performed according to the
2420 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2421 *----------------------------------------------------------------------------*/
2422
2423 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2424 {
2425     flag aSign, bSign;
2426     bits32 av, bv;
2427     a = float32_squash_input_denormal(a STATUS_VAR);
2428     b = float32_squash_input_denormal(b STATUS_VAR);
2429
2430     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2431          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2432        ) {
2433         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2434             float_raise( float_flag_invalid STATUS_VAR);
2435         }
2436         return 0;
2437     }
2438     aSign = extractFloat32Sign( a );
2439     bSign = extractFloat32Sign( b );
2440     av = float32_val(a);
2441     bv = float32_val(b);
2442     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2443     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2444
2445 }
2446
2447 /*----------------------------------------------------------------------------
2448 | Returns 1 if the single-precision floating-point value `a' is less than
2449 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2450 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2451 | Standard for Binary Floating-Point Arithmetic.
2452 *----------------------------------------------------------------------------*/
2453
2454 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2455 {
2456     flag aSign, bSign;
2457     bits32 av, bv;
2458     a = float32_squash_input_denormal(a STATUS_VAR);
2459     b = float32_squash_input_denormal(b STATUS_VAR);
2460
2461     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2462          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2463        ) {
2464         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2465             float_raise( float_flag_invalid STATUS_VAR);
2466         }
2467         return 0;
2468     }
2469     aSign = extractFloat32Sign( a );
2470     bSign = extractFloat32Sign( b );
2471     av = float32_val(a);
2472     bv = float32_val(b);
2473     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2474     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2475
2476 }
2477
2478 /*----------------------------------------------------------------------------
2479 | Returns the result of converting the double-precision floating-point value
2480 | `a' to the 32-bit two's complement integer format.  The conversion is
2481 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2482 | Arithmetic---which means in particular that the conversion is rounded
2483 | according to the current rounding mode.  If `a' is a NaN, the largest
2484 | positive integer is returned.  Otherwise, if the conversion overflows, the
2485 | largest integer with the same sign as `a' is returned.
2486 *----------------------------------------------------------------------------*/
2487
2488 int32 float64_to_int32( float64 a STATUS_PARAM )
2489 {
2490     flag aSign;
2491     int16 aExp, shiftCount;
2492     bits64 aSig;
2493     a = float64_squash_input_denormal(a STATUS_VAR);
2494
2495     aSig = extractFloat64Frac( a );
2496     aExp = extractFloat64Exp( a );
2497     aSign = extractFloat64Sign( a );
2498     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2499     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2500     shiftCount = 0x42C - aExp;
2501     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2502     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2503
2504 }
2505
2506 /*----------------------------------------------------------------------------
2507 | Returns the result of converting the double-precision floating-point value
2508 | `a' to the 32-bit two's complement integer format.  The conversion is
2509 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2510 | Arithmetic, except that the conversion is always rounded toward zero.
2511 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2512 | the conversion overflows, the largest integer with the same sign as `a' is
2513 | returned.
2514 *----------------------------------------------------------------------------*/
2515
2516 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2517 {
2518     flag aSign;
2519     int16 aExp, shiftCount;
2520     bits64 aSig, savedASig;
2521     int32 z;
2522     a = float64_squash_input_denormal(a STATUS_VAR);
2523
2524     aSig = extractFloat64Frac( a );
2525     aExp = extractFloat64Exp( a );
2526     aSign = extractFloat64Sign( a );
2527     if ( 0x41E < aExp ) {
2528         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2529         goto invalid;
2530     }
2531     else if ( aExp < 0x3FF ) {
2532         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2533         return 0;
2534     }
2535     aSig |= LIT64( 0x0010000000000000 );
2536     shiftCount = 0x433 - aExp;
2537     savedASig = aSig;
2538     aSig >>= shiftCount;
2539     z = aSig;
2540     if ( aSign ) z = - z;
2541     if ( ( z < 0 ) ^ aSign ) {
2542  invalid:
2543         float_raise( float_flag_invalid STATUS_VAR);
2544         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2545     }
2546     if ( ( aSig<<shiftCount ) != savedASig ) {
2547         STATUS(float_exception_flags) |= float_flag_inexact;
2548     }
2549     return z;
2550
2551 }
2552
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of converting the double-precision floating-point value
2555 | `a' to the 16-bit two's complement integer format.  The conversion is
2556 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2557 | Arithmetic, except that the conversion is always rounded toward zero.
2558 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2559 | the conversion overflows, the largest integer with the same sign as `a' is
2560 | returned.
2561 *----------------------------------------------------------------------------*/
2562
2563 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2564 {
2565     flag aSign;
2566     int16 aExp, shiftCount;
2567     bits64 aSig, savedASig;
2568     int32 z;
2569
2570     aSig = extractFloat64Frac( a );
2571     aExp = extractFloat64Exp( a );
2572     aSign = extractFloat64Sign( a );
2573     if ( 0x40E < aExp ) {
2574         if ( ( aExp == 0x7FF ) && aSig ) {
2575             aSign = 0;
2576         }
2577         goto invalid;
2578     }
2579     else if ( aExp < 0x3FF ) {
2580         if ( aExp || aSig ) {
2581             STATUS(float_exception_flags) |= float_flag_inexact;
2582         }
2583         return 0;
2584     }
2585     aSig |= LIT64( 0x0010000000000000 );
2586     shiftCount = 0x433 - aExp;
2587     savedASig = aSig;
2588     aSig >>= shiftCount;
2589     z = aSig;
2590     if ( aSign ) {
2591         z = - z;
2592     }
2593     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2594  invalid:
2595         float_raise( float_flag_invalid STATUS_VAR);
2596         return aSign ? (sbits32) 0xffff8000 : 0x7FFF;
2597     }
2598     if ( ( aSig<<shiftCount ) != savedASig ) {
2599         STATUS(float_exception_flags) |= float_flag_inexact;
2600     }
2601     return z;
2602 }
2603
2604 /*----------------------------------------------------------------------------
2605 | Returns the result of converting the double-precision floating-point value
2606 | `a' to the 64-bit two's complement integer format.  The conversion is
2607 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2608 | Arithmetic---which means in particular that the conversion is rounded
2609 | according to the current rounding mode.  If `a' is a NaN, the largest
2610 | positive integer is returned.  Otherwise, if the conversion overflows, the
2611 | largest integer with the same sign as `a' is returned.
2612 *----------------------------------------------------------------------------*/
2613
2614 int64 float64_to_int64( float64 a STATUS_PARAM )
2615 {
2616     flag aSign;
2617     int16 aExp, shiftCount;
2618     bits64 aSig, aSigExtra;
2619     a = float64_squash_input_denormal(a STATUS_VAR);
2620
2621     aSig = extractFloat64Frac( a );
2622     aExp = extractFloat64Exp( a );
2623     aSign = extractFloat64Sign( a );
2624     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2625     shiftCount = 0x433 - aExp;
2626     if ( shiftCount <= 0 ) {
2627         if ( 0x43E < aExp ) {
2628             float_raise( float_flag_invalid STATUS_VAR);
2629             if (    ! aSign
2630                  || (    ( aExp == 0x7FF )
2631                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2632                ) {
2633                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2634             }
2635             return (sbits64) LIT64( 0x8000000000000000 );
2636         }
2637         aSigExtra = 0;
2638         aSig <<= - shiftCount;
2639     }
2640     else {
2641         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2642     }
2643     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2644
2645 }
2646
2647 /*----------------------------------------------------------------------------
2648 | Returns the result of converting the double-precision floating-point value
2649 | `a' to the 64-bit two's complement integer format.  The conversion is
2650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2651 | Arithmetic, except that the conversion is always rounded toward zero.
2652 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2653 | the conversion overflows, the largest integer with the same sign as `a' is
2654 | returned.
2655 *----------------------------------------------------------------------------*/
2656
2657 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2658 {
2659     flag aSign;
2660     int16 aExp, shiftCount;
2661     bits64 aSig;
2662     int64 z;
2663     a = float64_squash_input_denormal(a STATUS_VAR);
2664
2665     aSig = extractFloat64Frac( a );
2666     aExp = extractFloat64Exp( a );
2667     aSign = extractFloat64Sign( a );
2668     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2669     shiftCount = aExp - 0x433;
2670     if ( 0 <= shiftCount ) {
2671         if ( 0x43E <= aExp ) {
2672             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2673                 float_raise( float_flag_invalid STATUS_VAR);
2674                 if (    ! aSign
2675                      || (    ( aExp == 0x7FF )
2676                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2677                    ) {
2678                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2679                 }
2680             }
2681             return (sbits64) LIT64( 0x8000000000000000 );
2682         }
2683         z = aSig<<shiftCount;
2684     }
2685     else {
2686         if ( aExp < 0x3FE ) {
2687             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2688             return 0;
2689         }
2690         z = aSig>>( - shiftCount );
2691         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2692             STATUS(float_exception_flags) |= float_flag_inexact;
2693         }
2694     }
2695     if ( aSign ) z = - z;
2696     return z;
2697
2698 }
2699
2700 /*----------------------------------------------------------------------------
2701 | Returns the result of converting the double-precision floating-point value
2702 | `a' to the single-precision floating-point format.  The conversion is
2703 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2704 | Arithmetic.
2705 *----------------------------------------------------------------------------*/
2706
2707 float32 float64_to_float32( float64 a STATUS_PARAM )
2708 {
2709     flag aSign;
2710     int16 aExp;
2711     bits64 aSig;
2712     bits32 zSig;
2713     a = float64_squash_input_denormal(a STATUS_VAR);
2714
2715     aSig = extractFloat64Frac( a );
2716     aExp = extractFloat64Exp( a );
2717     aSign = extractFloat64Sign( a );
2718     if ( aExp == 0x7FF ) {
2719         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2720         return packFloat32( aSign, 0xFF, 0 );
2721     }
2722     shift64RightJamming( aSig, 22, &aSig );
2723     zSig = aSig;
2724     if ( aExp || zSig ) {
2725         zSig |= 0x40000000;
2726         aExp -= 0x381;
2727     }
2728     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2729
2730 }
2731
2732
2733 /*----------------------------------------------------------------------------
2734 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2735 | half-precision floating-point value, returning the result.  After being
2736 | shifted into the proper positions, the three fields are simply added
2737 | together to form the result.  This means that any integer portion of `zSig'
2738 | will be added into the exponent.  Since a properly normalized significand
2739 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2740 | than the desired result exponent whenever `zSig' is a complete, normalized
2741 | significand.
2742 *----------------------------------------------------------------------------*/
2743 static float16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2744 {
2745     return make_float16(
2746         (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig);
2747 }
2748
2749 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2750    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2751
2752 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2753 {
2754     flag aSign;
2755     int16 aExp;
2756     bits32 aSig;
2757
2758     aSign = extractFloat16Sign(a);
2759     aExp = extractFloat16Exp(a);
2760     aSig = extractFloat16Frac(a);
2761
2762     if (aExp == 0x1f && ieee) {
2763         if (aSig) {
2764             /* Make sure correct exceptions are raised.  */
2765             float32ToCommonNaN(a STATUS_VAR);
2766             aSig |= 0x200;
2767         }
2768         return packFloat32(aSign, 0xff, aSig << 13);
2769     }
2770     if (aExp == 0) {
2771         int8 shiftCount;
2772
2773         if (aSig == 0) {
2774             return packFloat32(aSign, 0, 0);
2775         }
2776
2777         shiftCount = countLeadingZeros32( aSig ) - 21;
2778         aSig = aSig << shiftCount;
2779         aExp = -shiftCount;
2780     }
2781     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2782 }
2783
2784 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2785 {
2786     flag aSign;
2787     int16 aExp;
2788     bits32 aSig;
2789     bits32 mask;
2790     bits32 increment;
2791     int8 roundingMode;
2792     a = float32_squash_input_denormal(a STATUS_VAR);
2793
2794     aSig = extractFloat32Frac( a );
2795     aExp = extractFloat32Exp( a );
2796     aSign = extractFloat32Sign( a );
2797     if ( aExp == 0xFF ) {
2798         if (aSig) {
2799             /* Input is a NaN */
2800             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2801             if (!ieee) {
2802                 return packFloat16(aSign, 0, 0);
2803             }
2804             return r;
2805         }
2806         /* Infinity */
2807         if (!ieee) {
2808             float_raise(float_flag_invalid STATUS_VAR);
2809             return packFloat16(aSign, 0x1f, 0x3ff);
2810         }
2811         return packFloat16(aSign, 0x1f, 0);
2812     }
2813     if (aExp == 0 && aSig == 0) {
2814         return packFloat16(aSign, 0, 0);
2815     }
2816     /* Decimal point between bits 22 and 23.  */
2817     aSig |= 0x00800000;
2818     aExp -= 0x7f;
2819     if (aExp < -14) {
2820         mask = 0x00ffffff;
2821         if (aExp >= -24) {
2822             mask >>= 25 + aExp;
2823         }
2824     } else {
2825         mask = 0x00001fff;
2826     }
2827     if (aSig & mask) {
2828         float_raise( float_flag_underflow STATUS_VAR );
2829         roundingMode = STATUS(float_rounding_mode);
2830         switch (roundingMode) {
2831         case float_round_nearest_even:
2832             increment = (mask + 1) >> 1;
2833             if ((aSig & mask) == increment) {
2834                 increment = aSig & (increment << 1);
2835             }
2836             break;
2837         case float_round_up:
2838             increment = aSign ? 0 : mask;
2839             break;
2840         case float_round_down:
2841             increment = aSign ? mask : 0;
2842             break;
2843         default: /* round_to_zero */
2844             increment = 0;
2845             break;
2846         }
2847         aSig += increment;
2848         if (aSig >= 0x01000000) {
2849             aSig >>= 1;
2850             aExp++;
2851         }
2852     } else if (aExp < -14
2853           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2854         float_raise( float_flag_underflow STATUS_VAR);
2855     }
2856
2857     if (ieee) {
2858         if (aExp > 15) {
2859             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2860             return packFloat16(aSign, 0x1f, 0);
2861         }
2862     } else {
2863         if (aExp > 16) {
2864             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2865             return packFloat16(aSign, 0x1f, 0x3ff);
2866         }
2867     }
2868     if (aExp < -24) {
2869         return packFloat16(aSign, 0, 0);
2870     }
2871     if (aExp < -14) {
2872         aSig >>= -14 - aExp;
2873         aExp = -14;
2874     }
2875     return packFloat16(aSign, aExp + 14, aSig >> 13);
2876 }
2877
2878 #ifdef FLOATX80
2879
2880 /*----------------------------------------------------------------------------
2881 | Returns the result of converting the double-precision floating-point value
2882 | `a' to the extended double-precision floating-point format.  The conversion
2883 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2884 | Arithmetic.
2885 *----------------------------------------------------------------------------*/
2886
2887 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2888 {
2889     flag aSign;
2890     int16 aExp;
2891     bits64 aSig;
2892
2893     a = float64_squash_input_denormal(a STATUS_VAR);
2894     aSig = extractFloat64Frac( a );
2895     aExp = extractFloat64Exp( a );
2896     aSign = extractFloat64Sign( a );
2897     if ( aExp == 0x7FF ) {
2898         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2899         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2900     }
2901     if ( aExp == 0 ) {
2902         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2903         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2904     }
2905     return
2906         packFloatx80(
2907             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2908
2909 }
2910
2911 #endif
2912
2913 #ifdef FLOAT128
2914
2915 /*----------------------------------------------------------------------------
2916 | Returns the result of converting the double-precision floating-point value
2917 | `a' to the quadruple-precision floating-point format.  The conversion is
2918 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2919 | Arithmetic.
2920 *----------------------------------------------------------------------------*/
2921
2922 float128 float64_to_float128( float64 a STATUS_PARAM )
2923 {
2924     flag aSign;
2925     int16 aExp;
2926     bits64 aSig, zSig0, zSig1;
2927
2928     a = float64_squash_input_denormal(a STATUS_VAR);
2929     aSig = extractFloat64Frac( a );
2930     aExp = extractFloat64Exp( a );
2931     aSign = extractFloat64Sign( a );
2932     if ( aExp == 0x7FF ) {
2933         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2934         return packFloat128( aSign, 0x7FFF, 0, 0 );
2935     }
2936     if ( aExp == 0 ) {
2937         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2938         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2939         --aExp;
2940     }
2941     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2942     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2943
2944 }
2945
2946 #endif
2947
2948 /*----------------------------------------------------------------------------
2949 | Rounds the double-precision floating-point value `a' to an integer, and
2950 | returns the result as a double-precision floating-point value.  The
2951 | operation is performed according to the IEC/IEEE Standard for Binary
2952 | Floating-Point Arithmetic.
2953 *----------------------------------------------------------------------------*/
2954
2955 float64 float64_round_to_int( float64 a STATUS_PARAM )
2956 {
2957     flag aSign;
2958     int16 aExp;
2959     bits64 lastBitMask, roundBitsMask;
2960     int8 roundingMode;
2961     bits64 z;
2962     a = float64_squash_input_denormal(a STATUS_VAR);
2963
2964     aExp = extractFloat64Exp( a );
2965     if ( 0x433 <= aExp ) {
2966         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2967             return propagateFloat64NaN( a, a STATUS_VAR );
2968         }
2969         return a;
2970     }
2971     if ( aExp < 0x3FF ) {
2972         if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2973         STATUS(float_exception_flags) |= float_flag_inexact;
2974         aSign = extractFloat64Sign( a );
2975         switch ( STATUS(float_rounding_mode) ) {
2976          case float_round_nearest_even:
2977             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2978                 return packFloat64( aSign, 0x3FF, 0 );
2979             }
2980             break;
2981          case float_round_down:
2982             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2983          case float_round_up:
2984             return make_float64(
2985             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2986         }
2987         return packFloat64( aSign, 0, 0 );
2988     }
2989     lastBitMask = 1;
2990     lastBitMask <<= 0x433 - aExp;
2991     roundBitsMask = lastBitMask - 1;
2992     z = float64_val(a);
2993     roundingMode = STATUS(float_rounding_mode);
2994     if ( roundingMode == float_round_nearest_even ) {
2995         z += lastBitMask>>1;
2996         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2997     }
2998     else if ( roundingMode != float_round_to_zero ) {
2999         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3000             z += roundBitsMask;
3001         }
3002     }
3003     z &= ~ roundBitsMask;
3004     if ( z != float64_val(a) )
3005         STATUS(float_exception_flags) |= float_flag_inexact;
3006     return make_float64(z);
3007
3008 }
3009
3010 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3011 {
3012     int oldmode;
3013     float64 res;
3014     oldmode = STATUS(float_rounding_mode);
3015     STATUS(float_rounding_mode) = float_round_to_zero;
3016     res = float64_round_to_int(a STATUS_VAR);
3017     STATUS(float_rounding_mode) = oldmode;
3018     return res;
3019 }
3020
3021 /*----------------------------------------------------------------------------
3022 | Returns the result of adding the absolute values of the double-precision
3023 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3024 | before being returned.  `zSign' is ignored if the result is a NaN.
3025 | The addition is performed according to the IEC/IEEE Standard for Binary
3026 | Floating-Point Arithmetic.
3027 *----------------------------------------------------------------------------*/
3028
3029 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3030 {
3031     int16 aExp, bExp, zExp;
3032     bits64 aSig, bSig, zSig;
3033     int16 expDiff;
3034
3035     aSig = extractFloat64Frac( a );
3036     aExp = extractFloat64Exp( a );
3037     bSig = extractFloat64Frac( b );
3038     bExp = extractFloat64Exp( b );
3039     expDiff = aExp - bExp;
3040     aSig <<= 9;
3041     bSig <<= 9;
3042     if ( 0 < expDiff ) {
3043         if ( aExp == 0x7FF ) {
3044             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3045             return a;
3046         }
3047         if ( bExp == 0 ) {
3048             --expDiff;
3049         }
3050         else {
3051             bSig |= LIT64( 0x2000000000000000 );
3052         }
3053         shift64RightJamming( bSig, expDiff, &bSig );
3054         zExp = aExp;
3055     }
3056     else if ( expDiff < 0 ) {
3057         if ( bExp == 0x7FF ) {
3058             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3059             return packFloat64( zSign, 0x7FF, 0 );
3060         }
3061         if ( aExp == 0 ) {
3062             ++expDiff;
3063         }
3064         else {
3065             aSig |= LIT64( 0x2000000000000000 );
3066         }
3067         shift64RightJamming( aSig, - expDiff, &aSig );
3068         zExp = bExp;
3069     }
3070     else {
3071         if ( aExp == 0x7FF ) {
3072             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3073             return a;
3074         }
3075         if ( aExp == 0 ) {
3076             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3077             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3078         }
3079         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3080         zExp = aExp;
3081         goto roundAndPack;
3082     }
3083     aSig |= LIT64( 0x2000000000000000 );
3084     zSig = ( aSig + bSig )<<1;
3085     --zExp;
3086     if ( (sbits64) zSig < 0 ) {
3087         zSig = aSig + bSig;
3088         ++zExp;
3089     }
3090  roundAndPack:
3091     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3092
3093 }
3094
3095 /*----------------------------------------------------------------------------
3096 | Returns the result of subtracting the absolute values of the double-
3097 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3098 | difference is negated before being returned.  `zSign' is ignored if the
3099 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3100 | Standard for Binary Floating-Point Arithmetic.
3101 *----------------------------------------------------------------------------*/
3102
3103 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3104 {
3105     int16 aExp, bExp, zExp;
3106     bits64 aSig, bSig, zSig;
3107     int16 expDiff;
3108
3109     aSig = extractFloat64Frac( a );
3110     aExp = extractFloat64Exp( a );
3111     bSig = extractFloat64Frac( b );
3112     bExp = extractFloat64Exp( b );
3113     expDiff = aExp - bExp;
3114     aSig <<= 10;
3115     bSig <<= 10;
3116     if ( 0 < expDiff ) goto aExpBigger;
3117     if ( expDiff < 0 ) goto bExpBigger;
3118     if ( aExp == 0x7FF ) {
3119         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3120         float_raise( float_flag_invalid STATUS_VAR);
3121         return float64_default_nan;
3122     }
3123     if ( aExp == 0 ) {
3124         aExp = 1;
3125         bExp = 1;
3126     }
3127     if ( bSig < aSig ) goto aBigger;
3128     if ( aSig < bSig ) goto bBigger;
3129     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3130  bExpBigger:
3131     if ( bExp == 0x7FF ) {
3132         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3133         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3134     }
3135     if ( aExp == 0 ) {
3136         ++expDiff;
3137     }
3138     else {
3139         aSig |= LIT64( 0x4000000000000000 );
3140     }
3141     shift64RightJamming( aSig, - expDiff, &aSig );
3142     bSig |= LIT64( 0x4000000000000000 );
3143  bBigger:
3144     zSig = bSig - aSig;
3145     zExp = bExp;
3146     zSign ^= 1;
3147     goto normalizeRoundAndPack;
3148  aExpBigger:
3149     if ( aExp == 0x7FF ) {
3150         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3151         return a;
3152     }
3153     if ( bExp == 0 ) {
3154         --expDiff;
3155     }
3156     else {
3157         bSig |= LIT64( 0x4000000000000000 );
3158     }
3159     shift64RightJamming( bSig, expDiff, &bSig );
3160     aSig |= LIT64( 0x4000000000000000 );
3161  aBigger:
3162     zSig = aSig - bSig;
3163     zExp = aExp;
3164  normalizeRoundAndPack:
3165     --zExp;
3166     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3167
3168 }
3169
3170 /*----------------------------------------------------------------------------
3171 | Returns the result of adding the double-precision floating-point values `a'
3172 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3173 | Binary Floating-Point Arithmetic.
3174 *----------------------------------------------------------------------------*/
3175
3176 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3177 {
3178     flag aSign, bSign;
3179     a = float64_squash_input_denormal(a STATUS_VAR);
3180     b = float64_squash_input_denormal(b STATUS_VAR);
3181
3182     aSign = extractFloat64Sign( a );
3183     bSign = extractFloat64Sign( b );
3184     if ( aSign == bSign ) {
3185         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3186     }
3187     else {
3188         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3189     }
3190
3191 }
3192
3193 /*----------------------------------------------------------------------------
3194 | Returns the result of subtracting the double-precision floating-point values
3195 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3196 | for Binary Floating-Point Arithmetic.
3197 *----------------------------------------------------------------------------*/
3198
3199 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3200 {
3201     flag aSign, bSign;
3202     a = float64_squash_input_denormal(a STATUS_VAR);
3203     b = float64_squash_input_denormal(b STATUS_VAR);
3204
3205     aSign = extractFloat64Sign( a );
3206     bSign = extractFloat64Sign( b );
3207     if ( aSign == bSign ) {
3208         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3209     }
3210     else {
3211         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3212     }
3213
3214 }
3215
3216 /*----------------------------------------------------------------------------
3217 | Returns the result of multiplying the double-precision floating-point values
3218 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3219 | for Binary Floating-Point Arithmetic.
3220 *----------------------------------------------------------------------------*/
3221
3222 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3223 {
3224     flag aSign, bSign, zSign;
3225     int16 aExp, bExp, zExp;
3226     bits64 aSig, bSig, zSig0, zSig1;
3227
3228     a = float64_squash_input_denormal(a STATUS_VAR);
3229     b = float64_squash_input_denormal(b STATUS_VAR);
3230
3231     aSig = extractFloat64Frac( a );
3232     aExp = extractFloat64Exp( a );
3233     aSign = extractFloat64Sign( a );
3234     bSig = extractFloat64Frac( b );
3235     bExp = extractFloat64Exp( b );
3236     bSign = extractFloat64Sign( b );
3237     zSign = aSign ^ bSign;
3238     if ( aExp == 0x7FF ) {
3239         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3240             return propagateFloat64NaN( a, b STATUS_VAR );
3241         }
3242         if ( ( bExp | bSig ) == 0 ) {
3243             float_raise( float_flag_invalid STATUS_VAR);
3244             return float64_default_nan;
3245         }
3246         return packFloat64( zSign, 0x7FF, 0 );
3247     }
3248     if ( bExp == 0x7FF ) {
3249         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3250         if ( ( aExp | aSig ) == 0 ) {
3251             float_raise( float_flag_invalid STATUS_VAR);
3252             return float64_default_nan;
3253         }
3254         return packFloat64( zSign, 0x7FF, 0 );
3255     }
3256     if ( aExp == 0 ) {
3257         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3258         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3259     }
3260     if ( bExp == 0 ) {
3261         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3262         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3263     }
3264     zExp = aExp + bExp - 0x3FF;
3265     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3266     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3267     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3268     zSig0 |= ( zSig1 != 0 );
3269     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3270         zSig0 <<= 1;
3271         --zExp;
3272     }
3273     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3274
3275 }
3276
3277 /*----------------------------------------------------------------------------
3278 | Returns the result of dividing the double-precision floating-point value `a'
3279 | by the corresponding value `b'.  The operation is performed according to
3280 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3281 *----------------------------------------------------------------------------*/
3282
3283 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3284 {
3285     flag aSign, bSign, zSign;
3286     int16 aExp, bExp, zExp;
3287     bits64 aSig, bSig, zSig;
3288     bits64 rem0, rem1;
3289     bits64 term0, term1;
3290     a = float64_squash_input_denormal(a STATUS_VAR);
3291     b = float64_squash_input_denormal(b STATUS_VAR);
3292
3293     aSig = extractFloat64Frac( a );
3294     aExp = extractFloat64Exp( a );
3295     aSign = extractFloat64Sign( a );
3296     bSig = extractFloat64Frac( b );
3297     bExp = extractFloat64Exp( b );
3298     bSign = extractFloat64Sign( b );
3299     zSign = aSign ^ bSign;
3300     if ( aExp == 0x7FF ) {
3301         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3302         if ( bExp == 0x7FF ) {
3303             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3304             float_raise( float_flag_invalid STATUS_VAR);
3305             return float64_default_nan;
3306         }
3307         return packFloat64( zSign, 0x7FF, 0 );
3308     }
3309     if ( bExp == 0x7FF ) {
3310         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3311         return packFloat64( zSign, 0, 0 );
3312     }
3313     if ( bExp == 0 ) {
3314         if ( bSig == 0 ) {
3315             if ( ( aExp | aSig ) == 0 ) {
3316                 float_raise( float_flag_invalid STATUS_VAR);
3317                 return float64_default_nan;
3318             }
3319             float_raise( float_flag_divbyzero STATUS_VAR);
3320             return packFloat64( zSign, 0x7FF, 0 );
3321         }
3322         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3323     }
3324     if ( aExp == 0 ) {
3325         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3326         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3327     }
3328     zExp = aExp - bExp + 0x3FD;
3329     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3330     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3331     if ( bSig <= ( aSig + aSig ) ) {
3332         aSig >>= 1;
3333         ++zExp;
3334     }
3335     zSig = estimateDiv128To64( aSig, 0, bSig );
3336     if ( ( zSig & 0x1FF ) <= 2 ) {
3337         mul64To128( bSig, zSig, &term0, &term1 );
3338         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3339         while ( (sbits64) rem0 < 0 ) {
3340             --zSig;
3341             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3342         }
3343         zSig |= ( rem1 != 0 );
3344     }
3345     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3346
3347 }
3348
3349 /*----------------------------------------------------------------------------
3350 | Returns the remainder of the double-precision floating-point value `a'
3351 | with respect to the corresponding value `b'.  The operation is performed
3352 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3353 *----------------------------------------------------------------------------*/
3354
3355 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3356 {
3357     flag aSign, zSign;
3358     int16 aExp, bExp, expDiff;
3359     bits64 aSig, bSig;
3360     bits64 q, alternateASig;
3361     sbits64 sigMean;
3362
3363     a = float64_squash_input_denormal(a STATUS_VAR);
3364     b = float64_squash_input_denormal(b STATUS_VAR);
3365     aSig = extractFloat64Frac( a );
3366     aExp = extractFloat64Exp( a );
3367     aSign = extractFloat64Sign( a );
3368     bSig = extractFloat64Frac( b );
3369     bExp = extractFloat64Exp( b );
3370     if ( aExp == 0x7FF ) {
3371         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3372             return propagateFloat64NaN( a, b STATUS_VAR );
3373         }
3374         float_raise( float_flag_invalid STATUS_VAR);
3375         return float64_default_nan;
3376     }
3377     if ( bExp == 0x7FF ) {
3378         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3379         return a;
3380     }
3381     if ( bExp == 0 ) {
3382         if ( bSig == 0 ) {
3383             float_raise( float_flag_invalid STATUS_VAR);
3384             return float64_default_nan;
3385         }
3386         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3387     }
3388     if ( aExp == 0 ) {
3389         if ( aSig == 0 ) return a;
3390         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3391     }
3392     expDiff = aExp - bExp;
3393     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3394     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3395     if ( expDiff < 0 ) {
3396         if ( expDiff < -1 ) return a;
3397         aSig >>= 1;
3398     }
3399     q = ( bSig <= aSig );
3400     if ( q ) aSig -= bSig;
3401     expDiff -= 64;
3402     while ( 0 < expDiff ) {
3403         q = estimateDiv128To64( aSig, 0, bSig );
3404         q = ( 2 < q ) ? q - 2 : 0;
3405         aSig = - ( ( bSig>>2 ) * q );
3406         expDiff -= 62;
3407     }
3408     expDiff += 64;
3409     if ( 0 < expDiff ) {
3410         q = estimateDiv128To64( aSig, 0, bSig );
3411         q = ( 2 < q ) ? q - 2 : 0;
3412         q >>= 64 - expDiff;
3413         bSig >>= 2;
3414         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3415     }
3416     else {
3417         aSig >>= 2;
3418         bSig >>= 2;
3419     }
3420     do {
3421         alternateASig = aSig;
3422         ++q;
3423         aSig -= bSig;
3424     } while ( 0 <= (sbits64) aSig );
3425     sigMean = aSig + alternateASig;
3426     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3427         aSig = alternateASig;
3428     }
3429     zSign = ( (sbits64) aSig < 0 );
3430     if ( zSign ) aSig = - aSig;
3431     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3432
3433 }
3434
3435 /*----------------------------------------------------------------------------
3436 | Returns the square root of the double-precision floating-point value `a'.
3437 | The operation is performed according to the IEC/IEEE Standard for Binary
3438 | Floating-Point Arithmetic.
3439 *----------------------------------------------------------------------------*/
3440
3441 float64 float64_sqrt( float64 a STATUS_PARAM )
3442 {
3443     flag aSign;
3444     int16 aExp, zExp;
3445     bits64 aSig, zSig, doubleZSig;
3446     bits64 rem0, rem1, term0, term1;
3447     a = float64_squash_input_denormal(a STATUS_VAR);
3448
3449     aSig = extractFloat64Frac( a );
3450     aExp = extractFloat64Exp( a );
3451     aSign = extractFloat64Sign( a );
3452     if ( aExp == 0x7FF ) {
3453         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3454         if ( ! aSign ) return a;
3455         float_raise( float_flag_invalid STATUS_VAR);
3456         return float64_default_nan;
3457     }
3458     if ( aSign ) {
3459         if ( ( aExp | aSig ) == 0 ) return a;
3460         float_raise( float_flag_invalid STATUS_VAR);
3461         return float64_default_nan;
3462     }
3463     if ( aExp == 0 ) {
3464         if ( aSig == 0 ) return float64_zero;
3465         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3466     }
3467     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3468     aSig |= LIT64( 0x0010000000000000 );
3469     zSig = estimateSqrt32( aExp, aSig>>21 );
3470     aSig <<= 9 - ( aExp & 1 );
3471     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3472     if ( ( zSig & 0x1FF ) <= 5 ) {
3473         doubleZSig = zSig<<1;
3474         mul64To128( zSig, zSig, &term0, &term1 );
3475         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3476         while ( (sbits64) rem0 < 0 ) {
3477             --zSig;
3478             doubleZSig -= 2;
3479             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3480         }
3481         zSig |= ( ( rem0 | rem1 ) != 0 );
3482     }
3483     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3484
3485 }
3486
3487 /*----------------------------------------------------------------------------
3488 | Returns the binary log of the double-precision floating-point value `a'.
3489 | The operation is performed according to the IEC/IEEE Standard for Binary
3490 | Floating-Point Arithmetic.
3491 *----------------------------------------------------------------------------*/
3492 float64 float64_log2( float64 a STATUS_PARAM )
3493 {
3494     flag aSign, zSign;
3495     int16 aExp;
3496     bits64 aSig, aSig0, aSig1, zSig, i;
3497     a = float64_squash_input_denormal(a STATUS_VAR);
3498
3499     aSig = extractFloat64Frac( a );
3500     aExp = extractFloat64Exp( a );
3501     aSign = extractFloat64Sign( a );
3502
3503     if ( aExp == 0 ) {
3504         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3505         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3506     }
3507     if ( aSign ) {
3508         float_raise( float_flag_invalid STATUS_VAR);
3509         return float64_default_nan;
3510     }
3511     if ( aExp == 0x7FF ) {
3512         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3513         return a;
3514     }
3515
3516     aExp -= 0x3FF;
3517     aSig |= LIT64( 0x0010000000000000 );
3518     zSign = aExp < 0;
3519     zSig = (bits64)aExp << 52;
3520     for (i = 1LL << 51; i > 0; i >>= 1) {
3521         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3522         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3523         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3524             aSig >>= 1;
3525             zSig |= i;
3526         }
3527     }
3528
3529     if ( zSign )
3530         zSig = -zSig;
3531     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3532 }
3533
3534 /*----------------------------------------------------------------------------
3535 | Returns 1 if the double-precision floating-point value `a' is equal to the
3536 | corresponding value `b', and 0 otherwise.  The comparison is performed
3537 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3538 *----------------------------------------------------------------------------*/
3539
3540 int float64_eq( float64 a, float64 b STATUS_PARAM )
3541 {
3542     bits64 av, bv;
3543     a = float64_squash_input_denormal(a STATUS_VAR);
3544     b = float64_squash_input_denormal(b STATUS_VAR);
3545
3546     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3547          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3548        ) {
3549         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3550             float_raise( float_flag_invalid STATUS_VAR);
3551         }
3552         return 0;
3553     }
3554     av = float64_val(a);
3555     bv = float64_val(b);
3556     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3557
3558 }
3559
3560 /*----------------------------------------------------------------------------
3561 | Returns 1 if the double-precision floating-point value `a' is less than or
3562 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3563 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3564 | Arithmetic.
3565 *----------------------------------------------------------------------------*/
3566
3567 int float64_le( float64 a, float64 b STATUS_PARAM )
3568 {
3569     flag aSign, bSign;
3570     bits64 av, bv;
3571     a = float64_squash_input_denormal(a STATUS_VAR);
3572     b = float64_squash_input_denormal(b STATUS_VAR);
3573
3574     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3575          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3576        ) {
3577         float_raise( float_flag_invalid STATUS_VAR);
3578         return 0;
3579     }
3580     aSign = extractFloat64Sign( a );
3581     bSign = extractFloat64Sign( b );
3582     av = float64_val(a);
3583     bv = float64_val(b);
3584     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3585     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3586
3587 }
3588
3589 /*----------------------------------------------------------------------------
3590 | Returns 1 if the double-precision floating-point value `a' is less than
3591 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3592 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3593 *----------------------------------------------------------------------------*/
3594
3595 int float64_lt( float64 a, float64 b STATUS_PARAM )
3596 {
3597     flag aSign, bSign;
3598     bits64 av, bv;
3599
3600     a = float64_squash_input_denormal(a STATUS_VAR);
3601     b = float64_squash_input_denormal(b STATUS_VAR);
3602     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3603          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3604        ) {
3605         float_raise( float_flag_invalid STATUS_VAR);
3606         return 0;
3607     }
3608     aSign = extractFloat64Sign( a );
3609     bSign = extractFloat64Sign( b );
3610     av = float64_val(a);
3611     bv = float64_val(b);
3612     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3613     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3614
3615 }
3616
3617 /*----------------------------------------------------------------------------
3618 | Returns 1 if the double-precision floating-point value `a' is equal to the
3619 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3620 | if either operand is a NaN.  Otherwise, the comparison is performed
3621 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3622 *----------------------------------------------------------------------------*/
3623
3624 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3625 {
3626     bits64 av, bv;
3627     a = float64_squash_input_denormal(a STATUS_VAR);
3628     b = float64_squash_input_denormal(b STATUS_VAR);
3629
3630     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3631          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3632        ) {
3633         float_raise( float_flag_invalid STATUS_VAR);
3634         return 0;
3635     }
3636     av = float64_val(a);
3637     bv = float64_val(b);
3638     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3639
3640 }
3641
3642 /*----------------------------------------------------------------------------
3643 | Returns 1 if the double-precision floating-point value `a' is less than or
3644 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3645 | cause an exception.  Otherwise, the comparison is performed according to the
3646 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3647 *----------------------------------------------------------------------------*/
3648
3649 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3650 {
3651     flag aSign, bSign;
3652     bits64 av, bv;
3653     a = float64_squash_input_denormal(a STATUS_VAR);
3654     b = float64_squash_input_denormal(b STATUS_VAR);
3655
3656     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3657          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3658        ) {
3659         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3660             float_raise( float_flag_invalid STATUS_VAR);
3661         }
3662         return 0;
3663     }
3664     aSign = extractFloat64Sign( a );
3665     bSign = extractFloat64Sign( b );
3666     av = float64_val(a);
3667     bv = float64_val(b);
3668     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3669     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3670
3671 }
3672
3673 /*----------------------------------------------------------------------------
3674 | Returns 1 if the double-precision floating-point value `a' is less than
3675 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3676 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3677 | Standard for Binary Floating-Point Arithmetic.
3678 *----------------------------------------------------------------------------*/
3679
3680 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3681 {
3682     flag aSign, bSign;
3683     bits64 av, bv;
3684     a = float64_squash_input_denormal(a STATUS_VAR);
3685     b = float64_squash_input_denormal(b STATUS_VAR);
3686
3687     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3688          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3689        ) {
3690         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3691             float_raise( float_flag_invalid STATUS_VAR);
3692         }
3693         return 0;
3694     }
3695     aSign = extractFloat64Sign( a );
3696     bSign = extractFloat64Sign( b );
3697     av = float64_val(a);
3698     bv = float64_val(b);
3699     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3700     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3701
3702 }
3703
3704 #ifdef FLOATX80
3705
3706 /*----------------------------------------------------------------------------
3707 | Returns the result of converting the extended double-precision floating-
3708 | point value `a' to the 32-bit two's complement integer format.  The
3709 | conversion is performed according to the IEC/IEEE Standard for Binary
3710 | Floating-Point Arithmetic---which means in particular that the conversion
3711 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3712 | largest positive integer is returned.  Otherwise, if the conversion
3713 | overflows, the largest integer with the same sign as `a' is returned.
3714 *----------------------------------------------------------------------------*/
3715
3716 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3717 {
3718     flag aSign;
3719     int32 aExp, shiftCount;
3720     bits64 aSig;
3721
3722     aSig = extractFloatx80Frac( a );
3723     aExp = extractFloatx80Exp( a );
3724     aSign = extractFloatx80Sign( a );
3725     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3726     shiftCount = 0x4037 - aExp;
3727     if ( shiftCount <= 0 ) shiftCount = 1;
3728     shift64RightJamming( aSig, shiftCount, &aSig );
3729     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3730
3731 }
3732
3733 /*----------------------------------------------------------------------------
3734 | Returns the result of converting the extended double-precision floating-
3735 | point value `a' to the 32-bit two's complement integer format.  The
3736 | conversion is performed according to the IEC/IEEE Standard for Binary
3737 | Floating-Point Arithmetic, except that the conversion is always rounded
3738 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3739 | Otherwise, if the conversion overflows, the largest integer with the same
3740 | sign as `a' is returned.
3741 *----------------------------------------------------------------------------*/
3742
3743 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3744 {
3745     flag aSign;
3746     int32 aExp, shiftCount;
3747     bits64 aSig, savedASig;
3748     int32 z;
3749
3750     aSig = extractFloatx80Frac( a );
3751     aExp = extractFloatx80Exp( a );
3752     aSign = extractFloatx80Sign( a );
3753     if ( 0x401E < aExp ) {
3754         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3755         goto invalid;
3756     }
3757     else if ( aExp < 0x3FFF ) {
3758         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3759         return 0;
3760     }
3761     shiftCount = 0x403E - aExp;
3762     savedASig = aSig;
3763     aSig >>= shiftCount;
3764     z = aSig;
3765     if ( aSign ) z = - z;
3766     if ( ( z < 0 ) ^ aSign ) {
3767  invalid:
3768         float_raise( float_flag_invalid STATUS_VAR);
3769         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3770     }
3771     if ( ( aSig<<shiftCount ) != savedASig ) {
3772         STATUS(float_exception_flags) |= float_flag_inexact;
3773     }
3774     return z;
3775
3776 }
3777
3778 /*----------------------------------------------------------------------------
3779 | Returns the result of converting the extended double-precision floating-
3780 | point value `a' to the 64-bit two's complement integer format.  The
3781 | conversion is performed according to the IEC/IEEE Standard for Binary
3782 | Floating-Point Arithmetic---which means in particular that the conversion
3783 | is rounded according to the current rounding mode.  If `a' is a NaN,
3784 | the largest positive integer is returned.  Otherwise, if the conversion
3785 | overflows, the largest integer with the same sign as `a' is returned.
3786 *----------------------------------------------------------------------------*/
3787
3788 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3789 {
3790     flag aSign;
3791     int32 aExp, shiftCount;
3792     bits64 aSig, aSigExtra;
3793
3794     aSig = extractFloatx80Frac( a );
3795     aExp = extractFloatx80Exp( a );
3796     aSign = extractFloatx80Sign( a );
3797     shiftCount = 0x403E - aExp;
3798     if ( shiftCount <= 0 ) {
3799         if ( shiftCount ) {
3800             float_raise( float_flag_invalid STATUS_VAR);
3801             if (    ! aSign
3802                  || (    ( aExp == 0x7FFF )
3803                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3804                ) {
3805                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3806             }
3807             return (sbits64) LIT64( 0x8000000000000000 );
3808         }
3809         aSigExtra = 0;
3810     }
3811     else {
3812         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3813     }
3814     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3815
3816 }
3817
3818 /*----------------------------------------------------------------------------
3819 | Returns the result of converting the extended double-precision floating-
3820 | point value `a' to the 64-bit two's complement integer format.  The
3821 | conversion is performed according to the IEC/IEEE Standard for Binary
3822 | Floating-Point Arithmetic, except that the conversion is always rounded
3823 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3824 | Otherwise, if the conversion overflows, the largest integer with the same
3825 | sign as `a' is returned.
3826 *----------------------------------------------------------------------------*/
3827
3828 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3829 {
3830     flag aSign;
3831     int32 aExp, shiftCount;
3832     bits64 aSig;
3833     int64 z;
3834
3835     aSig = extractFloatx80Frac( a );
3836     aExp = extractFloatx80Exp( a );
3837     aSign = extractFloatx80Sign( a );
3838     shiftCount = aExp - 0x403E;
3839     if ( 0 <= shiftCount ) {
3840         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3841         if ( ( a.high != 0xC03E ) || aSig ) {
3842             float_raise( float_flag_invalid STATUS_VAR);
3843             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3844                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3845             }
3846         }
3847         return (sbits64) LIT64( 0x8000000000000000 );
3848     }
3849     else if ( aExp < 0x3FFF ) {
3850         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3851         return 0;
3852     }
3853     z = aSig>>( - shiftCount );
3854     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3855         STATUS(float_exception_flags) |= float_flag_inexact;
3856     }
3857     if ( aSign ) z = - z;
3858     return z;
3859
3860 }
3861
3862 /*----------------------------------------------------------------------------
3863 | Returns the result of converting the extended double-precision floating-
3864 | point value `a' to the single-precision floating-point format.  The
3865 | conversion is performed according to the IEC/IEEE Standard for Binary
3866 | Floating-Point Arithmetic.
3867 *----------------------------------------------------------------------------*/
3868
3869 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3870 {
3871     flag aSign;
3872     int32 aExp;
3873     bits64 aSig;
3874
3875     aSig = extractFloatx80Frac( a );
3876     aExp = extractFloatx80Exp( a );
3877     aSign = extractFloatx80Sign( a );
3878     if ( aExp == 0x7FFF ) {
3879         if ( (bits64) ( aSig<<1 ) ) {
3880             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3881         }
3882         return packFloat32( aSign, 0xFF, 0 );
3883     }
3884     shift64RightJamming( aSig, 33, &aSig );
3885     if ( aExp || aSig ) aExp -= 0x3F81;
3886     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3887
3888 }
3889
3890 /*----------------------------------------------------------------------------
3891 | Returns the result of converting the extended double-precision floating-
3892 | point value `a' to the double-precision floating-point format.  The
3893 | conversion is performed according to the IEC/IEEE Standard for Binary
3894 | Floating-Point Arithmetic.
3895 *----------------------------------------------------------------------------*/
3896
3897 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3898 {
3899     flag aSign;
3900     int32 aExp;
3901     bits64 aSig, zSig;
3902
3903     aSig = extractFloatx80Frac( a );
3904     aExp = extractFloatx80Exp( a );
3905     aSign = extractFloatx80Sign( a );
3906     if ( aExp == 0x7FFF ) {
3907         if ( (bits64) ( aSig<<1 ) ) {
3908             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3909         }
3910         return packFloat64( aSign, 0x7FF, 0 );
3911     }
3912     shift64RightJamming( aSig, 1, &zSig );
3913     if ( aExp || aSig ) aExp -= 0x3C01;
3914     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3915
3916 }
3917
3918 #ifdef FLOAT128
3919
3920 /*----------------------------------------------------------------------------
3921 | Returns the result of converting the extended double-precision floating-
3922 | point value `a' to the quadruple-precision floating-point format.  The
3923 | conversion is performed according to the IEC/IEEE Standard for Binary
3924 | Floating-Point Arithmetic.
3925 *----------------------------------------------------------------------------*/
3926
3927 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3928 {
3929     flag aSign;
3930     int16 aExp;
3931     bits64 aSig, zSig0, zSig1;
3932
3933     aSig = extractFloatx80Frac( a );
3934     aExp = extractFloatx80Exp( a );
3935     aSign = extractFloatx80Sign( a );
3936     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3937         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3938     }
3939     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3940     return packFloat128( aSign, aExp, zSig0, zSig1 );
3941
3942 }
3943
3944 #endif
3945
3946 /*----------------------------------------------------------------------------
3947 | Rounds the extended double-precision floating-point value `a' to an integer,
3948 | and returns the result as an extended quadruple-precision floating-point
3949 | value.  The operation is performed according to the IEC/IEEE Standard for
3950 | Binary Floating-Point Arithmetic.
3951 *----------------------------------------------------------------------------*/
3952
3953 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3954 {
3955     flag aSign;
3956     int32 aExp;
3957     bits64 lastBitMask, roundBitsMask;
3958     int8 roundingMode;
3959     floatx80 z;
3960
3961     aExp = extractFloatx80Exp( a );
3962     if ( 0x403E <= aExp ) {
3963         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3964             return propagateFloatx80NaN( a, a STATUS_VAR );
3965         }
3966         return a;
3967     }
3968     if ( aExp < 0x3FFF ) {
3969         if (    ( aExp == 0 )
3970              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3971             return a;
3972         }
3973         STATUS(float_exception_flags) |= float_flag_inexact;
3974         aSign = extractFloatx80Sign( a );
3975         switch ( STATUS(float_rounding_mode) ) {
3976          case float_round_nearest_even:
3977             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3978                ) {
3979                 return
3980                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3981             }
3982             break;
3983          case float_round_down:
3984             return
3985                   aSign ?
3986                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3987                 : packFloatx80( 0, 0, 0 );
3988          case float_round_up:
3989             return
3990                   aSign ? packFloatx80( 1, 0, 0 )
3991                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3992         }
3993         return packFloatx80( aSign, 0, 0 );
3994     }
3995     lastBitMask = 1;
3996     lastBitMask <<= 0x403E - aExp;
3997     roundBitsMask = lastBitMask - 1;
3998     z = a;
3999     roundingMode = STATUS(float_rounding_mode);
4000     if ( roundingMode == float_round_nearest_even ) {
4001         z.low += lastBitMask>>1;
4002         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4003     }
4004     else if ( roundingMode != float_round_to_zero ) {
4005         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4006             z.low += roundBitsMask;
4007         }
4008     }
4009     z.low &= ~ roundBitsMask;
4010     if ( z.low == 0 ) {
4011         ++z.high;
4012         z.low = LIT64( 0x8000000000000000 );
4013     }
4014     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4015     return z;
4016
4017 }
4018
4019 /*----------------------------------------------------------------------------
4020 | Returns the result of adding the absolute values of the extended double-
4021 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4022 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4023 | The addition is performed according to the IEC/IEEE Standard for Binary
4024 | Floating-Point Arithmetic.
4025 *----------------------------------------------------------------------------*/
4026
4027 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4028 {
4029     int32 aExp, bExp, zExp;
4030     bits64 aSig, bSig, zSig0, zSig1;
4031     int32 expDiff;
4032
4033     aSig = extractFloatx80Frac( a );
4034     aExp = extractFloatx80Exp( a );
4035     bSig = extractFloatx80Frac( b );
4036     bExp = extractFloatx80Exp( b );
4037     expDiff = aExp - bExp;
4038     if ( 0 < expDiff ) {
4039         if ( aExp == 0x7FFF ) {
4040             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4041             return a;
4042         }
4043         if ( bExp == 0 ) --expDiff;
4044         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4045         zExp = aExp;
4046     }
4047     else if ( expDiff < 0 ) {
4048         if ( bExp == 0x7FFF ) {
4049             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4050             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4051         }
4052         if ( aExp == 0 ) ++expDiff;
4053         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4054         zExp = bExp;
4055     }
4056     else {
4057         if ( aExp == 0x7FFF ) {
4058             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4059                 return propagateFloatx80NaN( a, b STATUS_VAR );
4060             }
4061             return a;
4062         }
4063         zSig1 = 0;
4064         zSig0 = aSig + bSig;
4065         if ( aExp == 0 ) {
4066             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4067             goto roundAndPack;
4068         }
4069         zExp = aExp;
4070         goto shiftRight1;
4071     }
4072     zSig0 = aSig + bSig;
4073     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
4074  shiftRight1:
4075     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4076     zSig0 |= LIT64( 0x8000000000000000 );
4077     ++zExp;
4078  roundAndPack:
4079     return
4080         roundAndPackFloatx80(
4081             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4082
4083 }
4084
4085 /*----------------------------------------------------------------------------
4086 | Returns the result of subtracting the absolute values of the extended
4087 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4088 | difference is negated before being returned.  `zSign' is ignored if the
4089 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4090 | Standard for Binary Floating-Point Arithmetic.
4091 *----------------------------------------------------------------------------*/
4092
4093 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4094 {
4095     int32 aExp, bExp, zExp;
4096     bits64 aSig, bSig, zSig0, zSig1;
4097     int32 expDiff;
4098     floatx80 z;
4099
4100     aSig = extractFloatx80Frac( a );
4101     aExp = extractFloatx80Exp( a );
4102     bSig = extractFloatx80Frac( b );
4103     bExp = extractFloatx80Exp( b );
4104     expDiff = aExp - bExp;
4105     if ( 0 < expDiff ) goto aExpBigger;
4106     if ( expDiff < 0 ) goto bExpBigger;
4107     if ( aExp == 0x7FFF ) {
4108         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4109             return propagateFloatx80NaN( a, b STATUS_VAR );
4110         }
4111         float_raise( float_flag_invalid STATUS_VAR);
4112         z.low = floatx80_default_nan_low;
4113         z.high = floatx80_default_nan_high;
4114         return z;
4115     }
4116     if ( aExp == 0 ) {
4117         aExp = 1;
4118         bExp = 1;
4119     }
4120     zSig1 = 0;
4121     if ( bSig < aSig ) goto aBigger;
4122     if ( aSig < bSig ) goto bBigger;
4123     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4124  bExpBigger:
4125     if ( bExp == 0x7FFF ) {
4126         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4127         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4128     }
4129     if ( aExp == 0 ) ++expDiff;
4130     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4131  bBigger:
4132     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4133     zExp = bExp;
4134     zSign ^= 1;
4135     goto normalizeRoundAndPack;
4136  aExpBigger:
4137     if ( aExp == 0x7FFF ) {
4138         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4139         return a;
4140     }
4141     if ( bExp == 0 ) --expDiff;
4142     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4143  aBigger:
4144     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4145     zExp = aExp;
4146  normalizeRoundAndPack:
4147     return
4148         normalizeRoundAndPackFloatx80(
4149             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4150
4151 }
4152
4153 /*----------------------------------------------------------------------------
4154 | Returns the result of adding the extended double-precision floating-point
4155 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4156 | Standard for Binary Floating-Point Arithmetic.
4157 *----------------------------------------------------------------------------*/
4158
4159 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4160 {
4161     flag aSign, bSign;
4162
4163     aSign = extractFloatx80Sign( a );
4164     bSign = extractFloatx80Sign( b );
4165     if ( aSign == bSign ) {
4166         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4167     }
4168     else {
4169         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4170     }
4171
4172 }
4173
4174 /*----------------------------------------------------------------------------
4175 | Returns the result of subtracting the extended double-precision floating-
4176 | point values `a' and `b'.  The operation is performed according to the
4177 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4178 *----------------------------------------------------------------------------*/
4179
4180 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4181 {
4182     flag aSign, bSign;
4183
4184     aSign = extractFloatx80Sign( a );
4185     bSign = extractFloatx80Sign( b );
4186     if ( aSign == bSign ) {
4187         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4188     }
4189     else {
4190         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4191     }
4192
4193 }
4194
4195 /*----------------------------------------------------------------------------
4196 | Returns the result of multiplying the extended double-precision floating-
4197 | point values `a' and `b'.  The operation is performed according to the
4198 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4199 *----------------------------------------------------------------------------*/
4200
4201 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4202 {
4203     flag aSign, bSign, zSign;
4204     int32 aExp, bExp, zExp;
4205     bits64 aSig, bSig, zSig0, zSig1;
4206     floatx80 z;
4207
4208     aSig = extractFloatx80Frac( a );
4209     aExp = extractFloatx80Exp( a );
4210     aSign = extractFloatx80Sign( a );
4211     bSig = extractFloatx80Frac( b );
4212     bExp = extractFloatx80Exp( b );
4213     bSign = extractFloatx80Sign( b );
4214     zSign = aSign ^ bSign;
4215     if ( aExp == 0x7FFF ) {
4216         if (    (bits64) ( aSig<<1 )
4217              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4218             return propagateFloatx80NaN( a, b STATUS_VAR );
4219         }
4220         if ( ( bExp | bSig ) == 0 ) goto invalid;
4221         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4222     }
4223     if ( bExp == 0x7FFF ) {
4224         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4225         if ( ( aExp | aSig ) == 0 ) {
4226  invalid:
4227             float_raise( float_flag_invalid STATUS_VAR);
4228             z.low = floatx80_default_nan_low;
4229             z.high = floatx80_default_nan_high;
4230             return z;
4231         }
4232         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4233     }
4234     if ( aExp == 0 ) {
4235         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4236         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4237     }
4238     if ( bExp == 0 ) {
4239         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4240         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4241     }
4242     zExp = aExp + bExp - 0x3FFE;
4243     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4244     if ( 0 < (sbits64) zSig0 ) {
4245         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4246         --zExp;
4247     }
4248     return
4249         roundAndPackFloatx80(
4250             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4251
4252 }
4253
4254 /*----------------------------------------------------------------------------
4255 | Returns the result of dividing the extended double-precision floating-point
4256 | value `a' by the corresponding value `b'.  The operation is performed
4257 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4258 *----------------------------------------------------------------------------*/
4259
4260 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4261 {
4262     flag aSign, bSign, zSign;
4263     int32 aExp, bExp, zExp;
4264     bits64 aSig, bSig, zSig0, zSig1;
4265     bits64 rem0, rem1, rem2, term0, term1, term2;
4266     floatx80 z;
4267
4268     aSig = extractFloatx80Frac( a );
4269     aExp = extractFloatx80Exp( a );
4270     aSign = extractFloatx80Sign( a );
4271     bSig = extractFloatx80Frac( b );
4272     bExp = extractFloatx80Exp( b );
4273     bSign = extractFloatx80Sign( b );
4274     zSign = aSign ^ bSign;
4275     if ( aExp == 0x7FFF ) {
4276         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4277         if ( bExp == 0x7FFF ) {
4278             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4279             goto invalid;
4280         }
4281         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4282     }
4283     if ( bExp == 0x7FFF ) {
4284         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4285         return packFloatx80( zSign, 0, 0 );
4286     }
4287     if ( bExp == 0 ) {
4288         if ( bSig == 0 ) {
4289             if ( ( aExp | aSig ) == 0 ) {
4290  invalid:
4291                 float_raise( float_flag_invalid STATUS_VAR);
4292                 z.low = floatx80_default_nan_low;
4293                 z.high = floatx80_default_nan_high;
4294                 return z;
4295             }
4296             float_raise( float_flag_divbyzero STATUS_VAR);
4297             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4298         }
4299         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4300     }
4301     if ( aExp == 0 ) {
4302         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4303         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4304     }
4305     zExp = aExp - bExp + 0x3FFE;
4306     rem1 = 0;
4307     if ( bSig <= aSig ) {
4308         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4309         ++zExp;
4310     }
4311     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4312     mul64To128( bSig, zSig0, &term0, &term1 );
4313     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4314     while ( (sbits64) rem0 < 0 ) {
4315         --zSig0;
4316         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4317     }
4318     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4319     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4320         mul64To128( bSig, zSig1, &term1, &term2 );
4321         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4322         while ( (sbits64) rem1 < 0 ) {
4323             --zSig1;
4324             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4325         }
4326         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4327     }
4328     return
4329         roundAndPackFloatx80(
4330             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4331
4332 }
4333
4334 /*----------------------------------------------------------------------------
4335 | Returns the remainder of the extended double-precision floating-point value
4336 | `a' with respect to the corresponding value `b'.  The operation is performed
4337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4338 *----------------------------------------------------------------------------*/
4339
4340 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4341 {
4342     flag aSign, zSign;
4343     int32 aExp, bExp, expDiff;
4344     bits64 aSig0, aSig1, bSig;
4345     bits64 q, term0, term1, alternateASig0, alternateASig1;
4346     floatx80 z;
4347
4348     aSig0 = extractFloatx80Frac( a );
4349     aExp = extractFloatx80Exp( a );
4350     aSign = extractFloatx80Sign( a );
4351     bSig = extractFloatx80Frac( b );
4352     bExp = extractFloatx80Exp( b );
4353     if ( aExp == 0x7FFF ) {
4354         if (    (bits64) ( aSig0<<1 )
4355              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4356             return propagateFloatx80NaN( a, b STATUS_VAR );
4357         }
4358         goto invalid;
4359     }
4360     if ( bExp == 0x7FFF ) {
4361         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4362         return a;
4363     }
4364     if ( bExp == 0 ) {
4365         if ( bSig == 0 ) {
4366  invalid:
4367             float_raise( float_flag_invalid STATUS_VAR);
4368             z.low = floatx80_default_nan_low;
4369             z.high = floatx80_default_nan_high;
4370             return z;
4371         }
4372         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4373     }
4374     if ( aExp == 0 ) {
4375         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4376         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4377     }
4378     bSig |= LIT64( 0x8000000000000000 );
4379     zSign = aSign;
4380     expDiff = aExp - bExp;
4381     aSig1 = 0;
4382     if ( expDiff < 0 ) {
4383         if ( expDiff < -1 ) return a;
4384         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4385         expDiff = 0;
4386     }
4387     q = ( bSig <= aSig0 );
4388     if ( q ) aSig0 -= bSig;
4389     expDiff -= 64;
4390     while ( 0 < expDiff ) {
4391         q = estimateDiv128To64( aSig0, aSig1, bSig );
4392         q = ( 2 < q ) ? q - 2 : 0;
4393         mul64To128( bSig, q, &term0, &term1 );
4394         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4395         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4396         expDiff -= 62;
4397     }
4398     expDiff += 64;
4399     if ( 0 < expDiff ) {
4400         q = estimateDiv128To64( aSig0, aSig1, bSig );
4401         q = ( 2 < q ) ? q - 2 : 0;
4402         q >>= 64 - expDiff;
4403         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4404         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4405         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4406         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4407             ++q;
4408             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4409         }
4410     }
4411     else {
4412         term1 = 0;
4413         term0 = bSig;
4414     }
4415     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4416     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4417          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4418               && ( q & 1 ) )
4419        ) {
4420         aSig0 = alternateASig0;
4421         aSig1 = alternateASig1;
4422         zSign = ! zSign;
4423     }
4424     return
4425         normalizeRoundAndPackFloatx80(
4426             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4427
4428 }
4429
4430 /*----------------------------------------------------------------------------
4431 | Returns the square root of the extended double-precision floating-point
4432 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4433 | for Binary Floating-Point Arithmetic.
4434 *----------------------------------------------------------------------------*/
4435
4436 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4437 {
4438     flag aSign;
4439     int32 aExp, zExp;
4440     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4441     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4442     floatx80 z;
4443
4444     aSig0 = extractFloatx80Frac( a );
4445     aExp = extractFloatx80Exp( a );
4446     aSign = extractFloatx80Sign( a );
4447     if ( aExp == 0x7FFF ) {
4448         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4449         if ( ! aSign ) return a;
4450         goto invalid;
4451     }
4452     if ( aSign ) {
4453         if ( ( aExp | aSig0 ) == 0 ) return a;
4454  invalid:
4455         float_raise( float_flag_invalid STATUS_VAR);
4456         z.low = floatx80_default_nan_low;
4457         z.high = floatx80_default_nan_high;
4458         return z;
4459     }
4460     if ( aExp == 0 ) {
4461         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4462         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4463     }
4464     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4465     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4466     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4467     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4468     doubleZSig0 = zSig0<<1;
4469     mul64To128( zSig0, zSig0, &term0, &term1 );
4470     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4471     while ( (sbits64) rem0 < 0 ) {
4472         --zSig0;
4473         doubleZSig0 -= 2;
4474         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4475     }
4476     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4477     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4478         if ( zSig1 == 0 ) zSig1 = 1;
4479         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4480         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4481         mul64To128( zSig1, zSig1, &term2, &term3 );
4482         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4483         while ( (sbits64) rem1 < 0 ) {
4484             --zSig1;
4485             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4486             term3 |= 1;
4487             term2 |= doubleZSig0;
4488             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4489         }
4490         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4491     }
4492     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4493     zSig0 |= doubleZSig0;
4494     return
4495         roundAndPackFloatx80(
4496             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4497
4498 }
4499
4500 /*----------------------------------------------------------------------------
4501 | Returns 1 if the extended double-precision floating-point value `a' is
4502 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
4503 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4504 | Arithmetic.
4505 *----------------------------------------------------------------------------*/
4506
4507 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4508 {
4509
4510     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4511               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4512          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4513               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4514        ) {
4515         if (    floatx80_is_signaling_nan( a )
4516              || floatx80_is_signaling_nan( b ) ) {
4517             float_raise( float_flag_invalid STATUS_VAR);
4518         }
4519         return 0;
4520     }
4521     return
4522            ( a.low == b.low )
4523         && (    ( a.high == b.high )
4524              || (    ( a.low == 0 )
4525                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4526            );
4527
4528 }
4529
4530 /*----------------------------------------------------------------------------
4531 | Returns 1 if the extended double-precision floating-point value `a' is
4532 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4533 | comparison is performed according to the IEC/IEEE Standard for Binary
4534 | Floating-Point Arithmetic.
4535 *----------------------------------------------------------------------------*/
4536
4537 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4538 {
4539     flag aSign, bSign;
4540
4541     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4542               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4543          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4544               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4545        ) {
4546         float_raise( float_flag_invalid STATUS_VAR);
4547         return 0;
4548     }
4549     aSign = extractFloatx80Sign( a );
4550     bSign = extractFloatx80Sign( b );
4551     if ( aSign != bSign ) {
4552         return
4553                aSign
4554             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4555                  == 0 );
4556     }
4557     return
4558           aSign ? le128( b.high, b.low, a.high, a.low )
4559         : le128( a.high, a.low, b.high, b.low );
4560
4561 }
4562
4563 /*----------------------------------------------------------------------------
4564 | Returns 1 if the extended double-precision floating-point value `a' is
4565 | less than the corresponding value `b', and 0 otherwise.  The comparison
4566 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4567 | Arithmetic.
4568 *----------------------------------------------------------------------------*/
4569
4570 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4571 {
4572     flag aSign, bSign;
4573
4574     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4575               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4576          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4577               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4578        ) {
4579         float_raise( float_flag_invalid STATUS_VAR);
4580         return 0;
4581     }
4582     aSign = extractFloatx80Sign( a );
4583     bSign = extractFloatx80Sign( b );
4584     if ( aSign != bSign ) {
4585         return
4586                aSign
4587             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4588                  != 0 );
4589     }
4590     return
4591           aSign ? lt128( b.high, b.low, a.high, a.low )
4592         : lt128( a.high, a.low, b.high, b.low );
4593
4594 }
4595
4596 /*----------------------------------------------------------------------------
4597 | Returns 1 if the extended double-precision floating-point value `a' is equal
4598 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4599 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4600 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4601 *----------------------------------------------------------------------------*/
4602
4603 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4604 {
4605
4606     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4607               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4608          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4609               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4610        ) {
4611         float_raise( float_flag_invalid STATUS_VAR);
4612         return 0;
4613     }
4614     return
4615            ( a.low == b.low )
4616         && (    ( a.high == b.high )
4617              || (    ( a.low == 0 )
4618                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4619            );
4620
4621 }
4622
4623 /*----------------------------------------------------------------------------
4624 | Returns 1 if the extended double-precision floating-point value `a' is less
4625 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4626 | do not cause an exception.  Otherwise, the comparison is performed according
4627 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4628 *----------------------------------------------------------------------------*/
4629
4630 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4631 {
4632     flag aSign, bSign;
4633
4634     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4635               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4636          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4637               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4638        ) {
4639         if (    floatx80_is_signaling_nan( a )
4640              || floatx80_is_signaling_nan( b ) ) {
4641             float_raise( float_flag_invalid STATUS_VAR);
4642         }
4643         return 0;
4644     }
4645     aSign = extractFloatx80Sign( a );
4646     bSign = extractFloatx80Sign( b );
4647     if ( aSign != bSign ) {
4648         return
4649                aSign
4650             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4651                  == 0 );
4652     }
4653     return
4654           aSign ? le128( b.high, b.low, a.high, a.low )
4655         : le128( a.high, a.low, b.high, b.low );
4656
4657 }
4658
4659 /*----------------------------------------------------------------------------
4660 | Returns 1 if the extended double-precision floating-point value `a' is less
4661 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4662 | an exception.  Otherwise, the comparison is performed according to the
4663 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4664 *----------------------------------------------------------------------------*/
4665
4666 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4667 {
4668     flag aSign, bSign;
4669
4670     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4671               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4672          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4673               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4674        ) {
4675         if (    floatx80_is_signaling_nan( a )
4676              || floatx80_is_signaling_nan( b ) ) {
4677             float_raise( float_flag_invalid STATUS_VAR);
4678         }
4679         return 0;
4680     }
4681     aSign = extractFloatx80Sign( a );
4682     bSign = extractFloatx80Sign( b );
4683     if ( aSign != bSign ) {
4684         return
4685                aSign
4686             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4687                  != 0 );
4688     }
4689     return
4690           aSign ? lt128( b.high, b.low, a.high, a.low )
4691         : lt128( a.high, a.low, b.high, b.low );
4692
4693 }
4694
4695 #endif
4696
4697 #ifdef FLOAT128
4698
4699 /*----------------------------------------------------------------------------
4700 | Returns the result of converting the quadruple-precision floating-point
4701 | value `a' to the 32-bit two's complement integer format.  The conversion
4702 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4703 | Arithmetic---which means in particular that the conversion is rounded
4704 | according to the current rounding mode.  If `a' is a NaN, the largest
4705 | positive integer is returned.  Otherwise, if the conversion overflows, the
4706 | largest integer with the same sign as `a' is returned.
4707 *----------------------------------------------------------------------------*/
4708
4709 int32 float128_to_int32( float128 a STATUS_PARAM )
4710 {
4711     flag aSign;
4712     int32 aExp, shiftCount;
4713     bits64 aSig0, aSig1;
4714
4715     aSig1 = extractFloat128Frac1( a );
4716     aSig0 = extractFloat128Frac0( a );
4717     aExp = extractFloat128Exp( a );
4718     aSign = extractFloat128Sign( a );
4719     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4720     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4721     aSig0 |= ( aSig1 != 0 );
4722     shiftCount = 0x4028 - aExp;
4723     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4724     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4725
4726 }
4727
4728 /*----------------------------------------------------------------------------
4729 | Returns the result of converting the quadruple-precision floating-point
4730 | value `a' to the 32-bit two's complement integer format.  The conversion
4731 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4732 | Arithmetic, except that the conversion is always rounded toward zero.  If
4733 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4734 | conversion overflows, the largest integer with the same sign as `a' is
4735 | returned.
4736 *----------------------------------------------------------------------------*/
4737
4738 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4739 {
4740     flag aSign;
4741     int32 aExp, shiftCount;
4742     bits64 aSig0, aSig1, savedASig;
4743     int32 z;
4744
4745     aSig1 = extractFloat128Frac1( a );
4746     aSig0 = extractFloat128Frac0( a );
4747     aExp = extractFloat128Exp( a );
4748     aSign = extractFloat128Sign( a );
4749     aSig0 |= ( aSig1 != 0 );
4750     if ( 0x401E < aExp ) {
4751         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4752         goto invalid;
4753     }
4754     else if ( aExp < 0x3FFF ) {
4755         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4756         return 0;
4757     }
4758     aSig0 |= LIT64( 0x0001000000000000 );
4759     shiftCount = 0x402F - aExp;
4760     savedASig = aSig0;
4761     aSig0 >>= shiftCount;
4762     z = aSig0;
4763     if ( aSign ) z = - z;
4764     if ( ( z < 0 ) ^ aSign ) {
4765  invalid:
4766         float_raise( float_flag_invalid STATUS_VAR);
4767         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4768     }
4769     if ( ( aSig0<<shiftCount ) != savedASig ) {
4770         STATUS(float_exception_flags) |= float_flag_inexact;
4771     }
4772     return z;
4773
4774 }
4775
4776 /*----------------------------------------------------------------------------
4777 | Returns the result of converting the quadruple-precision floating-point
4778 | value `a' to the 64-bit two's complement integer format.  The conversion
4779 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4780 | Arithmetic---which means in particular that the conversion is rounded
4781 | according to the current rounding mode.  If `a' is a NaN, the largest
4782 | positive integer is returned.  Otherwise, if the conversion overflows, the
4783 | largest integer with the same sign as `a' is returned.
4784 *----------------------------------------------------------------------------*/
4785
4786 int64 float128_to_int64( float128 a STATUS_PARAM )
4787 {
4788     flag aSign;
4789     int32 aExp, shiftCount;
4790     bits64 aSig0, aSig1;
4791
4792     aSig1 = extractFloat128Frac1( a );
4793     aSig0 = extractFloat128Frac0( a );
4794     aExp = extractFloat128Exp( a );
4795     aSign = extractFloat128Sign( a );
4796     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4797     shiftCount = 0x402F - aExp;
4798     if ( shiftCount <= 0 ) {
4799         if ( 0x403E < aExp ) {
4800             float_raise( float_flag_invalid STATUS_VAR);
4801             if (    ! aSign
4802                  || (    ( aExp == 0x7FFF )
4803                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4804                     )
4805                ) {
4806                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4807             }
4808             return (sbits64) LIT64( 0x8000000000000000 );
4809         }
4810         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4811     }
4812     else {
4813         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4814     }
4815     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4816
4817 }
4818
4819 /*----------------------------------------------------------------------------
4820 | Returns the result of converting the quadruple-precision floating-point
4821 | value `a' to the 64-bit two's complement integer format.  The conversion
4822 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4823 | Arithmetic, except that the conversion is always rounded toward zero.
4824 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4825 | the conversion overflows, the largest integer with the same sign as `a' is
4826 | returned.
4827 *----------------------------------------------------------------------------*/
4828
4829 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4830 {
4831     flag aSign;
4832     int32 aExp, shiftCount;
4833     bits64 aSig0, aSig1;
4834     int64 z;
4835
4836     aSig1 = extractFloat128Frac1( a );
4837     aSig0 = extractFloat128Frac0( a );
4838     aExp = extractFloat128Exp( a );
4839     aSign = extractFloat128Sign( a );
4840     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4841     shiftCount = aExp - 0x402F;
4842     if ( 0 < shiftCount ) {
4843         if ( 0x403E <= aExp ) {
4844             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4845             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4846                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4847                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4848             }
4849             else {
4850                 float_raise( float_flag_invalid STATUS_VAR);
4851                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4852                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4853                 }
4854             }
4855             return (sbits64) LIT64( 0x8000000000000000 );
4856         }
4857         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4858         if ( (bits64) ( aSig1<<shiftCount ) ) {
4859             STATUS(float_exception_flags) |= float_flag_inexact;
4860         }
4861     }
4862     else {
4863         if ( aExp < 0x3FFF ) {
4864             if ( aExp | aSig0 | aSig1 ) {
4865                 STATUS(float_exception_flags) |= float_flag_inexact;
4866             }
4867             return 0;
4868         }
4869         z = aSig0>>( - shiftCount );
4870         if (    aSig1
4871              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4872             STATUS(float_exception_flags) |= float_flag_inexact;
4873         }
4874     }
4875     if ( aSign ) z = - z;
4876     return z;
4877
4878 }
4879
4880 /*----------------------------------------------------------------------------
4881 | Returns the result of converting the quadruple-precision floating-point
4882 | value `a' to the single-precision floating-point format.  The conversion
4883 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4884 | Arithmetic.
4885 *----------------------------------------------------------------------------*/
4886
4887 float32 float128_to_float32( float128 a STATUS_PARAM )
4888 {
4889     flag aSign;
4890     int32 aExp;
4891     bits64 aSig0, aSig1;
4892     bits32 zSig;
4893
4894     aSig1 = extractFloat128Frac1( a );
4895     aSig0 = extractFloat128Frac0( a );
4896     aExp = extractFloat128Exp( a );
4897     aSign = extractFloat128Sign( a );
4898     if ( aExp == 0x7FFF ) {
4899         if ( aSig0 | aSig1 ) {
4900             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4901         }
4902         return packFloat32( aSign, 0xFF, 0 );
4903     }
4904     aSig0 |= ( aSig1 != 0 );
4905     shift64RightJamming( aSig0, 18, &aSig0 );
4906     zSig = aSig0;
4907     if ( aExp || zSig ) {
4908         zSig |= 0x40000000;
4909         aExp -= 0x3F81;
4910     }
4911     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4912
4913 }
4914
4915 /*----------------------------------------------------------------------------
4916 | Returns the result of converting the quadruple-precision floating-point
4917 | value `a' to the double-precision floating-point format.  The conversion
4918 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4919 | Arithmetic.
4920 *----------------------------------------------------------------------------*/
4921
4922 float64 float128_to_float64( float128 a STATUS_PARAM )
4923 {
4924     flag aSign;
4925     int32 aExp;
4926     bits64 aSig0, aSig1;
4927
4928     aSig1 = extractFloat128Frac1( a );
4929     aSig0 = extractFloat128Frac0( a );
4930     aExp = extractFloat128Exp( a );
4931     aSign = extractFloat128Sign( a );
4932     if ( aExp == 0x7FFF ) {
4933         if ( aSig0 | aSig1 ) {
4934             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4935         }
4936         return packFloat64( aSign, 0x7FF, 0 );
4937     }
4938     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4939     aSig0 |= ( aSig1 != 0 );
4940     if ( aExp || aSig0 ) {
4941         aSig0 |= LIT64( 0x4000000000000000 );
4942         aExp -= 0x3C01;
4943     }
4944     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4945
4946 }
4947
4948 #ifdef FLOATX80
4949
4950 /*----------------------------------------------------------------------------
4951 | Returns the result of converting the quadruple-precision floating-point
4952 | value `a' to the extended double-precision floating-point format.  The
4953 | conversion is performed according to the IEC/IEEE Standard for Binary
4954 | Floating-Point Arithmetic.
4955 *----------------------------------------------------------------------------*/
4956
4957 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4958 {
4959     flag aSign;
4960     int32 aExp;
4961     bits64 aSig0, aSig1;
4962
4963     aSig1 = extractFloat128Frac1( a );
4964     aSig0 = extractFloat128Frac0( a );
4965     aExp = extractFloat128Exp( a );
4966     aSign = extractFloat128Sign( a );
4967     if ( aExp == 0x7FFF ) {
4968         if ( aSig0 | aSig1 ) {
4969             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4970         }
4971         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4972     }
4973     if ( aExp == 0 ) {
4974         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4975         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4976     }
4977     else {
4978         aSig0 |= LIT64( 0x0001000000000000 );
4979     }
4980     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4981     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4982
4983 }
4984
4985 #endif
4986
4987 /*----------------------------------------------------------------------------
4988 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4989 | returns the result as a quadruple-precision floating-point value.  The
4990 | operation is performed according to the IEC/IEEE Standard for Binary
4991 | Floating-Point Arithmetic.
4992 *----------------------------------------------------------------------------*/
4993
4994 float128 float128_round_to_int( float128 a STATUS_PARAM )
4995 {
4996     flag aSign;
4997     int32 aExp;
4998     bits64 lastBitMask, roundBitsMask;
4999     int8 roundingMode;
5000     float128 z;
5001
5002     aExp = extractFloat128Exp( a );
5003     if ( 0x402F <= aExp ) {
5004         if ( 0x406F <= aExp ) {
5005             if (    ( aExp == 0x7FFF )
5006                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5007                ) {
5008                 return propagateFloat128NaN( a, a STATUS_VAR );
5009             }
5010             return a;
5011         }
5012         lastBitMask = 1;
5013         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5014         roundBitsMask = lastBitMask - 1;
5015         z = a;
5016         roundingMode = STATUS(float_rounding_mode);
5017         if ( roundingMode == float_round_nearest_even ) {
5018             if ( lastBitMask ) {
5019                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5020                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5021             }
5022             else {
5023                 if ( (sbits64) z.low < 0 ) {
5024                     ++z.high;
5025                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
5026                 }
5027             }
5028         }
5029         else if ( roundingMode != float_round_to_zero ) {
5030             if (   extractFloat128Sign( z )
5031                  ^ ( roundingMode == float_round_up ) ) {
5032                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5033             }
5034         }
5035         z.low &= ~ roundBitsMask;
5036     }
5037     else {
5038         if ( aExp < 0x3FFF ) {
5039             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5040             STATUS(float_exception_flags) |= float_flag_inexact;
5041             aSign = extractFloat128Sign( a );
5042             switch ( STATUS(float_rounding_mode) ) {
5043              case float_round_nearest_even:
5044                 if (    ( aExp == 0x3FFE )
5045                      && (   extractFloat128Frac0( a )
5046                           | extractFloat128Frac1( a ) )
5047                    ) {
5048                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5049                 }
5050                 break;
5051              case float_round_down:
5052                 return
5053                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5054                     : packFloat128( 0, 0, 0, 0 );
5055              case float_round_up:
5056                 return
5057                       aSign ? packFloat128( 1, 0, 0, 0 )
5058                     : packFloat128( 0, 0x3FFF, 0, 0 );
5059             }
5060             return packFloat128( aSign, 0, 0, 0 );
5061         }
5062         lastBitMask = 1;
5063         lastBitMask <<= 0x402F - aExp;
5064         roundBitsMask = lastBitMask - 1;
5065         z.low = 0;
5066         z.high = a.high;
5067         roundingMode = STATUS(float_rounding_mode);
5068         if ( roundingMode == float_round_nearest_even ) {
5069             z.high += lastBitMask>>1;
5070             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5071                 z.high &= ~ lastBitMask;
5072             }
5073         }
5074         else if ( roundingMode != float_round_to_zero ) {
5075             if (   extractFloat128Sign( z )
5076                  ^ ( roundingMode == float_round_up ) ) {
5077                 z.high |= ( a.low != 0 );
5078                 z.high += roundBitsMask;
5079             }
5080         }
5081         z.high &= ~ roundBitsMask;
5082     }
5083     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5084         STATUS(float_exception_flags) |= float_flag_inexact;
5085     }
5086     return z;
5087
5088 }
5089
5090 /*----------------------------------------------------------------------------
5091 | Returns the result of adding the absolute values of the quadruple-precision
5092 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5093 | before being returned.  `zSign' is ignored if the result is a NaN.
5094 | The addition is performed according to the IEC/IEEE Standard for Binary
5095 | Floating-Point Arithmetic.
5096 *----------------------------------------------------------------------------*/
5097
5098 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5099 {
5100     int32 aExp, bExp, zExp;
5101     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5102     int32 expDiff;
5103
5104     aSig1 = extractFloat128Frac1( a );
5105     aSig0 = extractFloat128Frac0( a );
5106     aExp = extractFloat128Exp( a );
5107     bSig1 = extractFloat128Frac1( b );
5108     bSig0 = extractFloat128Frac0( b );
5109     bExp = extractFloat128Exp( b );
5110     expDiff = aExp - bExp;
5111     if ( 0 < expDiff ) {
5112         if ( aExp == 0x7FFF ) {
5113             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5114             return a;
5115         }
5116         if ( bExp == 0 ) {
5117             --expDiff;
5118         }
5119         else {
5120             bSig0 |= LIT64( 0x0001000000000000 );
5121         }
5122         shift128ExtraRightJamming(
5123             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5124         zExp = aExp;
5125     }
5126     else if ( expDiff < 0 ) {
5127         if ( bExp == 0x7FFF ) {
5128             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5129             return packFloat128( zSign, 0x7FFF, 0, 0 );
5130         }
5131         if ( aExp == 0 ) {
5132             ++expDiff;
5133         }
5134         else {
5135             aSig0 |= LIT64( 0x0001000000000000 );
5136         }
5137         shift128ExtraRightJamming(
5138             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5139         zExp = bExp;
5140     }
5141     else {
5142         if ( aExp == 0x7FFF ) {
5143             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5144                 return propagateFloat128NaN( a, b STATUS_VAR );
5145             }
5146             return a;
5147         }
5148         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5149         if ( aExp == 0 ) {
5150             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5151             return packFloat128( zSign, 0, zSig0, zSig1 );
5152         }
5153         zSig2 = 0;
5154         zSig0 |= LIT64( 0x0002000000000000 );
5155         zExp = aExp;
5156         goto shiftRight1;
5157     }
5158     aSig0 |= LIT64( 0x0001000000000000 );
5159     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5160     --zExp;
5161     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5162     ++zExp;
5163  shiftRight1:
5164     shift128ExtraRightJamming(
5165         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5166  roundAndPack:
5167     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5168
5169 }
5170
5171 /*----------------------------------------------------------------------------
5172 | Returns the result of subtracting the absolute values of the quadruple-
5173 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5174 | difference is negated before being returned.  `zSign' is ignored if the
5175 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5176 | Standard for Binary Floating-Point Arithmetic.
5177 *----------------------------------------------------------------------------*/
5178
5179 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5180 {
5181     int32 aExp, bExp, zExp;
5182     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5183     int32 expDiff;
5184     float128 z;
5185
5186     aSig1 = extractFloat128Frac1( a );
5187     aSig0 = extractFloat128Frac0( a );
5188     aExp = extractFloat128Exp( a );
5189     bSig1 = extractFloat128Frac1( b );
5190     bSig0 = extractFloat128Frac0( b );
5191     bExp = extractFloat128Exp( b );
5192     expDiff = aExp - bExp;
5193     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5194     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5195     if ( 0 < expDiff ) goto aExpBigger;
5196     if ( expDiff < 0 ) goto bExpBigger;
5197     if ( aExp == 0x7FFF ) {
5198         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5199             return propagateFloat128NaN( a, b STATUS_VAR );
5200         }
5201         float_raise( float_flag_invalid STATUS_VAR);
5202         z.low = float128_default_nan_low;
5203         z.high = float128_default_nan_high;
5204         return z;
5205     }
5206     if ( aExp == 0 ) {
5207         aExp = 1;
5208         bExp = 1;
5209     }
5210     if ( bSig0 < aSig0 ) goto aBigger;
5211     if ( aSig0 < bSig0 ) goto bBigger;
5212     if ( bSig1 < aSig1 ) goto aBigger;
5213     if ( aSig1 < bSig1 ) goto bBigger;
5214     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5215  bExpBigger:
5216     if ( bExp == 0x7FFF ) {
5217         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5218         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5219     }
5220     if ( aExp == 0 ) {
5221         ++expDiff;
5222     }
5223     else {
5224         aSig0 |= LIT64( 0x4000000000000000 );
5225     }
5226     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5227     bSig0 |= LIT64( 0x4000000000000000 );
5228  bBigger:
5229     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5230     zExp = bExp;
5231     zSign ^= 1;
5232     goto normalizeRoundAndPack;
5233  aExpBigger:
5234     if ( aExp == 0x7FFF ) {
5235         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5236         return a;
5237     }
5238     if ( bExp == 0 ) {
5239         --expDiff;
5240     }
5241     else {
5242         bSig0 |= LIT64( 0x4000000000000000 );
5243     }
5244     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5245     aSig0 |= LIT64( 0x4000000000000000 );
5246  aBigger:
5247     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5248     zExp = aExp;
5249  normalizeRoundAndPack:
5250     --zExp;
5251     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5252
5253 }
5254
5255 /*----------------------------------------------------------------------------
5256 | Returns the result of adding the quadruple-precision floating-point values
5257 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5258 | for Binary Floating-Point Arithmetic.
5259 *----------------------------------------------------------------------------*/
5260
5261 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5262 {
5263     flag aSign, bSign;
5264
5265     aSign = extractFloat128Sign( a );
5266     bSign = extractFloat128Sign( b );
5267     if ( aSign == bSign ) {
5268         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5269     }
5270     else {
5271         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5272     }
5273
5274 }
5275
5276 /*----------------------------------------------------------------------------
5277 | Returns the result of subtracting the quadruple-precision floating-point
5278 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5279 | Standard for Binary Floating-Point Arithmetic.
5280 *----------------------------------------------------------------------------*/
5281
5282 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5283 {
5284     flag aSign, bSign;
5285
5286     aSign = extractFloat128Sign( a );
5287     bSign = extractFloat128Sign( b );
5288     if ( aSign == bSign ) {
5289         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5290     }
5291     else {
5292         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5293     }
5294
5295 }
5296
5297 /*----------------------------------------------------------------------------
5298 | Returns the result of multiplying the quadruple-precision floating-point
5299 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5300 | Standard for Binary Floating-Point Arithmetic.
5301 *----------------------------------------------------------------------------*/
5302
5303 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5304 {
5305     flag aSign, bSign, zSign;
5306     int32 aExp, bExp, zExp;
5307     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5308     float128 z;
5309
5310     aSig1 = extractFloat128Frac1( a );
5311     aSig0 = extractFloat128Frac0( a );
5312     aExp = extractFloat128Exp( a );
5313     aSign = extractFloat128Sign( a );
5314     bSig1 = extractFloat128Frac1( b );
5315     bSig0 = extractFloat128Frac0( b );
5316     bExp = extractFloat128Exp( b );
5317     bSign = extractFloat128Sign( b );
5318     zSign = aSign ^ bSign;
5319     if ( aExp == 0x7FFF ) {
5320         if (    ( aSig0 | aSig1 )
5321              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5322             return propagateFloat128NaN( a, b STATUS_VAR );
5323         }
5324         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5325         return packFloat128( zSign, 0x7FFF, 0, 0 );
5326     }
5327     if ( bExp == 0x7FFF ) {
5328         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5329         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5330  invalid:
5331             float_raise( float_flag_invalid STATUS_VAR);
5332             z.low = float128_default_nan_low;
5333             z.high = float128_default_nan_high;
5334             return z;
5335         }
5336         return packFloat128( zSign, 0x7FFF, 0, 0 );
5337     }
5338     if ( aExp == 0 ) {
5339         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5340         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5341     }
5342     if ( bExp == 0 ) {
5343         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5344         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5345     }
5346     zExp = aExp + bExp - 0x4000;
5347     aSig0 |= LIT64( 0x0001000000000000 );
5348     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5349     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5350     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5351     zSig2 |= ( zSig3 != 0 );
5352     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5353         shift128ExtraRightJamming(
5354             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5355         ++zExp;
5356     }
5357     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5358
5359 }
5360
5361 /*----------------------------------------------------------------------------
5362 | Returns the result of dividing the quadruple-precision floating-point value
5363 | `a' by the corresponding value `b'.  The operation is performed according to
5364 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5365 *----------------------------------------------------------------------------*/
5366
5367 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5368 {
5369     flag aSign, bSign, zSign;
5370     int32 aExp, bExp, zExp;
5371     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5372     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5373     float128 z;
5374
5375     aSig1 = extractFloat128Frac1( a );
5376     aSig0 = extractFloat128Frac0( a );
5377     aExp = extractFloat128Exp( a );
5378     aSign = extractFloat128Sign( a );
5379     bSig1 = extractFloat128Frac1( b );
5380     bSig0 = extractFloat128Frac0( b );
5381     bExp = extractFloat128Exp( b );
5382     bSign = extractFloat128Sign( b );
5383     zSign = aSign ^ bSign;
5384     if ( aExp == 0x7FFF ) {
5385         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5386         if ( bExp == 0x7FFF ) {
5387             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5388             goto invalid;
5389         }
5390         return packFloat128( zSign, 0x7FFF, 0, 0 );
5391     }
5392     if ( bExp == 0x7FFF ) {
5393         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5394         return packFloat128( zSign, 0, 0, 0 );
5395     }
5396     if ( bExp == 0 ) {
5397         if ( ( bSig0 | bSig1 ) == 0 ) {
5398             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5399  invalid:
5400                 float_raise( float_flag_invalid STATUS_VAR);
5401                 z.low = float128_default_nan_low;
5402                 z.high = float128_default_nan_high;
5403                 return z;
5404             }
5405             float_raise( float_flag_divbyzero STATUS_VAR);
5406             return packFloat128( zSign, 0x7FFF, 0, 0 );
5407         }
5408         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5409     }
5410     if ( aExp == 0 ) {
5411         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5412         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5413     }
5414     zExp = aExp - bExp + 0x3FFD;
5415     shortShift128Left(
5416         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5417     shortShift128Left(
5418         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5419     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5420         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5421         ++zExp;
5422     }
5423     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5424     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5425     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5426     while ( (sbits64) rem0 < 0 ) {
5427         --zSig0;
5428         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5429     }
5430     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5431     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5432         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5433         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5434         while ( (sbits64) rem1 < 0 ) {
5435             --zSig1;
5436             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5437         }
5438         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5439     }
5440     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5441     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5442
5443 }
5444
5445 /*----------------------------------------------------------------------------
5446 | Returns the remainder of the quadruple-precision floating-point value `a'
5447 | with respect to the corresponding value `b'.  The operation is performed
5448 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5449 *----------------------------------------------------------------------------*/
5450
5451 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5452 {
5453     flag aSign, zSign;
5454     int32 aExp, bExp, expDiff;
5455     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5456     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5457     sbits64 sigMean0;
5458     float128 z;
5459
5460     aSig1 = extractFloat128Frac1( a );
5461     aSig0 = extractFloat128Frac0( a );
5462     aExp = extractFloat128Exp( a );
5463     aSign = extractFloat128Sign( a );
5464     bSig1 = extractFloat128Frac1( b );
5465     bSig0 = extractFloat128Frac0( b );
5466     bExp = extractFloat128Exp( b );
5467     if ( aExp == 0x7FFF ) {
5468         if (    ( aSig0 | aSig1 )
5469              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5470             return propagateFloat128NaN( a, b STATUS_VAR );
5471         }
5472         goto invalid;
5473     }
5474     if ( bExp == 0x7FFF ) {
5475         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5476         return a;
5477     }
5478     if ( bExp == 0 ) {
5479         if ( ( bSig0 | bSig1 ) == 0 ) {
5480  invalid:
5481             float_raise( float_flag_invalid STATUS_VAR);
5482             z.low = float128_default_nan_low;
5483             z.high = float128_default_nan_high;
5484             return z;
5485         }
5486         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5487     }
5488     if ( aExp == 0 ) {
5489         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5490         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5491     }
5492     expDiff = aExp - bExp;
5493     if ( expDiff < -1 ) return a;
5494     shortShift128Left(
5495         aSig0 | LIT64( 0x0001000000000000 ),
5496         aSig1,
5497         15 - ( expDiff < 0 ),
5498         &aSig0,
5499         &aSig1
5500     );
5501     shortShift128Left(
5502         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5503     q = le128( bSig0, bSig1, aSig0, aSig1 );
5504     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5505     expDiff -= 64;
5506     while ( 0 < expDiff ) {
5507         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5508         q = ( 4 < q ) ? q - 4 : 0;
5509         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5510         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5511         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5512         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5513         expDiff -= 61;
5514     }
5515     if ( -64 < expDiff ) {
5516         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5517         q = ( 4 < q ) ? q - 4 : 0;
5518         q >>= - expDiff;
5519         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5520         expDiff += 52;
5521         if ( expDiff < 0 ) {
5522             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5523         }
5524         else {
5525             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5526         }
5527         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5528         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5529     }
5530     else {
5531         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5532         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5533     }
5534     do {
5535         alternateASig0 = aSig0;
5536         alternateASig1 = aSig1;
5537         ++q;
5538         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5539     } while ( 0 <= (sbits64) aSig0 );
5540     add128(
5541         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5542     if (    ( sigMean0 < 0 )
5543          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5544         aSig0 = alternateASig0;
5545         aSig1 = alternateASig1;
5546     }
5547     zSign = ( (sbits64) aSig0 < 0 );
5548     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5549     return
5550         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5551
5552 }
5553
5554 /*----------------------------------------------------------------------------
5555 | Returns the square root of the quadruple-precision floating-point value `a'.
5556 | The operation is performed according to the IEC/IEEE Standard for Binary
5557 | Floating-Point Arithmetic.
5558 *----------------------------------------------------------------------------*/
5559
5560 float128 float128_sqrt( float128 a STATUS_PARAM )
5561 {
5562     flag aSign;
5563     int32 aExp, zExp;
5564     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5565     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5566     float128 z;
5567
5568     aSig1 = extractFloat128Frac1( a );
5569     aSig0 = extractFloat128Frac0( a );
5570     aExp = extractFloat128Exp( a );
5571     aSign = extractFloat128Sign( a );
5572     if ( aExp == 0x7FFF ) {
5573         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5574         if ( ! aSign ) return a;
5575         goto invalid;
5576     }
5577     if ( aSign ) {
5578         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5579  invalid:
5580         float_raise( float_flag_invalid STATUS_VAR);
5581         z.low = float128_default_nan_low;
5582         z.high = float128_default_nan_high;
5583         return z;
5584     }
5585     if ( aExp == 0 ) {
5586         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5587         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5588     }
5589     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5590     aSig0 |= LIT64( 0x0001000000000000 );
5591     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5592     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5593     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5594     doubleZSig0 = zSig0<<1;
5595     mul64To128( zSig0, zSig0, &term0, &term1 );
5596     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5597     while ( (sbits64) rem0 < 0 ) {
5598         --zSig0;
5599         doubleZSig0 -= 2;
5600         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5601     }
5602     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5603     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5604         if ( zSig1 == 0 ) zSig1 = 1;
5605         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5606         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5607         mul64To128( zSig1, zSig1, &term2, &term3 );
5608         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5609         while ( (sbits64) rem1 < 0 ) {
5610             --zSig1;
5611             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5612             term3 |= 1;
5613             term2 |= doubleZSig0;
5614             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5615         }
5616         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5617     }
5618     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5619     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5620
5621 }
5622
5623 /*----------------------------------------------------------------------------
5624 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5625 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5626 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5627 *----------------------------------------------------------------------------*/
5628
5629 int float128_eq( float128 a, float128 b STATUS_PARAM )
5630 {
5631
5632     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5633               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5634          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5635               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5636        ) {
5637         if (    float128_is_signaling_nan( a )
5638              || float128_is_signaling_nan( b ) ) {
5639             float_raise( float_flag_invalid STATUS_VAR);
5640         }
5641         return 0;
5642     }
5643     return
5644            ( a.low == b.low )
5645         && (    ( a.high == b.high )
5646              || (    ( a.low == 0 )
5647                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5648            );
5649
5650 }
5651
5652 /*----------------------------------------------------------------------------
5653 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5654 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5655 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5656 | Arithmetic.
5657 *----------------------------------------------------------------------------*/
5658
5659 int float128_le( float128 a, float128 b STATUS_PARAM )
5660 {
5661     flag aSign, bSign;
5662
5663     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5664               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5665          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5666               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5667        ) {
5668         float_raise( float_flag_invalid STATUS_VAR);
5669         return 0;
5670     }
5671     aSign = extractFloat128Sign( a );
5672     bSign = extractFloat128Sign( b );
5673     if ( aSign != bSign ) {
5674         return
5675                aSign
5676             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5677                  == 0 );
5678     }
5679     return
5680           aSign ? le128( b.high, b.low, a.high, a.low )
5681         : le128( a.high, a.low, b.high, b.low );
5682
5683 }
5684
5685 /*----------------------------------------------------------------------------
5686 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5687 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5688 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5689 *----------------------------------------------------------------------------*/
5690
5691 int float128_lt( float128 a, float128 b STATUS_PARAM )
5692 {
5693     flag aSign, bSign;
5694
5695     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5696               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5697          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5698               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5699        ) {
5700         float_raise( float_flag_invalid STATUS_VAR);
5701         return 0;
5702     }
5703     aSign = extractFloat128Sign( a );
5704     bSign = extractFloat128Sign( b );
5705     if ( aSign != bSign ) {
5706         return
5707                aSign
5708             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5709                  != 0 );
5710     }
5711     return
5712           aSign ? lt128( b.high, b.low, a.high, a.low )
5713         : lt128( a.high, a.low, b.high, b.low );
5714
5715 }
5716
5717 /*----------------------------------------------------------------------------
5718 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5719 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5720 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5721 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5722 *----------------------------------------------------------------------------*/
5723
5724 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5725 {
5726
5727     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5728               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5729          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5730               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5731        ) {
5732         float_raise( float_flag_invalid STATUS_VAR);
5733         return 0;
5734     }
5735     return
5736            ( a.low == b.low )
5737         && (    ( a.high == b.high )
5738              || (    ( a.low == 0 )
5739                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5740            );
5741
5742 }
5743
5744 /*----------------------------------------------------------------------------
5745 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5746 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5747 | cause an exception.  Otherwise, the comparison is performed according to the
5748 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5749 *----------------------------------------------------------------------------*/
5750
5751 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5752 {
5753     flag aSign, bSign;
5754
5755     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5756               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5757          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5758               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5759        ) {
5760         if (    float128_is_signaling_nan( a )
5761              || float128_is_signaling_nan( b ) ) {
5762             float_raise( float_flag_invalid STATUS_VAR);
5763         }
5764         return 0;
5765     }
5766     aSign = extractFloat128Sign( a );
5767     bSign = extractFloat128Sign( b );
5768     if ( aSign != bSign ) {
5769         return
5770                aSign
5771             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5772                  == 0 );
5773     }
5774     return
5775           aSign ? le128( b.high, b.low, a.high, a.low )
5776         : le128( a.high, a.low, b.high, b.low );
5777
5778 }
5779
5780 /*----------------------------------------------------------------------------
5781 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5782 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5783 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5784 | Standard for Binary Floating-Point Arithmetic.
5785 *----------------------------------------------------------------------------*/
5786
5787 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5788 {
5789     flag aSign, bSign;
5790
5791     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5792               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5793          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5794               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5795        ) {
5796         if (    float128_is_signaling_nan( a )
5797              || float128_is_signaling_nan( b ) ) {
5798             float_raise( float_flag_invalid STATUS_VAR);
5799         }
5800         return 0;
5801     }
5802     aSign = extractFloat128Sign( a );
5803     bSign = extractFloat128Sign( b );
5804     if ( aSign != bSign ) {
5805         return
5806                aSign
5807             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5808                  != 0 );
5809     }
5810     return
5811           aSign ? lt128( b.high, b.low, a.high, a.low )
5812         : lt128( a.high, a.low, b.high, b.low );
5813
5814 }
5815
5816 #endif
5817
5818 /* misc functions */
5819 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5820 {
5821     return int64_to_float32(a STATUS_VAR);
5822 }
5823
5824 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5825 {
5826     return int64_to_float64(a STATUS_VAR);
5827 }
5828
5829 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5830 {
5831     int64_t v;
5832     unsigned int res;
5833
5834     v = float32_to_int64(a STATUS_VAR);
5835     if (v < 0) {
5836         res = 0;
5837         float_raise( float_flag_invalid STATUS_VAR);
5838     } else if (v > 0xffffffff) {
5839         res = 0xffffffff;
5840         float_raise( float_flag_invalid STATUS_VAR);
5841     } else {
5842         res = v;
5843     }
5844     return res;
5845 }
5846
5847 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5848 {
5849     int64_t v;
5850     unsigned int res;
5851
5852     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5853     if (v < 0) {
5854         res = 0;
5855         float_raise( float_flag_invalid STATUS_VAR);
5856     } else if (v > 0xffffffff) {
5857         res = 0xffffffff;
5858         float_raise( float_flag_invalid STATUS_VAR);
5859     } else {
5860         res = v;
5861     }
5862     return res;
5863 }
5864
5865 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5866 {
5867     int64_t v;
5868     unsigned int res;
5869
5870     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5871     if (v < 0) {
5872         res = 0;
5873         float_raise( float_flag_invalid STATUS_VAR);
5874     } else if (v > 0xffff) {
5875         res = 0xffff;
5876         float_raise( float_flag_invalid STATUS_VAR);
5877     } else {
5878         res = v;
5879     }
5880     return res;
5881 }
5882
5883 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5884 {
5885     int64_t v;
5886     unsigned int res;
5887
5888     v = float64_to_int64(a STATUS_VAR);
5889     if (v < 0) {
5890         res = 0;
5891         float_raise( float_flag_invalid STATUS_VAR);
5892     } else if (v > 0xffffffff) {
5893         res = 0xffffffff;
5894         float_raise( float_flag_invalid STATUS_VAR);
5895     } else {
5896         res = v;
5897     }
5898     return res;
5899 }
5900
5901 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5902 {
5903     int64_t v;
5904     unsigned int res;
5905
5906     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5907     if (v < 0) {
5908         res = 0;
5909         float_raise( float_flag_invalid STATUS_VAR);
5910     } else if (v > 0xffffffff) {
5911         res = 0xffffffff;
5912         float_raise( float_flag_invalid STATUS_VAR);
5913     } else {
5914         res = v;
5915     }
5916     return res;
5917 }
5918
5919 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5920 {
5921     int64_t v;
5922     unsigned int res;
5923
5924     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5925     if (v < 0) {
5926         res = 0;
5927         float_raise( float_flag_invalid STATUS_VAR);
5928     } else if (v > 0xffff) {
5929         res = 0xffff;
5930         float_raise( float_flag_invalid STATUS_VAR);
5931     } else {
5932         res = v;
5933     }
5934     return res;
5935 }
5936
5937 /* FIXME: This looks broken.  */
5938 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5939 {
5940     int64_t v;
5941
5942     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5943     v += float64_val(a);
5944     v = float64_to_int64(make_float64(v) STATUS_VAR);
5945
5946     return v - INT64_MIN;
5947 }
5948
5949 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5950 {
5951     int64_t v;
5952
5953     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5954     v += float64_val(a);
5955     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5956
5957     return v - INT64_MIN;
5958 }
5959
5960 #define COMPARE(s, nan_exp)                                                  \
5961 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5962                                       int is_quiet STATUS_PARAM )            \
5963 {                                                                            \
5964     flag aSign, bSign;                                                       \
5965     bits ## s av, bv;                                                        \
5966     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
5967     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
5968                                                                              \
5969     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5970          extractFloat ## s ## Frac( a ) ) ||                                 \
5971         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5972           extractFloat ## s ## Frac( b ) )) {                                \
5973         if (!is_quiet ||                                                     \
5974             float ## s ## _is_signaling_nan( a ) ||                          \
5975             float ## s ## _is_signaling_nan( b ) ) {                         \
5976             float_raise( float_flag_invalid STATUS_VAR);                     \
5977         }                                                                    \
5978         return float_relation_unordered;                                     \
5979     }                                                                        \
5980     aSign = extractFloat ## s ## Sign( a );                                  \
5981     bSign = extractFloat ## s ## Sign( b );                                  \
5982     av = float ## s ## _val(a);                                              \
5983     bv = float ## s ## _val(b);                                              \
5984     if ( aSign != bSign ) {                                                  \
5985         if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5986             /* zero case */                                                  \
5987             return float_relation_equal;                                     \
5988         } else {                                                             \
5989             return 1 - (2 * aSign);                                          \
5990         }                                                                    \
5991     } else {                                                                 \
5992         if (av == bv) {                                                      \
5993             return float_relation_equal;                                     \
5994         } else {                                                             \
5995             return 1 - 2 * (aSign ^ ( av < bv ));                            \
5996         }                                                                    \
5997     }                                                                        \
5998 }                                                                            \
5999                                                                              \
6000 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6001 {                                                                            \
6002     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6003 }                                                                            \
6004                                                                              \
6005 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6006 {                                                                            \
6007     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6008 }
6009
6010 COMPARE(32, 0xff)
6011 COMPARE(64, 0x7ff)
6012
6013 INLINE int float128_compare_internal( float128 a, float128 b,
6014                                       int is_quiet STATUS_PARAM )
6015 {
6016     flag aSign, bSign;
6017
6018     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6019           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6020         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6021           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6022         if (!is_quiet ||
6023             float128_is_signaling_nan( a ) ||
6024             float128_is_signaling_nan( b ) ) {
6025             float_raise( float_flag_invalid STATUS_VAR);
6026         }
6027         return float_relation_unordered;
6028     }
6029     aSign = extractFloat128Sign( a );
6030     bSign = extractFloat128Sign( b );
6031     if ( aSign != bSign ) {
6032         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6033             /* zero case */
6034             return float_relation_equal;
6035         } else {
6036             return 1 - (2 * aSign);
6037         }
6038     } else {
6039         if (a.low == b.low && a.high == b.high) {
6040             return float_relation_equal;
6041         } else {
6042             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6043         }
6044     }
6045 }
6046
6047 int float128_compare( float128 a, float128 b STATUS_PARAM )
6048 {
6049     return float128_compare_internal(a, b, 0 STATUS_VAR);
6050 }
6051
6052 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6053 {
6054     return float128_compare_internal(a, b, 1 STATUS_VAR);
6055 }
6056
6057 /* Multiply A by 2 raised to the power N.  */
6058 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6059 {
6060     flag aSign;
6061     int16 aExp;
6062     bits32 aSig;
6063
6064     a = float32_squash_input_denormal(a STATUS_VAR);
6065     aSig = extractFloat32Frac( a );
6066     aExp = extractFloat32Exp( a );
6067     aSign = extractFloat32Sign( a );
6068
6069     if ( aExp == 0xFF ) {
6070         return a;
6071     }
6072     if ( aExp != 0 )
6073         aSig |= 0x00800000;
6074     else if ( aSig == 0 )
6075         return a;
6076
6077     aExp += n - 1;
6078     aSig <<= 7;
6079     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6080 }
6081
6082 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6083 {
6084     flag aSign;
6085     int16 aExp;
6086     bits64 aSig;
6087
6088     a = float64_squash_input_denormal(a STATUS_VAR);
6089     aSig = extractFloat64Frac( a );
6090     aExp = extractFloat64Exp( a );
6091     aSign = extractFloat64Sign( a );
6092
6093     if ( aExp == 0x7FF ) {
6094         return a;
6095     }
6096     if ( aExp != 0 )
6097         aSig |= LIT64( 0x0010000000000000 );
6098     else if ( aSig == 0 )
6099         return a;
6100
6101     aExp += n - 1;
6102     aSig <<= 10;
6103     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6104 }
6105
6106 #ifdef FLOATX80
6107 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6108 {
6109     flag aSign;
6110     int16 aExp;
6111     bits64 aSig;
6112
6113     aSig = extractFloatx80Frac( a );
6114     aExp = extractFloatx80Exp( a );
6115     aSign = extractFloatx80Sign( a );
6116
6117     if ( aExp == 0x7FF ) {
6118         return a;
6119     }
6120     if (aExp == 0 && aSig == 0)
6121         return a;
6122
6123     aExp += n;
6124     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6125                                           aSign, aExp, aSig, 0 STATUS_VAR );
6126 }
6127 #endif
6128
6129 #ifdef FLOAT128
6130 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6131 {
6132     flag aSign;
6133     int32 aExp;
6134     bits64 aSig0, aSig1;
6135
6136     aSig1 = extractFloat128Frac1( a );
6137     aSig0 = extractFloat128Frac0( a );
6138     aExp = extractFloat128Exp( a );
6139     aSign = extractFloat128Sign( a );
6140     if ( aExp == 0x7FFF ) {
6141         return a;
6142     }
6143     if ( aExp != 0 )
6144         aSig0 |= LIT64( 0x0001000000000000 );
6145     else if ( aSig0 == 0 && aSig1 == 0 )
6146         return a;
6147
6148     aExp += n - 1;
6149     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6150                                           STATUS_VAR );
6151
6152 }
6153 #endif