fpu/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 #include "softfloat.h"
  34
  35 /*----------------------------------------------------------------------------
  36 | Primitive arithmetic functions, including multi-word arithmetic, and
  37 | division and square root approximations.  (Can be specialized to target if
  38 | desired.)
  39 *----------------------------------------------------------------------------*/
  40 #include "softfloat-macros.h"
  41
  42 /*----------------------------------------------------------------------------
  43 | Functions and definitions to determine:  (1) whether tininess for underflow
  44 | is detected before or after rounding by default, (2) what (if anything)
  45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  47 | are propagated from function inputs to output.  These details are target-
  48 | specific.
  49 *----------------------------------------------------------------------------*/
  50 #include "softfloat-specialize.h"
  51
  52 void set_float_rounding_mode(int val STATUS_PARAM)
  53 {
  54     STATUS(float_rounding_mode) = val;
  55 }
  56
  57 void set_float_exception_flags(int val STATUS_PARAM)
  58 {
  59     STATUS(float_exception_flags) = val;
  60 }
  61
  62 #ifdef FLOATX80
  63 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  64 {
  65     STATUS(floatx80_rounding_precision) = val;
  66 }
  67 #endif
  68
  69 /*----------------------------------------------------------------------------
  70 | Returns the fraction bits of the half-precision floating-point value `a'.
  71 *----------------------------------------------------------------------------*/
  72
  73 INLINE uint32_t extractFloat16Frac(float16 a)
  74 {
  75     return float16_val(a) & 0x3ff;
  76 }
  77
  78 /*----------------------------------------------------------------------------
  79 | Returns the exponent bits of the half-precision floating-point value `a'.
  80 *----------------------------------------------------------------------------*/
  81
  82 INLINE int16 extractFloat16Exp(float16 a)
  83 {
  84     return (float16_val(a) >> 10) & 0x1f;
  85 }
  86
  87 /*----------------------------------------------------------------------------
  88 | Returns the sign bit of the single-precision floating-point value `a'.
  89 *----------------------------------------------------------------------------*/
  90
  91 INLINE flag extractFloat16Sign(float16 a)
  92 {
  93     return float16_val(a)>>15;
  94 }
  95
  96 /*----------------------------------------------------------------------------
  97 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  98 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  99 | input.  If `zSign' is 1, the input is negated before being converted to an
 100 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 101 | is simply rounded to an integer, with the inexact exception raised if the
 102 | input cannot be represented exactly as an integer.  However, if the fixed-
 103 | point input is too large, the invalid exception is raised and the largest
 104 | positive or negative integer is returned.
 105 *----------------------------------------------------------------------------*/
 106
 107 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
 108 {
 109     int8 roundingMode;
 110     flag roundNearestEven;
 111     int8 roundIncrement, roundBits;
 112     int32 z;
 113
 114     roundingMode = STATUS(float_rounding_mode);
 115     roundNearestEven = ( roundingMode == float_round_nearest_even );
 116     roundIncrement = 0x40;
 117     if ( ! roundNearestEven ) {
 118         if ( roundingMode == float_round_to_zero ) {
 119             roundIncrement = 0;
 120         }
 121         else {
 122             roundIncrement = 0x7F;
 123             if ( zSign ) {
 124                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 125             }
 126             else {
 127                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 128             }
 129         }
 130     }
 131     roundBits = absZ & 0x7F;
 132     absZ = ( absZ + roundIncrement )>>7;
 133     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 134     z = absZ;
 135     if ( zSign ) z = - z;
 136     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 137         float_raise( float_flag_invalid STATUS_VAR);
 138         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 139     }
 140     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 141     return z;
 142
 143 }
 144
 145 /*----------------------------------------------------------------------------
 146 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 147 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 148 | and returns the properly rounded 64-bit integer corresponding to the input.
 149 | If `zSign' is 1, the input is negated before being converted to an integer.
 150 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 151 | the inexact exception raised if the input cannot be represented exactly as
 152 | an integer.  However, if the fixed-point input is too large, the invalid
 153 | exception is raised and the largest positive or negative integer is
 154 | returned.
 155 *----------------------------------------------------------------------------*/
 156
 157 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 158 {
 159     int8 roundingMode;
 160     flag roundNearestEven, increment;
 161     int64 z;
 162
 163     roundingMode = STATUS(float_rounding_mode);
 164     roundNearestEven = ( roundingMode == float_round_nearest_even );
 165     increment = ( (sbits64) absZ1 < 0 );
 166     if ( ! roundNearestEven ) {
 167         if ( roundingMode == float_round_to_zero ) {
 168             increment = 0;
 169         }
 170         else {
 171             if ( zSign ) {
 172                 increment = ( roundingMode == float_round_down ) && absZ1;
 173             }
 174             else {
 175                 increment = ( roundingMode == float_round_up ) && absZ1;
 176             }
 177         }
 178     }
 179     if ( increment ) {
 180         ++absZ0;
 181         if ( absZ0 == 0 ) goto overflow;
 182         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 183     }
 184     z = absZ0;
 185     if ( zSign ) z = - z;
 186     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 187  overflow:
 188         float_raise( float_flag_invalid STATUS_VAR);
 189         return
 190               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 191             : LIT64( 0x7FFFFFFFFFFFFFFF );
 192     }
 193     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 194     return z;
 195
 196 }
 197
 198 /*----------------------------------------------------------------------------
 199 | Returns the fraction bits of the single-precision floating-point value `a'.
 200 *----------------------------------------------------------------------------*/
 201
 202 INLINE bits32 extractFloat32Frac( float32 a )
 203 {
 204
 205     return float32_val(a) & 0x007FFFFF;
 206
 207 }
 208
 209 /*----------------------------------------------------------------------------
 210 | Returns the exponent bits of the single-precision floating-point value `a'.
 211 *----------------------------------------------------------------------------*/
 212
 213 INLINE int16 extractFloat32Exp( float32 a )
 214 {
 215
 216     return ( float32_val(a)>>23 ) & 0xFF;
 217
 218 }
 219
 220 /*----------------------------------------------------------------------------
 221 | Returns the sign bit of the single-precision floating-point value `a'.
 222 *----------------------------------------------------------------------------*/
 223
 224 INLINE flag extractFloat32Sign( float32 a )
 225 {
 226
 227     return float32_val(a)>>31;
 228
 229 }
 230
 231 /*----------------------------------------------------------------------------
 232 | If `a' is denormal and we are in flush-to-zero mode then set the
 233 | input-denormal exception and return zero. Otherwise just return the value.
 234 *----------------------------------------------------------------------------*/
 235 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 236 {
 237     if (STATUS(flush_inputs_to_zero)) {
 238         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 239             float_raise(float_flag_input_denormal STATUS_VAR);
 240             return make_float32(float32_val(a) & 0x80000000);
 241         }
 242     }
 243     return a;
 244 }
 245
 246 /*----------------------------------------------------------------------------
 247 | Normalizes the subnormal single-precision floating-point value represented
 248 | by the denormalized significand `aSig'.  The normalized exponent and
 249 | significand are stored at the locations pointed to by `zExpPtr' and
 250 | `zSigPtr', respectively.
 251 *----------------------------------------------------------------------------*/
 252
 253 static void
 254  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 255 {
 256     int8 shiftCount;
 257
 258     shiftCount = countLeadingZeros32( aSig ) - 8;
 259     *zSigPtr = aSig<<shiftCount;
 260     *zExpPtr = 1 - shiftCount;
 261
 262 }
 263
 264 /*----------------------------------------------------------------------------
 265 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 266 | single-precision floating-point value, returning the result.  After being
 267 | shifted into the proper positions, the three fields are simply added
 268 | together to form the result.  This means that any integer portion of `zSig'
 269 | will be added into the exponent.  Since a properly normalized significand
 270 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 271 | than the desired result exponent whenever `zSig' is a complete, normalized
 272 | significand.
 273 *----------------------------------------------------------------------------*/
 274
 275 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 276 {
 277
 278     return make_float32(
 279           ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
 280
 281 }
 282
 283 /*----------------------------------------------------------------------------
 284 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 285 | and significand `zSig', and returns the proper single-precision floating-
 286 | point value corresponding to the abstract input.  Ordinarily, the abstract
 287 | value is simply rounded and packed into the single-precision format, with
 288 | the inexact exception raised if the abstract input cannot be represented
 289 | exactly.  However, if the abstract value is too large, the overflow and
 290 | inexact exceptions are raised and an infinity or maximal finite value is
 291 | returned.  If the abstract value is too small, the input value is rounded to
 292 | a subnormal number, and the underflow and inexact exceptions are raised if
 293 | the abstract input cannot be represented exactly as a subnormal single-
 294 | precision floating-point number.
 295 |     The input significand `zSig' has its binary point between bits 30
 296 | and 29, which is 7 bits to the left of the usual location.  This shifted
 297 | significand must be normalized or smaller.  If `zSig' is not normalized,
 298 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 299 | and it must not require rounding.  In the usual case that `zSig' is
 300 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 301 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 302 | Binary Floating-Point Arithmetic.
 303 *----------------------------------------------------------------------------*/
 304
 305 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 306 {
 307     int8 roundingMode;
 308     flag roundNearestEven;
 309     int8 roundIncrement, roundBits;
 310     flag isTiny;
 311
 312     roundingMode = STATUS(float_rounding_mode);
 313     roundNearestEven = ( roundingMode == float_round_nearest_even );
 314     roundIncrement = 0x40;
 315     if ( ! roundNearestEven ) {
 316         if ( roundingMode == float_round_to_zero ) {
 317             roundIncrement = 0;
 318         }
 319         else {
 320             roundIncrement = 0x7F;
 321             if ( zSign ) {
 322                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 323             }
 324             else {
 325                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 326             }
 327         }
 328     }
 329     roundBits = zSig & 0x7F;
 330     if ( 0xFD <= (bits16) zExp ) {
 331         if (    ( 0xFD < zExp )
 332              || (    ( zExp == 0xFD )
 333                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 334            ) {
 335             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 336             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 337         }
 338         if ( zExp < 0 ) {
 339             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 340             isTiny =
 341                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 342                 || ( zExp < -1 )
 343                 || ( zSig + roundIncrement < 0x80000000 );
 344             shift32RightJamming( zSig, - zExp, &zSig );
 345             zExp = 0;
 346             roundBits = zSig & 0x7F;
 347             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 348         }
 349     }
 350     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 351     zSig = ( zSig + roundIncrement )>>7;
 352     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 353     if ( zSig == 0 ) zExp = 0;
 354     return packFloat32( zSign, zExp, zSig );
 355
 356 }
 357
 358 /*----------------------------------------------------------------------------
 359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 360 | and significand `zSig', and returns the proper single-precision floating-
 361 | point value corresponding to the abstract input.  This routine is just like
 362 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 363 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 364 | floating-point exponent.
 365 *----------------------------------------------------------------------------*/
 366
 367 static float32
 368  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 369 {
 370     int8 shiftCount;
 371
 372     shiftCount = countLeadingZeros32( zSig ) - 1;
 373     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 374
 375 }
 376
 377 /*----------------------------------------------------------------------------
 378 | Returns the fraction bits of the double-precision floating-point value `a'.
 379 *----------------------------------------------------------------------------*/
 380
 381 INLINE bits64 extractFloat64Frac( float64 a )
 382 {
 383
 384     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 385
 386 }
 387
 388 /*----------------------------------------------------------------------------
 389 | Returns the exponent bits of the double-precision floating-point value `a'.
 390 *----------------------------------------------------------------------------*/
 391
 392 INLINE int16 extractFloat64Exp( float64 a )
 393 {
 394
 395     return ( float64_val(a)>>52 ) & 0x7FF;
 396
 397 }
 398
 399 /*----------------------------------------------------------------------------
 400 | Returns the sign bit of the double-precision floating-point value `a'.
 401 *----------------------------------------------------------------------------*/
 402
 403 INLINE flag extractFloat64Sign( float64 a )
 404 {
 405
 406     return float64_val(a)>>63;
 407
 408 }
 409
 410 /*----------------------------------------------------------------------------
 411 | If `a' is denormal and we are in flush-to-zero mode then set the
 412 | input-denormal exception and return zero. Otherwise just return the value.
 413 *----------------------------------------------------------------------------*/
 414 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 415 {
 416     if (STATUS(flush_inputs_to_zero)) {
 417         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 418             float_raise(float_flag_input_denormal STATUS_VAR);
 419             return make_float64(float64_val(a) & (1ULL << 63));
 420         }
 421     }
 422     return a;
 423 }
 424
 425 /*----------------------------------------------------------------------------
 426 | Normalizes the subnormal double-precision floating-point value represented
 427 | by the denormalized significand `aSig'.  The normalized exponent and
 428 | significand are stored at the locations pointed to by `zExpPtr' and
 429 | `zSigPtr', respectively.
 430 *----------------------------------------------------------------------------*/
 431
 432 static void
 433  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 434 {
 435     int8 shiftCount;
 436
 437     shiftCount = countLeadingZeros64( aSig ) - 11;
 438     *zSigPtr = aSig<<shiftCount;
 439     *zExpPtr = 1 - shiftCount;
 440
 441 }
 442
 443 /*----------------------------------------------------------------------------
 444 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 445 | double-precision floating-point value, returning the result.  After being
 446 | shifted into the proper positions, the three fields are simply added
 447 | together to form the result.  This means that any integer portion of `zSig'
 448 | will be added into the exponent.  Since a properly normalized significand
 449 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 450 | than the desired result exponent whenever `zSig' is a complete, normalized
 451 | significand.
 452 *----------------------------------------------------------------------------*/
 453
 454 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 455 {
 456
 457     return make_float64(
 458         ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
 459
 460 }
 461
 462 /*----------------------------------------------------------------------------
 463 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 464 | and significand `zSig', and returns the proper double-precision floating-
 465 | point value corresponding to the abstract input.  Ordinarily, the abstract
 466 | value is simply rounded and packed into the double-precision format, with
 467 | the inexact exception raised if the abstract input cannot be represented
 468 | exactly.  However, if the abstract value is too large, the overflow and
 469 | inexact exceptions are raised and an infinity or maximal finite value is
 470 | returned.  If the abstract value is too small, the input value is rounded
 471 | to a subnormal number, and the underflow and inexact exceptions are raised
 472 | if the abstract input cannot be represented exactly as a subnormal double-
 473 | precision floating-point number.
 474 |     The input significand `zSig' has its binary point between bits 62
 475 | and 61, which is 10 bits to the left of the usual location.  This shifted
 476 | significand must be normalized or smaller.  If `zSig' is not normalized,
 477 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 478 | and it must not require rounding.  In the usual case that `zSig' is
 479 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 480 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 481 | Binary Floating-Point Arithmetic.
 482 *----------------------------------------------------------------------------*/
 483
 484 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 485 {
 486     int8 roundingMode;
 487     flag roundNearestEven;
 488     int16 roundIncrement, roundBits;
 489     flag isTiny;
 490
 491     roundingMode = STATUS(float_rounding_mode);
 492     roundNearestEven = ( roundingMode == float_round_nearest_even );
 493     roundIncrement = 0x200;
 494     if ( ! roundNearestEven ) {
 495         if ( roundingMode == float_round_to_zero ) {
 496             roundIncrement = 0;
 497         }
 498         else {
 499             roundIncrement = 0x3FF;
 500             if ( zSign ) {
 501                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 502             }
 503             else {
 504                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 505             }
 506         }
 507     }
 508     roundBits = zSig & 0x3FF;
 509     if ( 0x7FD <= (bits16) zExp ) {
 510         if (    ( 0x7FD < zExp )
 511              || (    ( zExp == 0x7FD )
 512                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 513            ) {
 514             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 515             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 516         }
 517         if ( zExp < 0 ) {
 518             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 519             isTiny =
 520                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 521                 || ( zExp < -1 )
 522                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 523             shift64RightJamming( zSig, - zExp, &zSig );
 524             zExp = 0;
 525             roundBits = zSig & 0x3FF;
 526             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 527         }
 528     }
 529     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 530     zSig = ( zSig + roundIncrement )>>10;
 531     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 532     if ( zSig == 0 ) zExp = 0;
 533     return packFloat64( zSign, zExp, zSig );
 534
 535 }
 536
 537 /*----------------------------------------------------------------------------
 538 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 539 | and significand `zSig', and returns the proper double-precision floating-
 540 | point value corresponding to the abstract input.  This routine is just like
 541 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 542 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 543 | floating-point exponent.
 544 *----------------------------------------------------------------------------*/
 545
 546 static float64
 547  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 548 {
 549     int8 shiftCount;
 550
 551     shiftCount = countLeadingZeros64( zSig ) - 1;
 552     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 553
 554 }
 555
 556 #ifdef FLOATX80
 557
 558 /*----------------------------------------------------------------------------
 559 | Returns the fraction bits of the extended double-precision floating-point
 560 | value `a'.
 561 *----------------------------------------------------------------------------*/
 562
 563 INLINE bits64 extractFloatx80Frac( floatx80 a )
 564 {
 565
 566     return a.low;
 567
 568 }
 569
 570 /*----------------------------------------------------------------------------
 571 | Returns the exponent bits of the extended double-precision floating-point
 572 | value `a'.
 573 *----------------------------------------------------------------------------*/
 574
 575 INLINE int32 extractFloatx80Exp( floatx80 a )
 576 {
 577
 578     return a.high & 0x7FFF;
 579
 580 }
 581
 582 /*----------------------------------------------------------------------------
 583 | Returns the sign bit of the extended double-precision floating-point value
 584 | `a'.
 585 *----------------------------------------------------------------------------*/
 586
 587 INLINE flag extractFloatx80Sign( floatx80 a )
 588 {
 589
 590     return a.high>>15;
 591
 592 }
 593
 594 /*----------------------------------------------------------------------------
 595 | Normalizes the subnormal extended double-precision floating-point value
 596 | represented by the denormalized significand `aSig'.  The normalized exponent
 597 | and significand are stored at the locations pointed to by `zExpPtr' and
 598 | `zSigPtr', respectively.
 599 *----------------------------------------------------------------------------*/
 600
 601 static void
 602  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 603 {
 604     int8 shiftCount;
 605
 606     shiftCount = countLeadingZeros64( aSig );
 607     *zSigPtr = aSig<<shiftCount;
 608     *zExpPtr = 1 - shiftCount;
 609
 610 }
 611
 612 /*----------------------------------------------------------------------------
 613 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 614 | extended double-precision floating-point value, returning the result.
 615 *----------------------------------------------------------------------------*/
 616
 617 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 618 {
 619     floatx80 z;
 620
 621     z.low = zSig;
 622     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 623     return z;
 624
 625 }
 626
 627 /*----------------------------------------------------------------------------
 628 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 629 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 630 | and returns the proper extended double-precision floating-point value
 631 | corresponding to the abstract input.  Ordinarily, the abstract value is
 632 | rounded and packed into the extended double-precision format, with the
 633 | inexact exception raised if the abstract input cannot be represented
 634 | exactly.  However, if the abstract value is too large, the overflow and
 635 | inexact exceptions are raised and an infinity or maximal finite value is
 636 | returned.  If the abstract value is too small, the input value is rounded to
 637 | a subnormal number, and the underflow and inexact exceptions are raised if
 638 | the abstract input cannot be represented exactly as a subnormal extended
 639 | double-precision floating-point number.
 640 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 641 | number of bits as single or double precision, respectively.  Otherwise, the
 642 | result is rounded to the full precision of the extended double-precision
 643 | format.
 644 |     The input significand must be normalized or smaller.  If the input
 645 | significand is not normalized, `zExp' must be 0; in that case, the result
 646 | returned is a subnormal number, and it must not require rounding.  The
 647 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 648 | Floating-Point Arithmetic.
 649 *----------------------------------------------------------------------------*/
 650
 651 static floatx80
 652  roundAndPackFloatx80(
 653      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 654  STATUS_PARAM)
 655 {
 656     int8 roundingMode;
 657     flag roundNearestEven, increment, isTiny;
 658     int64 roundIncrement, roundMask, roundBits;
 659
 660     roundingMode = STATUS(float_rounding_mode);
 661     roundNearestEven = ( roundingMode == float_round_nearest_even );
 662     if ( roundingPrecision == 80 ) goto precision80;
 663     if ( roundingPrecision == 64 ) {
 664         roundIncrement = LIT64( 0x0000000000000400 );
 665         roundMask = LIT64( 0x00000000000007FF );
 666     }
 667     else if ( roundingPrecision == 32 ) {
 668         roundIncrement = LIT64( 0x0000008000000000 );
 669         roundMask = LIT64( 0x000000FFFFFFFFFF );
 670     }
 671     else {
 672         goto precision80;
 673     }
 674     zSig0 |= ( zSig1 != 0 );
 675     if ( ! roundNearestEven ) {
 676         if ( roundingMode == float_round_to_zero ) {
 677             roundIncrement = 0;
 678         }
 679         else {
 680             roundIncrement = roundMask;
 681             if ( zSign ) {
 682                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 683             }
 684             else {
 685                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 686             }
 687         }
 688     }
 689     roundBits = zSig0 & roundMask;
 690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 691         if (    ( 0x7FFE < zExp )
 692              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 693            ) {
 694             goto overflow;
 695         }
 696         if ( zExp <= 0 ) {
 697             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
 698             isTiny =
 699                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 700                 || ( zExp < 0 )
 701                 || ( zSig0 <= zSig0 + roundIncrement );
 702             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 703             zExp = 0;
 704             roundBits = zSig0 & roundMask;
 705             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 706             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 707             zSig0 += roundIncrement;
 708             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 709             roundIncrement = roundMask + 1;
 710             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 711                 roundMask |= roundIncrement;
 712             }
 713             zSig0 &= ~ roundMask;
 714             return packFloatx80( zSign, zExp, zSig0 );
 715         }
 716     }
 717     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 718     zSig0 += roundIncrement;
 719     if ( zSig0 < roundIncrement ) {
 720         ++zExp;
 721         zSig0 = LIT64( 0x8000000000000000 );
 722     }
 723     roundIncrement = roundMask + 1;
 724     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 725         roundMask |= roundIncrement;
 726     }
 727     zSig0 &= ~ roundMask;
 728     if ( zSig0 == 0 ) zExp = 0;
 729     return packFloatx80( zSign, zExp, zSig0 );
 730  precision80:
 731     increment = ( (sbits64) zSig1 < 0 );
 732     if ( ! roundNearestEven ) {
 733         if ( roundingMode == float_round_to_zero ) {
 734             increment = 0;
 735         }
 736         else {
 737             if ( zSign ) {
 738                 increment = ( roundingMode == float_round_down ) && zSig1;
 739             }
 740             else {
 741                 increment = ( roundingMode == float_round_up ) && zSig1;
 742             }
 743         }
 744     }
 745     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 746         if (    ( 0x7FFE < zExp )
 747              || (    ( zExp == 0x7FFE )
 748                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 749                   && increment
 750                 )
 751            ) {
 752             roundMask = 0;
 753  overflow:
 754             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 755             if (    ( roundingMode == float_round_to_zero )
 756                  || ( zSign && ( roundingMode == float_round_up ) )
 757                  || ( ! zSign && ( roundingMode == float_round_down ) )
 758                ) {
 759                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 760             }
 761             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 762         }
 763         if ( zExp <= 0 ) {
 764             isTiny =
 765                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 766                 || ( zExp < 0 )
 767                 || ! increment
 768                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 769             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 770             zExp = 0;
 771             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 772             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 773             if ( roundNearestEven ) {
 774                 increment = ( (sbits64) zSig1 < 0 );
 775             }
 776             else {
 777                 if ( zSign ) {
 778                     increment = ( roundingMode == float_round_down ) && zSig1;
 779                 }
 780                 else {
 781                     increment = ( roundingMode == float_round_up ) && zSig1;
 782                 }
 783             }
 784             if ( increment ) {
 785                 ++zSig0;
 786                 zSig0 &=
 787                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 788                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 789             }
 790             return packFloatx80( zSign, zExp, zSig0 );
 791         }
 792     }
 793     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 794     if ( increment ) {
 795         ++zSig0;
 796         if ( zSig0 == 0 ) {
 797             ++zExp;
 798             zSig0 = LIT64( 0x8000000000000000 );
 799         }
 800         else {
 801             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 802         }
 803     }
 804     else {
 805         if ( zSig0 == 0 ) zExp = 0;
 806     }
 807     return packFloatx80( zSign, zExp, zSig0 );
 808
 809 }
 810
 811 /*----------------------------------------------------------------------------
 812 | Takes an abstract floating-point value having sign `zSign', exponent
 813 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 814 | and returns the proper extended double-precision floating-point value
 815 | corresponding to the abstract input.  This routine is just like
 816 | `roundAndPackFloatx80' except that the input significand does not have to be
 817 | normalized.
 818 *----------------------------------------------------------------------------*/
 819
 820 static floatx80
 821  normalizeRoundAndPackFloatx80(
 822      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 823  STATUS_PARAM)
 824 {
 825     int8 shiftCount;
 826
 827     if ( zSig0 == 0 ) {
 828         zSig0 = zSig1;
 829         zSig1 = 0;
 830         zExp -= 64;
 831     }
 832     shiftCount = countLeadingZeros64( zSig0 );
 833     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 834     zExp -= shiftCount;
 835     return
 836         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 837
 838 }
 839
 840 #endif
 841
 842 #ifdef FLOAT128
 843
 844 /*----------------------------------------------------------------------------
 845 | Returns the least-significant 64 fraction bits of the quadruple-precision
 846 | floating-point value `a'.
 847 *----------------------------------------------------------------------------*/
 848
 849 INLINE bits64 extractFloat128Frac1( float128 a )
 850 {
 851
 852     return a.low;
 853
 854 }
 855
 856 /*----------------------------------------------------------------------------
 857 | Returns the most-significant 48 fraction bits of the quadruple-precision
 858 | floating-point value `a'.
 859 *----------------------------------------------------------------------------*/
 860
 861 INLINE bits64 extractFloat128Frac0( float128 a )
 862 {
 863
 864     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 865
 866 }
 867
 868 /*----------------------------------------------------------------------------
 869 | Returns the exponent bits of the quadruple-precision floating-point value
 870 | `a'.
 871 *----------------------------------------------------------------------------*/
 872
 873 INLINE int32 extractFloat128Exp( float128 a )
 874 {
 875
 876     return ( a.high>>48 ) & 0x7FFF;
 877
 878 }
 879
 880 /*----------------------------------------------------------------------------
 881 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 882 *----------------------------------------------------------------------------*/
 883
 884 INLINE flag extractFloat128Sign( float128 a )
 885 {
 886
 887     return a.high>>63;
 888
 889 }
 890
 891 /*----------------------------------------------------------------------------
 892 | Normalizes the subnormal quadruple-precision floating-point value
 893 | represented by the denormalized significand formed by the concatenation of
 894 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 895 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 896 | significand are stored at the location pointed to by `zSig0Ptr', and the
 897 | least significant 64 bits of the normalized significand are stored at the
 898 | location pointed to by `zSig1Ptr'.
 899 *----------------------------------------------------------------------------*/
 900
 901 static void
 902  normalizeFloat128Subnormal(
 903      bits64 aSig0,
 904      bits64 aSig1,
 905      int32 *zExpPtr,
 906      bits64 *zSig0Ptr,
 907      bits64 *zSig1Ptr
 908  )
 909 {
 910     int8 shiftCount;
 911
 912     if ( aSig0 == 0 ) {
 913         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 914         if ( shiftCount < 0 ) {
 915             *zSig0Ptr = aSig1>>( - shiftCount );
 916             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 917         }
 918         else {
 919             *zSig0Ptr = aSig1<<shiftCount;
 920             *zSig1Ptr = 0;
 921         }
 922         *zExpPtr = - shiftCount - 63;
 923     }
 924     else {
 925         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 926         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 927         *zExpPtr = 1 - shiftCount;
 928     }
 929
 930 }
 931
 932 /*----------------------------------------------------------------------------
 933 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 934 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 935 | floating-point value, returning the result.  After being shifted into the
 936 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 937 | added together to form the most significant 32 bits of the result.  This
 938 | means that any integer portion of `zSig0' will be added into the exponent.
 939 | Since a properly normalized significand will have an integer portion equal
 940 | to 1, the `zExp' input should be 1 less than the desired result exponent
 941 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 942 | significand.
 943 *----------------------------------------------------------------------------*/
 944
 945 INLINE float128
 946  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 947 {
 948     float128 z;
 949
 950     z.low = zSig1;
 951     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 952     return z;
 953
 954 }
 955
 956 /*----------------------------------------------------------------------------
 957 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 958 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 959 | and `zSig2', and returns the proper quadruple-precision floating-point value
 960 | corresponding to the abstract input.  Ordinarily, the abstract value is
 961 | simply rounded and packed into the quadruple-precision format, with the
 962 | inexact exception raised if the abstract input cannot be represented
 963 | exactly.  However, if the abstract value is too large, the overflow and
 964 | inexact exceptions are raised and an infinity or maximal finite value is
 965 | returned.  If the abstract value is too small, the input value is rounded to
 966 | a subnormal number, and the underflow and inexact exceptions are raised if
 967 | the abstract input cannot be represented exactly as a subnormal quadruple-
 968 | precision floating-point number.
 969 |     The input significand must be normalized or smaller.  If the input
 970 | significand is not normalized, `zExp' must be 0; in that case, the result
 971 | returned is a subnormal number, and it must not require rounding.  In the
 972 | usual case that the input significand is normalized, `zExp' must be 1 less
 973 | than the ``true'' floating-point exponent.  The handling of underflow and
 974 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 975 *----------------------------------------------------------------------------*/
 976
 977 static float128
 978  roundAndPackFloat128(
 979      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 980 {
 981     int8 roundingMode;
 982     flag roundNearestEven, increment, isTiny;
 983
 984     roundingMode = STATUS(float_rounding_mode);
 985     roundNearestEven = ( roundingMode == float_round_nearest_even );
 986     increment = ( (sbits64) zSig2 < 0 );
 987     if ( ! roundNearestEven ) {
 988         if ( roundingMode == float_round_to_zero ) {
 989             increment = 0;
 990         }
 991         else {
 992             if ( zSign ) {
 993                 increment = ( roundingMode == float_round_down ) && zSig2;
 994             }
 995             else {
 996                 increment = ( roundingMode == float_round_up ) && zSig2;
 997             }
 998         }
 999     }
1000     if ( 0x7FFD <= (bits32) zExp ) {
1001         if (    ( 0x7FFD < zExp )
1002              || (    ( zExp == 0x7FFD )
1003                   && eq128(
1004                          LIT64( 0x0001FFFFFFFFFFFF ),
1005                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1006                          zSig0,
1007                          zSig1
1008                      )
1009                   && increment
1010                 )
1011            ) {
1012             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1013             if (    ( roundingMode == float_round_to_zero )
1014                  || ( zSign && ( roundingMode == float_round_up ) )
1015                  || ( ! zSign && ( roundingMode == float_round_down ) )
1016                ) {
1017                 return
1018                     packFloat128(
1019                         zSign,
1020                         0x7FFE,
1021                         LIT64( 0x0000FFFFFFFFFFFF ),
1022                         LIT64( 0xFFFFFFFFFFFFFFFF )
1023                     );
1024             }
1025             return packFloat128( zSign, 0x7FFF, 0, 0 );
1026         }
1027         if ( zExp < 0 ) {
1028             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1029             isTiny =
1030                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1031                 || ( zExp < -1 )
1032                 || ! increment
1033                 || lt128(
1034                        zSig0,
1035                        zSig1,
1036                        LIT64( 0x0001FFFFFFFFFFFF ),
1037                        LIT64( 0xFFFFFFFFFFFFFFFF )
1038                    );
1039             shift128ExtraRightJamming(
1040                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1041             zExp = 0;
1042             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1043             if ( roundNearestEven ) {
1044                 increment = ( (sbits64) zSig2 < 0 );
1045             }
1046             else {
1047                 if ( zSign ) {
1048                     increment = ( roundingMode == float_round_down ) && zSig2;
1049                 }
1050                 else {
1051                     increment = ( roundingMode == float_round_up ) && zSig2;
1052                 }
1053             }
1054         }
1055     }
1056     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1057     if ( increment ) {
1058         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1059         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1060     }
1061     else {
1062         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1063     }
1064     return packFloat128( zSign, zExp, zSig0, zSig1 );
1065
1066 }
1067
1068 /*----------------------------------------------------------------------------
1069 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1070 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1071 | returns the proper quadruple-precision floating-point value corresponding
1072 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1073 | except that the input significand has fewer bits and does not have to be
1074 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1075 | point exponent.
1076 *----------------------------------------------------------------------------*/
1077
1078 static float128
1079  normalizeRoundAndPackFloat128(
1080      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1081 {
1082     int8 shiftCount;
1083     bits64 zSig2;
1084
1085     if ( zSig0 == 0 ) {
1086         zSig0 = zSig1;
1087         zSig1 = 0;
1088         zExp -= 64;
1089     }
1090     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1091     if ( 0 <= shiftCount ) {
1092         zSig2 = 0;
1093         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1094     }
1095     else {
1096         shift128ExtraRightJamming(
1097             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1098     }
1099     zExp -= shiftCount;
1100     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1101
1102 }
1103
1104 #endif
1105
1106 /*----------------------------------------------------------------------------
1107 | Returns the result of converting the 32-bit two's complement integer `a'
1108 | to the single-precision floating-point format.  The conversion is performed
1109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 *----------------------------------------------------------------------------*/
1111
1112 float32 int32_to_float32( int32 a STATUS_PARAM )
1113 {
1114     flag zSign;
1115
1116     if ( a == 0 ) return float32_zero;
1117     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1118     zSign = ( a < 0 );
1119     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1120
1121 }
1122
1123 /*----------------------------------------------------------------------------
1124 | Returns the result of converting the 32-bit two's complement integer `a'
1125 | to the double-precision floating-point format.  The conversion is performed
1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1128
1129 float64 int32_to_float64( int32 a STATUS_PARAM )
1130 {
1131     flag zSign;
1132     uint32 absA;
1133     int8 shiftCount;
1134     bits64 zSig;
1135
1136     if ( a == 0 ) return float64_zero;
1137     zSign = ( a < 0 );
1138     absA = zSign ? - a : a;
1139     shiftCount = countLeadingZeros32( absA ) + 21;
1140     zSig = absA;
1141     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1142
1143 }
1144
1145 #ifdef FLOATX80
1146
1147 /*----------------------------------------------------------------------------
1148 | Returns the result of converting the 32-bit two's complement integer `a'
1149 | to the extended double-precision floating-point format.  The conversion
1150 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1151 | Arithmetic.
1152 *----------------------------------------------------------------------------*/
1153
1154 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1155 {
1156     flag zSign;
1157     uint32 absA;
1158     int8 shiftCount;
1159     bits64 zSig;
1160
1161     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1162     zSign = ( a < 0 );
1163     absA = zSign ? - a : a;
1164     shiftCount = countLeadingZeros32( absA ) + 32;
1165     zSig = absA;
1166     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1167
1168 }
1169
1170 #endif
1171
1172 #ifdef FLOAT128
1173
1174 /*----------------------------------------------------------------------------
1175 | Returns the result of converting the 32-bit two's complement integer `a' to
1176 | the quadruple-precision floating-point format.  The conversion is performed
1177 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1178 *----------------------------------------------------------------------------*/
1179
1180 float128 int32_to_float128( int32 a STATUS_PARAM )
1181 {
1182     flag zSign;
1183     uint32 absA;
1184     int8 shiftCount;
1185     bits64 zSig0;
1186
1187     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1188     zSign = ( a < 0 );
1189     absA = zSign ? - a : a;
1190     shiftCount = countLeadingZeros32( absA ) + 17;
1191     zSig0 = absA;
1192     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1193
1194 }
1195
1196 #endif
1197
1198 /*----------------------------------------------------------------------------
1199 | Returns the result of converting the 64-bit two's complement integer `a'
1200 | to the single-precision floating-point format.  The conversion is performed
1201 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1202 *----------------------------------------------------------------------------*/
1203
1204 float32 int64_to_float32( int64 a STATUS_PARAM )
1205 {
1206     flag zSign;
1207     uint64 absA;
1208     int8 shiftCount;
1209
1210     if ( a == 0 ) return float32_zero;
1211     zSign = ( a < 0 );
1212     absA = zSign ? - a : a;
1213     shiftCount = countLeadingZeros64( absA ) - 40;
1214     if ( 0 <= shiftCount ) {
1215         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1216     }
1217     else {
1218         shiftCount += 7;
1219         if ( shiftCount < 0 ) {
1220             shift64RightJamming( absA, - shiftCount, &absA );
1221         }
1222         else {
1223             absA <<= shiftCount;
1224         }
1225         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1226     }
1227
1228 }
1229
1230 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1231 {
1232     int8 shiftCount;
1233
1234     if ( a == 0 ) return float32_zero;
1235     shiftCount = countLeadingZeros64( a ) - 40;
1236     if ( 0 <= shiftCount ) {
1237         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1238     }
1239     else {
1240         shiftCount += 7;
1241         if ( shiftCount < 0 ) {
1242             shift64RightJamming( a, - shiftCount, &a );
1243         }
1244         else {
1245             a <<= shiftCount;
1246         }
1247         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1248     }
1249 }
1250
1251 /*----------------------------------------------------------------------------
1252 | Returns the result of converting the 64-bit two's complement integer `a'
1253 | to the double-precision floating-point format.  The conversion is performed
1254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255 *----------------------------------------------------------------------------*/
1256
1257 float64 int64_to_float64( int64 a STATUS_PARAM )
1258 {
1259     flag zSign;
1260
1261     if ( a == 0 ) return float64_zero;
1262     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1263         return packFloat64( 1, 0x43E, 0 );
1264     }
1265     zSign = ( a < 0 );
1266     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1267
1268 }
1269
1270 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1271 {
1272     if ( a == 0 ) return float64_zero;
1273     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1274
1275 }
1276
1277 #ifdef FLOATX80
1278
1279 /*----------------------------------------------------------------------------
1280 | Returns the result of converting the 64-bit two's complement integer `a'
1281 | to the extended double-precision floating-point format.  The conversion
1282 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1283 | Arithmetic.
1284 *----------------------------------------------------------------------------*/
1285
1286 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1287 {
1288     flag zSign;
1289     uint64 absA;
1290     int8 shiftCount;
1291
1292     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1293     zSign = ( a < 0 );
1294     absA = zSign ? - a : a;
1295     shiftCount = countLeadingZeros64( absA );
1296     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1297
1298 }
1299
1300 #endif
1301
1302 #ifdef FLOAT128
1303
1304 /*----------------------------------------------------------------------------
1305 | Returns the result of converting the 64-bit two's complement integer `a' to
1306 | the quadruple-precision floating-point format.  The conversion is performed
1307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1308 *----------------------------------------------------------------------------*/
1309
1310 float128 int64_to_float128( int64 a STATUS_PARAM )
1311 {
1312     flag zSign;
1313     uint64 absA;
1314     int8 shiftCount;
1315     int32 zExp;
1316     bits64 zSig0, zSig1;
1317
1318     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1319     zSign = ( a < 0 );
1320     absA = zSign ? - a : a;
1321     shiftCount = countLeadingZeros64( absA ) + 49;
1322     zExp = 0x406E - shiftCount;
1323     if ( 64 <= shiftCount ) {
1324         zSig1 = 0;
1325         zSig0 = absA;
1326         shiftCount -= 64;
1327     }
1328     else {
1329         zSig1 = absA;
1330         zSig0 = 0;
1331     }
1332     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1333     return packFloat128( zSign, zExp, zSig0, zSig1 );
1334
1335 }
1336
1337 #endif
1338
1339 /*----------------------------------------------------------------------------
1340 | Returns the result of converting the single-precision floating-point value
1341 | `a' to the 32-bit two's complement integer format.  The conversion is
1342 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1343 | Arithmetic---which means in particular that the conversion is rounded
1344 | according to the current rounding mode.  If `a' is a NaN, the largest
1345 | positive integer is returned.  Otherwise, if the conversion overflows, the
1346 | largest integer with the same sign as `a' is returned.
1347 *----------------------------------------------------------------------------*/
1348
1349 int32 float32_to_int32( float32 a STATUS_PARAM )
1350 {
1351     flag aSign;
1352     int16 aExp, shiftCount;
1353     bits32 aSig;
1354     bits64 aSig64;
1355
1356     a = float32_squash_input_denormal(a STATUS_VAR);
1357     aSig = extractFloat32Frac( a );
1358     aExp = extractFloat32Exp( a );
1359     aSign = extractFloat32Sign( a );
1360     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1361     if ( aExp ) aSig |= 0x00800000;
1362     shiftCount = 0xAF - aExp;
1363     aSig64 = aSig;
1364     aSig64 <<= 32;
1365     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1366     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1367
1368 }
1369
1370 /*----------------------------------------------------------------------------
1371 | Returns the result of converting the single-precision floating-point value
1372 | `a' to the 32-bit two's complement integer format.  The conversion is
1373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1374 | Arithmetic, except that the conversion is always rounded toward zero.
1375 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1376 | the conversion overflows, the largest integer with the same sign as `a' is
1377 | returned.
1378 *----------------------------------------------------------------------------*/
1379
1380 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1381 {
1382     flag aSign;
1383     int16 aExp, shiftCount;
1384     bits32 aSig;
1385     int32 z;
1386     a = float32_squash_input_denormal(a STATUS_VAR);
1387
1388     aSig = extractFloat32Frac( a );
1389     aExp = extractFloat32Exp( a );
1390     aSign = extractFloat32Sign( a );
1391     shiftCount = aExp - 0x9E;
1392     if ( 0 <= shiftCount ) {
1393         if ( float32_val(a) != 0xCF000000 ) {
1394             float_raise( float_flag_invalid STATUS_VAR);
1395             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1396         }
1397         return (sbits32) 0x80000000;
1398     }
1399     else if ( aExp <= 0x7E ) {
1400         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1401         return 0;
1402     }
1403     aSig = ( aSig | 0x00800000 )<<8;
1404     z = aSig>>( - shiftCount );
1405     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1406         STATUS(float_exception_flags) |= float_flag_inexact;
1407     }
1408     if ( aSign ) z = - z;
1409     return z;
1410
1411 }
1412
1413 /*----------------------------------------------------------------------------
1414 | Returns the result of converting the single-precision floating-point value
1415 | `a' to the 16-bit two's complement integer format.  The conversion is
1416 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 | Arithmetic, except that the conversion is always rounded toward zero.
1418 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1419 | the conversion overflows, the largest integer with the same sign as `a' is
1420 | returned.
1421 *----------------------------------------------------------------------------*/
1422
1423 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1424 {
1425     flag aSign;
1426     int16 aExp, shiftCount;
1427     bits32 aSig;
1428     int32 z;
1429
1430     aSig = extractFloat32Frac( a );
1431     aExp = extractFloat32Exp( a );
1432     aSign = extractFloat32Sign( a );
1433     shiftCount = aExp - 0x8E;
1434     if ( 0 <= shiftCount ) {
1435         if ( float32_val(a) != 0xC7000000 ) {
1436             float_raise( float_flag_invalid STATUS_VAR);
1437             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1438                 return 0x7FFF;
1439             }
1440         }
1441         return (sbits32) 0xffff8000;
1442     }
1443     else if ( aExp <= 0x7E ) {
1444         if ( aExp | aSig ) {
1445             STATUS(float_exception_flags) |= float_flag_inexact;
1446         }
1447         return 0;
1448     }
1449     shiftCount -= 0x10;
1450     aSig = ( aSig | 0x00800000 )<<8;
1451     z = aSig>>( - shiftCount );
1452     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1453         STATUS(float_exception_flags) |= float_flag_inexact;
1454     }
1455     if ( aSign ) {
1456         z = - z;
1457     }
1458     return z;
1459
1460 }
1461
1462 /*----------------------------------------------------------------------------
1463 | Returns the result of converting the single-precision floating-point value
1464 | `a' to the 64-bit two's complement integer format.  The conversion is
1465 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1466 | Arithmetic---which means in particular that the conversion is rounded
1467 | according to the current rounding mode.  If `a' is a NaN, the largest
1468 | positive integer is returned.  Otherwise, if the conversion overflows, the
1469 | largest integer with the same sign as `a' is returned.
1470 *----------------------------------------------------------------------------*/
1471
1472 int64 float32_to_int64( float32 a STATUS_PARAM )
1473 {
1474     flag aSign;
1475     int16 aExp, shiftCount;
1476     bits32 aSig;
1477     bits64 aSig64, aSigExtra;
1478     a = float32_squash_input_denormal(a STATUS_VAR);
1479
1480     aSig = extractFloat32Frac( a );
1481     aExp = extractFloat32Exp( a );
1482     aSign = extractFloat32Sign( a );
1483     shiftCount = 0xBE - aExp;
1484     if ( shiftCount < 0 ) {
1485         float_raise( float_flag_invalid STATUS_VAR);
1486         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1487             return LIT64( 0x7FFFFFFFFFFFFFFF );
1488         }
1489         return (sbits64) LIT64( 0x8000000000000000 );
1490     }
1491     if ( aExp ) aSig |= 0x00800000;
1492     aSig64 = aSig;
1493     aSig64 <<= 40;
1494     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1495     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1496
1497 }
1498
1499 /*----------------------------------------------------------------------------
1500 | Returns the result of converting the single-precision floating-point value
1501 | `a' to the 64-bit two's complement integer format.  The conversion is
1502 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1503 | Arithmetic, except that the conversion is always rounded toward zero.  If
1504 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1505 | conversion overflows, the largest integer with the same sign as `a' is
1506 | returned.
1507 *----------------------------------------------------------------------------*/
1508
1509 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1510 {
1511     flag aSign;
1512     int16 aExp, shiftCount;
1513     bits32 aSig;
1514     bits64 aSig64;
1515     int64 z;
1516     a = float32_squash_input_denormal(a STATUS_VAR);
1517
1518     aSig = extractFloat32Frac( a );
1519     aExp = extractFloat32Exp( a );
1520     aSign = extractFloat32Sign( a );
1521     shiftCount = aExp - 0xBE;
1522     if ( 0 <= shiftCount ) {
1523         if ( float32_val(a) != 0xDF000000 ) {
1524             float_raise( float_flag_invalid STATUS_VAR);
1525             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1526                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1527             }
1528         }
1529         return (sbits64) LIT64( 0x8000000000000000 );
1530     }
1531     else if ( aExp <= 0x7E ) {
1532         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1533         return 0;
1534     }
1535     aSig64 = aSig | 0x00800000;
1536     aSig64 <<= 40;
1537     z = aSig64>>( - shiftCount );
1538     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1539         STATUS(float_exception_flags) |= float_flag_inexact;
1540     }
1541     if ( aSign ) z = - z;
1542     return z;
1543
1544 }
1545
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the double-precision floating-point format.  The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1550 | Arithmetic.
1551 *----------------------------------------------------------------------------*/
1552
1553 float64 float32_to_float64( float32 a STATUS_PARAM )
1554 {
1555     flag aSign;
1556     int16 aExp;
1557     bits32 aSig;
1558     a = float32_squash_input_denormal(a STATUS_VAR);
1559
1560     aSig = extractFloat32Frac( a );
1561     aExp = extractFloat32Exp( a );
1562     aSign = extractFloat32Sign( a );
1563     if ( aExp == 0xFF ) {
1564         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1565         return packFloat64( aSign, 0x7FF, 0 );
1566     }
1567     if ( aExp == 0 ) {
1568         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1569         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1570         --aExp;
1571     }
1572     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1573
1574 }
1575
1576 #ifdef FLOATX80
1577
1578 /*----------------------------------------------------------------------------
1579 | Returns the result of converting the single-precision floating-point value
1580 | `a' to the extended double-precision floating-point format.  The conversion
1581 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1582 | Arithmetic.
1583 *----------------------------------------------------------------------------*/
1584
1585 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1586 {
1587     flag aSign;
1588     int16 aExp;
1589     bits32 aSig;
1590
1591     a = float32_squash_input_denormal(a STATUS_VAR);
1592     aSig = extractFloat32Frac( a );
1593     aExp = extractFloat32Exp( a );
1594     aSign = extractFloat32Sign( a );
1595     if ( aExp == 0xFF ) {
1596         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1597         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1598     }
1599     if ( aExp == 0 ) {
1600         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1601         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1602     }
1603     aSig |= 0x00800000;
1604     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1605
1606 }
1607
1608 #endif
1609
1610 #ifdef FLOAT128
1611
1612 /*----------------------------------------------------------------------------
1613 | Returns the result of converting the single-precision floating-point value
1614 | `a' to the double-precision floating-point format.  The conversion is
1615 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1616 | Arithmetic.
1617 *----------------------------------------------------------------------------*/
1618
1619 float128 float32_to_float128( float32 a STATUS_PARAM )
1620 {
1621     flag aSign;
1622     int16 aExp;
1623     bits32 aSig;
1624
1625     a = float32_squash_input_denormal(a STATUS_VAR);
1626     aSig = extractFloat32Frac( a );
1627     aExp = extractFloat32Exp( a );
1628     aSign = extractFloat32Sign( a );
1629     if ( aExp == 0xFF ) {
1630         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1631         return packFloat128( aSign, 0x7FFF, 0, 0 );
1632     }
1633     if ( aExp == 0 ) {
1634         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1635         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1636         --aExp;
1637     }
1638     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1639
1640 }
1641
1642 #endif
1643
1644 /*----------------------------------------------------------------------------
1645 | Rounds the single-precision floating-point value `a' to an integer, and
1646 | returns the result as a single-precision floating-point value.  The
1647 | operation is performed according to the IEC/IEEE Standard for Binary
1648 | Floating-Point Arithmetic.
1649 *----------------------------------------------------------------------------*/
1650
1651 float32 float32_round_to_int( float32 a STATUS_PARAM)
1652 {
1653     flag aSign;
1654     int16 aExp;
1655     bits32 lastBitMask, roundBitsMask;
1656     int8 roundingMode;
1657     bits32 z;
1658     a = float32_squash_input_denormal(a STATUS_VAR);
1659
1660     aExp = extractFloat32Exp( a );
1661     if ( 0x96 <= aExp ) {
1662         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1663             return propagateFloat32NaN( a, a STATUS_VAR );
1664         }
1665         return a;
1666     }
1667     if ( aExp <= 0x7E ) {
1668         if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1669         STATUS(float_exception_flags) |= float_flag_inexact;
1670         aSign = extractFloat32Sign( a );
1671         switch ( STATUS(float_rounding_mode) ) {
1672          case float_round_nearest_even:
1673             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1674                 return packFloat32( aSign, 0x7F, 0 );
1675             }
1676             break;
1677          case float_round_down:
1678             return make_float32(aSign ? 0xBF800000 : 0);
1679          case float_round_up:
1680             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1681         }
1682         return packFloat32( aSign, 0, 0 );
1683     }
1684     lastBitMask = 1;
1685     lastBitMask <<= 0x96 - aExp;
1686     roundBitsMask = lastBitMask - 1;
1687     z = float32_val(a);
1688     roundingMode = STATUS(float_rounding_mode);
1689     if ( roundingMode == float_round_nearest_even ) {
1690         z += lastBitMask>>1;
1691         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1692     }
1693     else if ( roundingMode != float_round_to_zero ) {
1694         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1695             z += roundBitsMask;
1696         }
1697     }
1698     z &= ~ roundBitsMask;
1699     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1700     return make_float32(z);
1701
1702 }
1703
1704 /*----------------------------------------------------------------------------
1705 | Returns the result of adding the absolute values of the single-precision
1706 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1707 | before being returned.  `zSign' is ignored if the result is a NaN.
1708 | The addition is performed according to the IEC/IEEE Standard for Binary
1709 | Floating-Point Arithmetic.
1710 *----------------------------------------------------------------------------*/
1711
1712 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1713 {
1714     int16 aExp, bExp, zExp;
1715     bits32 aSig, bSig, zSig;
1716     int16 expDiff;
1717
1718     aSig = extractFloat32Frac( a );
1719     aExp = extractFloat32Exp( a );
1720     bSig = extractFloat32Frac( b );
1721     bExp = extractFloat32Exp( b );
1722     expDiff = aExp - bExp;
1723     aSig <<= 6;
1724     bSig <<= 6;
1725     if ( 0 < expDiff ) {
1726         if ( aExp == 0xFF ) {
1727             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1728             return a;
1729         }
1730         if ( bExp == 0 ) {
1731             --expDiff;
1732         }
1733         else {
1734             bSig |= 0x20000000;
1735         }
1736         shift32RightJamming( bSig, expDiff, &bSig );
1737         zExp = aExp;
1738     }
1739     else if ( expDiff < 0 ) {
1740         if ( bExp == 0xFF ) {
1741             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1742             return packFloat32( zSign, 0xFF, 0 );
1743         }
1744         if ( aExp == 0 ) {
1745             ++expDiff;
1746         }
1747         else {
1748             aSig |= 0x20000000;
1749         }
1750         shift32RightJamming( aSig, - expDiff, &aSig );
1751         zExp = bExp;
1752     }
1753     else {
1754         if ( aExp == 0xFF ) {
1755             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1756             return a;
1757         }
1758         if ( aExp == 0 ) {
1759             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1760             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1761         }
1762         zSig = 0x40000000 + aSig + bSig;
1763         zExp = aExp;
1764         goto roundAndPack;
1765     }
1766     aSig |= 0x20000000;
1767     zSig = ( aSig + bSig )<<1;
1768     --zExp;
1769     if ( (sbits32) zSig < 0 ) {
1770         zSig = aSig + bSig;
1771         ++zExp;
1772     }
1773  roundAndPack:
1774     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1775
1776 }
1777
1778 /*----------------------------------------------------------------------------
1779 | Returns the result of subtracting the absolute values of the single-
1780 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1781 | difference is negated before being returned.  `zSign' is ignored if the
1782 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1783 | Standard for Binary Floating-Point Arithmetic.
1784 *----------------------------------------------------------------------------*/
1785
1786 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1787 {
1788     int16 aExp, bExp, zExp;
1789     bits32 aSig, bSig, zSig;
1790     int16 expDiff;
1791
1792     aSig = extractFloat32Frac( a );
1793     aExp = extractFloat32Exp( a );
1794     bSig = extractFloat32Frac( b );
1795     bExp = extractFloat32Exp( b );
1796     expDiff = aExp - bExp;
1797     aSig <<= 7;
1798     bSig <<= 7;
1799     if ( 0 < expDiff ) goto aExpBigger;
1800     if ( expDiff < 0 ) goto bExpBigger;
1801     if ( aExp == 0xFF ) {
1802         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1803         float_raise( float_flag_invalid STATUS_VAR);
1804         return float32_default_nan;
1805     }
1806     if ( aExp == 0 ) {
1807         aExp = 1;
1808         bExp = 1;
1809     }
1810     if ( bSig < aSig ) goto aBigger;
1811     if ( aSig < bSig ) goto bBigger;
1812     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1813  bExpBigger:
1814     if ( bExp == 0xFF ) {
1815         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816         return packFloat32( zSign ^ 1, 0xFF, 0 );
1817     }
1818     if ( aExp == 0 ) {
1819         ++expDiff;
1820     }
1821     else {
1822         aSig |= 0x40000000;
1823     }
1824     shift32RightJamming( aSig, - expDiff, &aSig );
1825     bSig |= 0x40000000;
1826  bBigger:
1827     zSig = bSig - aSig;
1828     zExp = bExp;
1829     zSign ^= 1;
1830     goto normalizeRoundAndPack;
1831  aExpBigger:
1832     if ( aExp == 0xFF ) {
1833         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834         return a;
1835     }
1836     if ( bExp == 0 ) {
1837         --expDiff;
1838     }
1839     else {
1840         bSig |= 0x40000000;
1841     }
1842     shift32RightJamming( bSig, expDiff, &bSig );
1843     aSig |= 0x40000000;
1844  aBigger:
1845     zSig = aSig - bSig;
1846     zExp = aExp;
1847  normalizeRoundAndPack:
1848     --zExp;
1849     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1850
1851 }
1852
1853 /*----------------------------------------------------------------------------
1854 | Returns the result of adding the single-precision floating-point values `a'
1855 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1856 | Binary Floating-Point Arithmetic.
1857 *----------------------------------------------------------------------------*/
1858
1859 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1860 {
1861     flag aSign, bSign;
1862     a = float32_squash_input_denormal(a STATUS_VAR);
1863     b = float32_squash_input_denormal(b STATUS_VAR);
1864
1865     aSign = extractFloat32Sign( a );
1866     bSign = extractFloat32Sign( b );
1867     if ( aSign == bSign ) {
1868         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1869     }
1870     else {
1871         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1872     }
1873
1874 }
1875
1876 /*----------------------------------------------------------------------------
1877 | Returns the result of subtracting the single-precision floating-point values
1878 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1879 | for Binary Floating-Point Arithmetic.
1880 *----------------------------------------------------------------------------*/
1881
1882 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1883 {
1884     flag aSign, bSign;
1885     a = float32_squash_input_denormal(a STATUS_VAR);
1886     b = float32_squash_input_denormal(b STATUS_VAR);
1887
1888     aSign = extractFloat32Sign( a );
1889     bSign = extractFloat32Sign( b );
1890     if ( aSign == bSign ) {
1891         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1892     }
1893     else {
1894         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1895     }
1896
1897 }
1898
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of multiplying the single-precision floating-point values
1901 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1902 | for Binary Floating-Point Arithmetic.
1903 *----------------------------------------------------------------------------*/
1904
1905 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1906 {
1907     flag aSign, bSign, zSign;
1908     int16 aExp, bExp, zExp;
1909     bits32 aSig, bSig;
1910     bits64 zSig64;
1911     bits32 zSig;
1912
1913     a = float32_squash_input_denormal(a STATUS_VAR);
1914     b = float32_squash_input_denormal(b STATUS_VAR);
1915
1916     aSig = extractFloat32Frac( a );
1917     aExp = extractFloat32Exp( a );
1918     aSign = extractFloat32Sign( a );
1919     bSig = extractFloat32Frac( b );
1920     bExp = extractFloat32Exp( b );
1921     bSign = extractFloat32Sign( b );
1922     zSign = aSign ^ bSign;
1923     if ( aExp == 0xFF ) {
1924         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1925             return propagateFloat32NaN( a, b STATUS_VAR );
1926         }
1927         if ( ( bExp | bSig ) == 0 ) {
1928             float_raise( float_flag_invalid STATUS_VAR);
1929             return float32_default_nan;
1930         }
1931         return packFloat32( zSign, 0xFF, 0 );
1932     }
1933     if ( bExp == 0xFF ) {
1934         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1935         if ( ( aExp | aSig ) == 0 ) {
1936             float_raise( float_flag_invalid STATUS_VAR);
1937             return float32_default_nan;
1938         }
1939         return packFloat32( zSign, 0xFF, 0 );
1940     }
1941     if ( aExp == 0 ) {
1942         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1943         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1944     }
1945     if ( bExp == 0 ) {
1946         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1947         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1948     }
1949     zExp = aExp + bExp - 0x7F;
1950     aSig = ( aSig | 0x00800000 )<<7;
1951     bSig = ( bSig | 0x00800000 )<<8;
1952     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1953     zSig = zSig64;
1954     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1955         zSig <<= 1;
1956         --zExp;
1957     }
1958     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1959
1960 }
1961
1962 /*----------------------------------------------------------------------------
1963 | Returns the result of dividing the single-precision floating-point value `a'
1964 | by the corresponding value `b'.  The operation is performed according to the
1965 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1966 *----------------------------------------------------------------------------*/
1967
1968 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1969 {
1970     flag aSign, bSign, zSign;
1971     int16 aExp, bExp, zExp;
1972     bits32 aSig, bSig, zSig;
1973     a = float32_squash_input_denormal(a STATUS_VAR);
1974     b = float32_squash_input_denormal(b STATUS_VAR);
1975
1976     aSig = extractFloat32Frac( a );
1977     aExp = extractFloat32Exp( a );
1978     aSign = extractFloat32Sign( a );
1979     bSig = extractFloat32Frac( b );
1980     bExp = extractFloat32Exp( b );
1981     bSign = extractFloat32Sign( b );
1982     zSign = aSign ^ bSign;
1983     if ( aExp == 0xFF ) {
1984         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1985         if ( bExp == 0xFF ) {
1986             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1987             float_raise( float_flag_invalid STATUS_VAR);
1988             return float32_default_nan;
1989         }
1990         return packFloat32( zSign, 0xFF, 0 );
1991     }
1992     if ( bExp == 0xFF ) {
1993         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1994         return packFloat32( zSign, 0, 0 );
1995     }
1996     if ( bExp == 0 ) {
1997         if ( bSig == 0 ) {
1998             if ( ( aExp | aSig ) == 0 ) {
1999                 float_raise( float_flag_invalid STATUS_VAR);
2000                 return float32_default_nan;
2001             }
2002             float_raise( float_flag_divbyzero STATUS_VAR);
2003             return packFloat32( zSign, 0xFF, 0 );
2004         }
2005         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2006     }
2007     if ( aExp == 0 ) {
2008         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2009         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2010     }
2011     zExp = aExp - bExp + 0x7D;
2012     aSig = ( aSig | 0x00800000 )<<7;
2013     bSig = ( bSig | 0x00800000 )<<8;
2014     if ( bSig <= ( aSig + aSig ) ) {
2015         aSig >>= 1;
2016         ++zExp;
2017     }
2018     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2019     if ( ( zSig & 0x3F ) == 0 ) {
2020         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2021     }
2022     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2023
2024 }
2025
2026 /*----------------------------------------------------------------------------
2027 | Returns the remainder of the single-precision floating-point value `a'
2028 | with respect to the corresponding value `b'.  The operation is performed
2029 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2030 *----------------------------------------------------------------------------*/
2031
2032 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2033 {
2034     flag aSign, zSign;
2035     int16 aExp, bExp, expDiff;
2036     bits32 aSig, bSig;
2037     bits32 q;
2038     bits64 aSig64, bSig64, q64;
2039     bits32 alternateASig;
2040     sbits32 sigMean;
2041     a = float32_squash_input_denormal(a STATUS_VAR);
2042     b = float32_squash_input_denormal(b STATUS_VAR);
2043
2044     aSig = extractFloat32Frac( a );
2045     aExp = extractFloat32Exp( a );
2046     aSign = extractFloat32Sign( a );
2047     bSig = extractFloat32Frac( b );
2048     bExp = extractFloat32Exp( b );
2049     if ( aExp == 0xFF ) {
2050         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2051             return propagateFloat32NaN( a, b STATUS_VAR );
2052         }
2053         float_raise( float_flag_invalid STATUS_VAR);
2054         return float32_default_nan;
2055     }
2056     if ( bExp == 0xFF ) {
2057         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2058         return a;
2059     }
2060     if ( bExp == 0 ) {
2061         if ( bSig == 0 ) {
2062             float_raise( float_flag_invalid STATUS_VAR);
2063             return float32_default_nan;
2064         }
2065         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2066     }
2067     if ( aExp == 0 ) {
2068         if ( aSig == 0 ) return a;
2069         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2070     }
2071     expDiff = aExp - bExp;
2072     aSig |= 0x00800000;
2073     bSig |= 0x00800000;
2074     if ( expDiff < 32 ) {
2075         aSig <<= 8;
2076         bSig <<= 8;
2077         if ( expDiff < 0 ) {
2078             if ( expDiff < -1 ) return a;
2079             aSig >>= 1;
2080         }
2081         q = ( bSig <= aSig );
2082         if ( q ) aSig -= bSig;
2083         if ( 0 < expDiff ) {
2084             q = ( ( (bits64) aSig )<<32 ) / bSig;
2085             q >>= 32 - expDiff;
2086             bSig >>= 2;
2087             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2088         }
2089         else {
2090             aSig >>= 2;
2091             bSig >>= 2;
2092         }
2093     }
2094     else {
2095         if ( bSig <= aSig ) aSig -= bSig;
2096         aSig64 = ( (bits64) aSig )<<40;
2097         bSig64 = ( (bits64) bSig )<<40;
2098         expDiff -= 64;
2099         while ( 0 < expDiff ) {
2100             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2101             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2102             aSig64 = - ( ( bSig * q64 )<<38 );
2103             expDiff -= 62;
2104         }
2105         expDiff += 64;
2106         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2107         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2108         q = q64>>( 64 - expDiff );
2109         bSig <<= 6;
2110         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2111     }
2112     do {
2113         alternateASig = aSig;
2114         ++q;
2115         aSig -= bSig;
2116     } while ( 0 <= (sbits32) aSig );
2117     sigMean = aSig + alternateASig;
2118     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2119         aSig = alternateASig;
2120     }
2121     zSign = ( (sbits32) aSig < 0 );
2122     if ( zSign ) aSig = - aSig;
2123     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2124
2125 }
2126
2127 /*----------------------------------------------------------------------------
2128 | Returns the square root of the single-precision floating-point value `a'.
2129 | The operation is performed according to the IEC/IEEE Standard for Binary
2130 | Floating-Point Arithmetic.
2131 *----------------------------------------------------------------------------*/
2132
2133 float32 float32_sqrt( float32 a STATUS_PARAM )
2134 {
2135     flag aSign;
2136     int16 aExp, zExp;
2137     bits32 aSig, zSig;
2138     bits64 rem, term;
2139     a = float32_squash_input_denormal(a STATUS_VAR);
2140
2141     aSig = extractFloat32Frac( a );
2142     aExp = extractFloat32Exp( a );
2143     aSign = extractFloat32Sign( a );
2144     if ( aExp == 0xFF ) {
2145         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2146         if ( ! aSign ) return a;
2147         float_raise( float_flag_invalid STATUS_VAR);
2148         return float32_default_nan;
2149     }
2150     if ( aSign ) {
2151         if ( ( aExp | aSig ) == 0 ) return a;
2152         float_raise( float_flag_invalid STATUS_VAR);
2153         return float32_default_nan;
2154     }
2155     if ( aExp == 0 ) {
2156         if ( aSig == 0 ) return float32_zero;
2157         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2158     }
2159     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2160     aSig = ( aSig | 0x00800000 )<<8;
2161     zSig = estimateSqrt32( aExp, aSig ) + 2;
2162     if ( ( zSig & 0x7F ) <= 5 ) {
2163         if ( zSig < 2 ) {
2164             zSig = 0x7FFFFFFF;
2165             goto roundAndPack;
2166         }
2167         aSig >>= aExp & 1;
2168         term = ( (bits64) zSig ) * zSig;
2169         rem = ( ( (bits64) aSig )<<32 ) - term;
2170         while ( (sbits64) rem < 0 ) {
2171             --zSig;
2172             rem += ( ( (bits64) zSig )<<1 ) | 1;
2173         }
2174         zSig |= ( rem != 0 );
2175     }
2176     shift32RightJamming( zSig, 1, &zSig );
2177  roundAndPack:
2178     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2179
2180 }
2181
2182 /*----------------------------------------------------------------------------
2183 | Returns the binary exponential of the single-precision floating-point value
2184 | `a'. The operation is performed according to the IEC/IEEE Standard for
2185 | Binary Floating-Point Arithmetic.
2186 |
2187 | Uses the following identities:
2188 |
2189 | 1. -------------------------------------------------------------------------
2190 |      x    x*ln(2)
2191 |     2  = e
2192 |
2193 | 2. -------------------------------------------------------------------------
2194 |                      2     3     4     5           n
2195 |      x        x     x     x     x     x           x
2196 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2197 |               1!    2!    3!    4!    5!          n!
2198 *----------------------------------------------------------------------------*/
2199
2200 static const float64 float32_exp2_coefficients[15] =
2201 {
2202     const_float64( 0x3ff0000000000000ll ), /*  1 */
2203     const_float64( 0x3fe0000000000000ll ), /*  2 */
2204     const_float64( 0x3fc5555555555555ll ), /*  3 */
2205     const_float64( 0x3fa5555555555555ll ), /*  4 */
2206     const_float64( 0x3f81111111111111ll ), /*  5 */
2207     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2208     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2209     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2210     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2211     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2212     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2213     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2214     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2215     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2216     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2217 };
2218
2219 float32 float32_exp2( float32 a STATUS_PARAM )
2220 {
2221     flag aSign;
2222     int16 aExp;
2223     bits32 aSig;
2224     float64 r, x, xn;
2225     int i;
2226     a = float32_squash_input_denormal(a STATUS_VAR);
2227
2228     aSig = extractFloat32Frac( a );
2229     aExp = extractFloat32Exp( a );
2230     aSign = extractFloat32Sign( a );
2231
2232     if ( aExp == 0xFF) {
2233         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2234         return (aSign) ? float32_zero : a;
2235     }
2236     if (aExp == 0) {
2237         if (aSig == 0) return float32_one;
2238     }
2239
2240     float_raise( float_flag_inexact STATUS_VAR);
2241
2242     /* ******************************* */
2243     /* using float64 for approximation */
2244     /* ******************************* */
2245     x = float32_to_float64(a STATUS_VAR);
2246     x = float64_mul(x, float64_ln2 STATUS_VAR);
2247
2248     xn = x;
2249     r = float64_one;
2250     for (i = 0 ; i < 15 ; i++) {
2251         float64 f;
2252
2253         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2254         r = float64_add(r, f STATUS_VAR);
2255
2256         xn = float64_mul(xn, x STATUS_VAR);
2257     }
2258
2259     return float64_to_float32(r, status);
2260 }
2261
2262 /*----------------------------------------------------------------------------
2263 | Returns the binary log of the single-precision floating-point value `a'.
2264 | The operation is performed according to the IEC/IEEE Standard for Binary
2265 | Floating-Point Arithmetic.
2266 *----------------------------------------------------------------------------*/
2267 float32 float32_log2( float32 a STATUS_PARAM )
2268 {
2269     flag aSign, zSign;
2270     int16 aExp;
2271     bits32 aSig, zSig, i;
2272
2273     a = float32_squash_input_denormal(a STATUS_VAR);
2274     aSig = extractFloat32Frac( a );
2275     aExp = extractFloat32Exp( a );
2276     aSign = extractFloat32Sign( a );
2277
2278     if ( aExp == 0 ) {
2279         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2280         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2281     }
2282     if ( aSign ) {
2283         float_raise( float_flag_invalid STATUS_VAR);
2284         return float32_default_nan;
2285     }
2286     if ( aExp == 0xFF ) {
2287         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2288         return a;
2289     }
2290
2291     aExp -= 0x7F;
2292     aSig |= 0x00800000;
2293     zSign = aExp < 0;
2294     zSig = aExp << 23;
2295
2296     for (i = 1 << 22; i > 0; i >>= 1) {
2297         aSig = ( (bits64)aSig * aSig ) >> 23;
2298         if ( aSig & 0x01000000 ) {
2299             aSig >>= 1;
2300             zSig |= i;
2301         }
2302     }
2303
2304     if ( zSign )
2305         zSig = -zSig;
2306
2307     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2308 }
2309
2310 /*----------------------------------------------------------------------------
2311 | Returns 1 if the single-precision floating-point value `a' is equal to
2312 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2313 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2314 *----------------------------------------------------------------------------*/
2315
2316 int float32_eq( float32 a, float32 b STATUS_PARAM )
2317 {
2318     a = float32_squash_input_denormal(a STATUS_VAR);
2319     b = float32_squash_input_denormal(b STATUS_VAR);
2320
2321     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2322          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2323        ) {
2324         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2325             float_raise( float_flag_invalid STATUS_VAR);
2326         }
2327         return 0;
2328     }
2329     return ( float32_val(a) == float32_val(b) ) ||
2330             ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2331
2332 }
2333
2334 /*----------------------------------------------------------------------------
2335 | Returns 1 if the single-precision floating-point value `a' is less than
2336 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2338 | Arithmetic.
2339 *----------------------------------------------------------------------------*/
2340
2341 int float32_le( float32 a, float32 b STATUS_PARAM )
2342 {
2343     flag aSign, bSign;
2344     bits32 av, bv;
2345     a = float32_squash_input_denormal(a STATUS_VAR);
2346     b = float32_squash_input_denormal(b STATUS_VAR);
2347
2348     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2349          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2350        ) {
2351         float_raise( float_flag_invalid STATUS_VAR);
2352         return 0;
2353     }
2354     aSign = extractFloat32Sign( a );
2355     bSign = extractFloat32Sign( b );
2356     av = float32_val(a);
2357     bv = float32_val(b);
2358     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2359     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2360
2361 }
2362
2363 /*----------------------------------------------------------------------------
2364 | Returns 1 if the single-precision floating-point value `a' is less than
2365 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2367 *----------------------------------------------------------------------------*/
2368
2369 int float32_lt( float32 a, float32 b STATUS_PARAM )
2370 {
2371     flag aSign, bSign;
2372     bits32 av, bv;
2373     a = float32_squash_input_denormal(a STATUS_VAR);
2374     b = float32_squash_input_denormal(b STATUS_VAR);
2375
2376     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2377          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2378        ) {
2379         float_raise( float_flag_invalid STATUS_VAR);
2380         return 0;
2381     }
2382     aSign = extractFloat32Sign( a );
2383     bSign = extractFloat32Sign( b );
2384     av = float32_val(a);
2385     bv = float32_val(b);
2386     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2387     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2388
2389 }
2390
2391 /*----------------------------------------------------------------------------
2392 | Returns 1 if the single-precision floating-point value `a' is equal to
2393 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2394 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2396 *----------------------------------------------------------------------------*/
2397
2398 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2399 {
2400     bits32 av, bv;
2401     a = float32_squash_input_denormal(a STATUS_VAR);
2402     b = float32_squash_input_denormal(b STATUS_VAR);
2403
2404     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2405          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2406        ) {
2407         float_raise( float_flag_invalid STATUS_VAR);
2408         return 0;
2409     }
2410     av = float32_val(a);
2411     bv = float32_val(b);
2412     return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2413
2414 }
2415
2416 /*----------------------------------------------------------------------------
2417 | Returns 1 if the single-precision floating-point value `a' is less than or
2418 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2419 | cause an exception.  Otherwise, the comparison is performed according to the
2420 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2421 *----------------------------------------------------------------------------*/
2422
2423 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2424 {
2425     flag aSign, bSign;
2426     bits32 av, bv;
2427     a = float32_squash_input_denormal(a STATUS_VAR);
2428     b = float32_squash_input_denormal(b STATUS_VAR);
2429
2430     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2431          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2432        ) {
2433         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2434             float_raise( float_flag_invalid STATUS_VAR);
2435         }
2436         return 0;
2437     }
2438     aSign = extractFloat32Sign( a );
2439     bSign = extractFloat32Sign( b );
2440     av = float32_val(a);
2441     bv = float32_val(b);
2442     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2443     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2444
2445 }
2446
2447 /*----------------------------------------------------------------------------
2448 | Returns 1 if the single-precision floating-point value `a' is less than
2449 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2450 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2451 | Standard for Binary Floating-Point Arithmetic.
2452 *----------------------------------------------------------------------------*/
2453
2454 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2455 {
2456     flag aSign, bSign;
2457     bits32 av, bv;
2458     a = float32_squash_input_denormal(a STATUS_VAR);
2459     b = float32_squash_input_denormal(b STATUS_VAR);
2460
2461     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2462          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2463        ) {
2464         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2465             float_raise( float_flag_invalid STATUS_VAR);
2466         }
2467         return 0;
2468     }
2469     aSign = extractFloat32Sign( a );
2470     bSign = extractFloat32Sign( b );
2471     av = float32_val(a);
2472     bv = float32_val(b);
2473     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2474     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2475
2476 }
2477
2478 /*----------------------------------------------------------------------------
2479 | Returns the result of converting the double-precision floating-point value
2480 | `a' to the 32-bit two's complement integer format.  The conversion is
2481 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2482 | Arithmetic---which means in particular that the conversion is rounded
2483 | according to the current rounding mode.  If `a' is a NaN, the largest
2484 | positive integer is returned.  Otherwise, if the conversion overflows, the
2485 | largest integer with the same sign as `a' is returned.
2486 *----------------------------------------------------------------------------*/
2487
2488 int32 float64_to_int32( float64 a STATUS_PARAM )
2489 {
2490     flag aSign;
2491     int16 aExp, shiftCount;
2492     bits64 aSig;
2493     a = float64_squash_input_denormal(a STATUS_VAR);
2494
2495     aSig = extractFloat64Frac( a );
2496     aExp = extractFloat64Exp( a );
2497     aSign = extractFloat64Sign( a );
2498     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2499     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2500     shiftCount = 0x42C - aExp;
2501     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2502     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2503
2504 }
2505
2506 /*----------------------------------------------------------------------------
2507 | Returns the result of converting the double-precision floating-point value
2508 | `a' to the 32-bit two's complement integer format.  The conversion is
2509 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2510 | Arithmetic, except that the conversion is always rounded toward zero.
2511 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2512 | the conversion overflows, the largest integer with the same sign as `a' is
2513 | returned.
2514 *----------------------------------------------------------------------------*/
2515
2516 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2517 {
2518     flag aSign;
2519     int16 aExp, shiftCount;
2520     bits64 aSig, savedASig;
2521     int32 z;
2522     a = float64_squash_input_denormal(a STATUS_VAR);
2523
2524     aSig = extractFloat64Frac( a );
2525     aExp = extractFloat64Exp( a );
2526     aSign = extractFloat64Sign( a );
2527     if ( 0x41E < aExp ) {
2528         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2529         goto invalid;
2530     }
2531     else if ( aExp < 0x3FF ) {
2532         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2533         return 0;
2534     }
2535     aSig |= LIT64( 0x0010000000000000 );
2536     shiftCount = 0x433 - aExp;
2537     savedASig = aSig;
2538     aSig >>= shiftCount;
2539     z = aSig;
2540     if ( aSign ) z = - z;
2541     if ( ( z < 0 ) ^ aSign ) {
2542  invalid:
2543         float_raise( float_flag_invalid STATUS_VAR);
2544         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2545     }
2546     if ( ( aSig<<shiftCount ) != savedASig ) {
2547         STATUS(float_exception_flags) |= float_flag_inexact;
2548     }
2549     return z;
2550
2551 }
2552
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of converting the double-precision floating-point value
2555 | `a' to the 16-bit two's complement integer format.  The conversion is
2556 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2557 | Arithmetic, except that the conversion is always rounded toward zero.
2558 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2559 | the conversion overflows, the largest integer with the same sign as `a' is
2560 | returned.
2561 *----------------------------------------------------------------------------*/
2562
2563 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2564 {
2565     flag aSign;
2566     int16 aExp, shiftCount;
2567     bits64 aSig, savedASig;
2568     int32 z;
2569
2570     aSig = extractFloat64Frac( a );
2571     aExp = extractFloat64Exp( a );
2572     aSign = extractFloat64Sign( a );
2573     if ( 0x40E < aExp ) {
2574         if ( ( aExp == 0x7FF ) && aSig ) {
2575             aSign = 0;
2576         }
2577         goto invalid;
2578     }
2579     else if ( aExp < 0x3FF ) {
2580         if ( aExp || aSig ) {
2581             STATUS(float_exception_flags) |= float_flag_inexact;
2582         }
2583         return 0;
2584     }
2585     aSig |= LIT64( 0x0010000000000000 );
2586     shiftCount = 0x433 - aExp;
2587     savedASig = aSig;
2588     aSig >>= shiftCount;
2589     z = aSig;
2590     if ( aSign ) {
2591         z = - z;
2592     }
2593     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2594  invalid:
2595         float_raise( float_flag_invalid STATUS_VAR);
2596         return aSign ? (sbits32) 0xffff8000 : 0x7FFF;
2597     }
2598     if ( ( aSig<<shiftCount ) != savedASig ) {
2599         STATUS(float_exception_flags) |= float_flag_inexact;
2600     }
2601     return z;
2602 }
2603
2604 /*----------------------------------------------------------------------------
2605 | Returns the result of converting the double-precision floating-point value
2606 | `a' to the 64-bit two's complement integer format.  The conversion is
2607 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2608 | Arithmetic---which means in particular that the conversion is rounded
2609 | according to the current rounding mode.  If `a' is a NaN, the largest
2610 | positive integer is returned.  Otherwise, if the conversion overflows, the
2611 | largest integer with the same sign as `a' is returned.
2612 *----------------------------------------------------------------------------*/
2613
2614 int64 float64_to_int64( float64 a STATUS_PARAM )
2615 {
2616     flag aSign;
2617     int16 aExp, shiftCount;
2618     bits64 aSig, aSigExtra;
2619     a = float64_squash_input_denormal(a STATUS_VAR);
2620
2621     aSig = extractFloat64Frac( a );
2622     aExp = extractFloat64Exp( a );
2623     aSign = extractFloat64Sign( a );
2624     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2625     shiftCount = 0x433 - aExp;
2626     if ( shiftCount <= 0 ) {
2627         if ( 0x43E < aExp ) {
2628             float_raise( float_flag_invalid STATUS_VAR);
2629             if (    ! aSign
2630                  || (    ( aExp == 0x7FF )
2631                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2632                ) {
2633                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2634             }
2635             return (sbits64) LIT64( 0x8000000000000000 );
2636         }
2637         aSigExtra = 0;
2638         aSig <<= - shiftCount;
2639     }
2640     else {
2641         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2642     }
2643     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2644
2645 }
2646
2647 /*----------------------------------------------------------------------------
2648 | Returns the result of converting the double-precision floating-point value
2649 | `a' to the 64-bit two's complement integer format.  The conversion is
2650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2651 | Arithmetic, except that the conversion is always rounded toward zero.
2652 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2653 | the conversion overflows, the largest integer with the same sign as `a' is
2654 | returned.
2655 *----------------------------------------------------------------------------*/
2656
2657 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2658 {
2659     flag aSign;
2660     int16 aExp, shiftCount;
2661     bits64 aSig;
2662     int64 z;
2663     a = float64_squash_input_denormal(a STATUS_VAR);
2664
2665     aSig = extractFloat64Frac( a );
2666     aExp = extractFloat64Exp( a );
2667     aSign = extractFloat64Sign( a );
2668     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2669     shiftCount = aExp - 0x433;
2670     if ( 0 <= shiftCount ) {
2671         if ( 0x43E <= aExp ) {
2672             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2673                 float_raise( float_flag_invalid STATUS_VAR);
2674                 if (    ! aSign
2675                      || (    ( aExp == 0x7FF )
2676                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2677                    ) {
2678                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2679                 }
2680             }
2681             return (sbits64) LIT64( 0x8000000000000000 );
2682         }
2683         z = aSig<<shiftCount;
2684     }
2685     else {
2686         if ( aExp < 0x3FE ) {
2687             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2688             return 0;
2689         }
2690         z = aSig>>( - shiftCount );
2691         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2692             STATUS(float_exception_flags) |= float_flag_inexact;
2693         }
2694     }
2695     if ( aSign ) z = - z;
2696     return z;
2697
2698 }
2699
2700 /*----------------------------------------------------------------------------
2701 | Returns the result of converting the double-precision floating-point value
2702 | `a' to the single-precision floating-point format.  The conversion is
2703 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2704 | Arithmetic.
2705 *----------------------------------------------------------------------------*/
2706
2707 float32 float64_to_float32( float64 a STATUS_PARAM )
2708 {
2709     flag aSign;
2710     int16 aExp;
2711     bits64 aSig;
2712     bits32 zSig;
2713     a = float64_squash_input_denormal(a STATUS_VAR);
2714
2715     aSig = extractFloat64Frac( a );
2716     aExp = extractFloat64Exp( a );
2717     aSign = extractFloat64Sign( a );
2718     if ( aExp == 0x7FF ) {
2719         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2720         return packFloat32( aSign, 0xFF, 0 );
2721     }
2722     shift64RightJamming( aSig, 22, &aSig );
2723     zSig = aSig;
2724     if ( aExp || zSig ) {
2725         zSig |= 0x40000000;
2726         aExp -= 0x381;
2727     }
2728     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2729
2730 }
2731
2732
2733 /*----------------------------------------------------------------------------
2734 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2735 | half-precision floating-point value, returning the result.  After being
2736 | shifted into the proper positions, the three fields are simply added
2737 | together to form the result.  This means that any integer portion of `zSig'
2738 | will be added into the exponent.  Since a properly normalized significand
2739 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2740 | than the desired result exponent whenever `zSig' is a complete, normalized
2741 | significand.
2742 *----------------------------------------------------------------------------*/
2743 static float16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2744 {
2745     return make_float16(
2746         (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig);
2747 }
2748
2749 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2750    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2751
2752 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2753 {
2754     flag aSign;
2755     int16 aExp;
2756     bits32 aSig;
2757
2758     aSign = extractFloat16Sign(a);
2759     aExp = extractFloat16Exp(a);
2760     aSig = extractFloat16Frac(a);
2761
2762     if (aExp == 0x1f && ieee) {
2763         if (aSig) {
2764             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2765         }
2766         return packFloat32(aSign, 0xff, aSig << 13);
2767     }
2768     if (aExp == 0) {
2769         int8 shiftCount;
2770
2771         if (aSig == 0) {
2772             return packFloat32(aSign, 0, 0);
2773         }
2774
2775         shiftCount = countLeadingZeros32( aSig ) - 21;
2776         aSig = aSig << shiftCount;
2777         aExp = -shiftCount;
2778     }
2779     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2780 }
2781
2782 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2783 {
2784     flag aSign;
2785     int16 aExp;
2786     bits32 aSig;
2787     bits32 mask;
2788     bits32 increment;
2789     int8 roundingMode;
2790     a = float32_squash_input_denormal(a STATUS_VAR);
2791
2792     aSig = extractFloat32Frac( a );
2793     aExp = extractFloat32Exp( a );
2794     aSign = extractFloat32Sign( a );
2795     if ( aExp == 0xFF ) {
2796         if (aSig) {
2797             /* Input is a NaN */
2798             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2799             if (!ieee) {
2800                 return packFloat16(aSign, 0, 0);
2801             }
2802             return r;
2803         }
2804         /* Infinity */
2805         if (!ieee) {
2806             float_raise(float_flag_invalid STATUS_VAR);
2807             return packFloat16(aSign, 0x1f, 0x3ff);
2808         }
2809         return packFloat16(aSign, 0x1f, 0);
2810     }
2811     if (aExp == 0 && aSig == 0) {
2812         return packFloat16(aSign, 0, 0);
2813     }
2814     /* Decimal point between bits 22 and 23.  */
2815     aSig |= 0x00800000;
2816     aExp -= 0x7f;
2817     if (aExp < -14) {
2818         mask = 0x00ffffff;
2819         if (aExp >= -24) {
2820             mask >>= 25 + aExp;
2821         }
2822     } else {
2823         mask = 0x00001fff;
2824     }
2825     if (aSig & mask) {
2826         float_raise( float_flag_underflow STATUS_VAR );
2827         roundingMode = STATUS(float_rounding_mode);
2828         switch (roundingMode) {
2829         case float_round_nearest_even:
2830             increment = (mask + 1) >> 1;
2831             if ((aSig & mask) == increment) {
2832                 increment = aSig & (increment << 1);
2833             }
2834             break;
2835         case float_round_up:
2836             increment = aSign ? 0 : mask;
2837             break;
2838         case float_round_down:
2839             increment = aSign ? mask : 0;
2840             break;
2841         default: /* round_to_zero */
2842             increment = 0;
2843             break;
2844         }
2845         aSig += increment;
2846         if (aSig >= 0x01000000) {
2847             aSig >>= 1;
2848             aExp++;
2849         }
2850     } else if (aExp < -14
2851           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2852         float_raise( float_flag_underflow STATUS_VAR);
2853     }
2854
2855     if (ieee) {
2856         if (aExp > 15) {
2857             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2858             return packFloat16(aSign, 0x1f, 0);
2859         }
2860     } else {
2861         if (aExp > 16) {
2862             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2863             return packFloat16(aSign, 0x1f, 0x3ff);
2864         }
2865     }
2866     if (aExp < -24) {
2867         return packFloat16(aSign, 0, 0);
2868     }
2869     if (aExp < -14) {
2870         aSig >>= -14 - aExp;
2871         aExp = -14;
2872     }
2873     return packFloat16(aSign, aExp + 14, aSig >> 13);
2874 }
2875
2876 #ifdef FLOATX80
2877
2878 /*----------------------------------------------------------------------------
2879 | Returns the result of converting the double-precision floating-point value
2880 | `a' to the extended double-precision floating-point format.  The conversion
2881 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2882 | Arithmetic.
2883 *----------------------------------------------------------------------------*/
2884
2885 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2886 {
2887     flag aSign;
2888     int16 aExp;
2889     bits64 aSig;
2890
2891     a = float64_squash_input_denormal(a STATUS_VAR);
2892     aSig = extractFloat64Frac( a );
2893     aExp = extractFloat64Exp( a );
2894     aSign = extractFloat64Sign( a );
2895     if ( aExp == 0x7FF ) {
2896         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2897         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2898     }
2899     if ( aExp == 0 ) {
2900         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2901         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2902     }
2903     return
2904         packFloatx80(
2905             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2906
2907 }
2908
2909 #endif
2910
2911 #ifdef FLOAT128
2912
2913 /*----------------------------------------------------------------------------
2914 | Returns the result of converting the double-precision floating-point value
2915 | `a' to the quadruple-precision floating-point format.  The conversion is
2916 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2917 | Arithmetic.
2918 *----------------------------------------------------------------------------*/
2919
2920 float128 float64_to_float128( float64 a STATUS_PARAM )
2921 {
2922     flag aSign;
2923     int16 aExp;
2924     bits64 aSig, zSig0, zSig1;
2925
2926     a = float64_squash_input_denormal(a STATUS_VAR);
2927     aSig = extractFloat64Frac( a );
2928     aExp = extractFloat64Exp( a );
2929     aSign = extractFloat64Sign( a );
2930     if ( aExp == 0x7FF ) {
2931         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2932         return packFloat128( aSign, 0x7FFF, 0, 0 );
2933     }
2934     if ( aExp == 0 ) {
2935         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2936         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2937         --aExp;
2938     }
2939     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2940     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2941
2942 }
2943
2944 #endif
2945
2946 /*----------------------------------------------------------------------------
2947 | Rounds the double-precision floating-point value `a' to an integer, and
2948 | returns the result as a double-precision floating-point value.  The
2949 | operation is performed according to the IEC/IEEE Standard for Binary
2950 | Floating-Point Arithmetic.
2951 *----------------------------------------------------------------------------*/
2952
2953 float64 float64_round_to_int( float64 a STATUS_PARAM )
2954 {
2955     flag aSign;
2956     int16 aExp;
2957     bits64 lastBitMask, roundBitsMask;
2958     int8 roundingMode;
2959     bits64 z;
2960     a = float64_squash_input_denormal(a STATUS_VAR);
2961
2962     aExp = extractFloat64Exp( a );
2963     if ( 0x433 <= aExp ) {
2964         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2965             return propagateFloat64NaN( a, a STATUS_VAR );
2966         }
2967         return a;
2968     }
2969     if ( aExp < 0x3FF ) {
2970         if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2971         STATUS(float_exception_flags) |= float_flag_inexact;
2972         aSign = extractFloat64Sign( a );
2973         switch ( STATUS(float_rounding_mode) ) {
2974          case float_round_nearest_even:
2975             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2976                 return packFloat64( aSign, 0x3FF, 0 );
2977             }
2978             break;
2979          case float_round_down:
2980             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2981          case float_round_up:
2982             return make_float64(
2983             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2984         }
2985         return packFloat64( aSign, 0, 0 );
2986     }
2987     lastBitMask = 1;
2988     lastBitMask <<= 0x433 - aExp;
2989     roundBitsMask = lastBitMask - 1;
2990     z = float64_val(a);
2991     roundingMode = STATUS(float_rounding_mode);
2992     if ( roundingMode == float_round_nearest_even ) {
2993         z += lastBitMask>>1;
2994         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2995     }
2996     else if ( roundingMode != float_round_to_zero ) {
2997         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2998             z += roundBitsMask;
2999         }
3000     }
3001     z &= ~ roundBitsMask;
3002     if ( z != float64_val(a) )
3003         STATUS(float_exception_flags) |= float_flag_inexact;
3004     return make_float64(z);
3005
3006 }
3007
3008 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3009 {
3010     int oldmode;
3011     float64 res;
3012     oldmode = STATUS(float_rounding_mode);
3013     STATUS(float_rounding_mode) = float_round_to_zero;
3014     res = float64_round_to_int(a STATUS_VAR);
3015     STATUS(float_rounding_mode) = oldmode;
3016     return res;
3017 }
3018
3019 /*----------------------------------------------------------------------------
3020 | Returns the result of adding the absolute values of the double-precision
3021 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3022 | before being returned.  `zSign' is ignored if the result is a NaN.
3023 | The addition is performed according to the IEC/IEEE Standard for Binary
3024 | Floating-Point Arithmetic.
3025 *----------------------------------------------------------------------------*/
3026
3027 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3028 {
3029     int16 aExp, bExp, zExp;
3030     bits64 aSig, bSig, zSig;
3031     int16 expDiff;
3032
3033     aSig = extractFloat64Frac( a );
3034     aExp = extractFloat64Exp( a );
3035     bSig = extractFloat64Frac( b );
3036     bExp = extractFloat64Exp( b );
3037     expDiff = aExp - bExp;
3038     aSig <<= 9;
3039     bSig <<= 9;
3040     if ( 0 < expDiff ) {
3041         if ( aExp == 0x7FF ) {
3042             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3043             return a;
3044         }
3045         if ( bExp == 0 ) {
3046             --expDiff;
3047         }
3048         else {
3049             bSig |= LIT64( 0x2000000000000000 );
3050         }
3051         shift64RightJamming( bSig, expDiff, &bSig );
3052         zExp = aExp;
3053     }
3054     else if ( expDiff < 0 ) {
3055         if ( bExp == 0x7FF ) {
3056             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3057             return packFloat64( zSign, 0x7FF, 0 );
3058         }
3059         if ( aExp == 0 ) {
3060             ++expDiff;
3061         }
3062         else {
3063             aSig |= LIT64( 0x2000000000000000 );
3064         }
3065         shift64RightJamming( aSig, - expDiff, &aSig );
3066         zExp = bExp;
3067     }
3068     else {
3069         if ( aExp == 0x7FF ) {
3070             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3071             return a;
3072         }
3073         if ( aExp == 0 ) {
3074             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3075             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3076         }
3077         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3078         zExp = aExp;
3079         goto roundAndPack;
3080     }
3081     aSig |= LIT64( 0x2000000000000000 );
3082     zSig = ( aSig + bSig )<<1;
3083     --zExp;
3084     if ( (sbits64) zSig < 0 ) {
3085         zSig = aSig + bSig;
3086         ++zExp;
3087     }
3088  roundAndPack:
3089     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3090
3091 }
3092
3093 /*----------------------------------------------------------------------------
3094 | Returns the result of subtracting the absolute values of the double-
3095 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3096 | difference is negated before being returned.  `zSign' is ignored if the
3097 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3098 | Standard for Binary Floating-Point Arithmetic.
3099 *----------------------------------------------------------------------------*/
3100
3101 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3102 {
3103     int16 aExp, bExp, zExp;
3104     bits64 aSig, bSig, zSig;
3105     int16 expDiff;
3106
3107     aSig = extractFloat64Frac( a );
3108     aExp = extractFloat64Exp( a );
3109     bSig = extractFloat64Frac( b );
3110     bExp = extractFloat64Exp( b );
3111     expDiff = aExp - bExp;
3112     aSig <<= 10;
3113     bSig <<= 10;
3114     if ( 0 < expDiff ) goto aExpBigger;
3115     if ( expDiff < 0 ) goto bExpBigger;
3116     if ( aExp == 0x7FF ) {
3117         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3118         float_raise( float_flag_invalid STATUS_VAR);
3119         return float64_default_nan;
3120     }
3121     if ( aExp == 0 ) {
3122         aExp = 1;
3123         bExp = 1;
3124     }
3125     if ( bSig < aSig ) goto aBigger;
3126     if ( aSig < bSig ) goto bBigger;
3127     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3128  bExpBigger:
3129     if ( bExp == 0x7FF ) {
3130         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3131         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3132     }
3133     if ( aExp == 0 ) {
3134         ++expDiff;
3135     }
3136     else {
3137         aSig |= LIT64( 0x4000000000000000 );
3138     }
3139     shift64RightJamming( aSig, - expDiff, &aSig );
3140     bSig |= LIT64( 0x4000000000000000 );
3141  bBigger:
3142     zSig = bSig - aSig;
3143     zExp = bExp;
3144     zSign ^= 1;
3145     goto normalizeRoundAndPack;
3146  aExpBigger:
3147     if ( aExp == 0x7FF ) {
3148         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3149         return a;
3150     }
3151     if ( bExp == 0 ) {
3152         --expDiff;
3153     }
3154     else {
3155         bSig |= LIT64( 0x4000000000000000 );
3156     }
3157     shift64RightJamming( bSig, expDiff, &bSig );
3158     aSig |= LIT64( 0x4000000000000000 );
3159  aBigger:
3160     zSig = aSig - bSig;
3161     zExp = aExp;
3162  normalizeRoundAndPack:
3163     --zExp;
3164     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3165
3166 }
3167
3168 /*----------------------------------------------------------------------------
3169 | Returns the result of adding the double-precision floating-point values `a'
3170 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3171 | Binary Floating-Point Arithmetic.
3172 *----------------------------------------------------------------------------*/
3173
3174 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3175 {
3176     flag aSign, bSign;
3177     a = float64_squash_input_denormal(a STATUS_VAR);
3178     b = float64_squash_input_denormal(b STATUS_VAR);
3179
3180     aSign = extractFloat64Sign( a );
3181     bSign = extractFloat64Sign( b );
3182     if ( aSign == bSign ) {
3183         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3184     }
3185     else {
3186         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3187     }
3188
3189 }
3190
3191 /*----------------------------------------------------------------------------
3192 | Returns the result of subtracting the double-precision floating-point values
3193 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3194 | for Binary Floating-Point Arithmetic.
3195 *----------------------------------------------------------------------------*/
3196
3197 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3198 {
3199     flag aSign, bSign;
3200     a = float64_squash_input_denormal(a STATUS_VAR);
3201     b = float64_squash_input_denormal(b STATUS_VAR);
3202
3203     aSign = extractFloat64Sign( a );
3204     bSign = extractFloat64Sign( b );
3205     if ( aSign == bSign ) {
3206         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3207     }
3208     else {
3209         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3210     }
3211
3212 }
3213
3214 /*----------------------------------------------------------------------------
3215 | Returns the result of multiplying the double-precision floating-point values
3216 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3217 | for Binary Floating-Point Arithmetic.
3218 *----------------------------------------------------------------------------*/
3219
3220 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3221 {
3222     flag aSign, bSign, zSign;
3223     int16 aExp, bExp, zExp;
3224     bits64 aSig, bSig, zSig0, zSig1;
3225
3226     a = float64_squash_input_denormal(a STATUS_VAR);
3227     b = float64_squash_input_denormal(b STATUS_VAR);
3228
3229     aSig = extractFloat64Frac( a );
3230     aExp = extractFloat64Exp( a );
3231     aSign = extractFloat64Sign( a );
3232     bSig = extractFloat64Frac( b );
3233     bExp = extractFloat64Exp( b );
3234     bSign = extractFloat64Sign( b );
3235     zSign = aSign ^ bSign;
3236     if ( aExp == 0x7FF ) {
3237         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3238             return propagateFloat64NaN( a, b STATUS_VAR );
3239         }
3240         if ( ( bExp | bSig ) == 0 ) {
3241             float_raise( float_flag_invalid STATUS_VAR);
3242             return float64_default_nan;
3243         }
3244         return packFloat64( zSign, 0x7FF, 0 );
3245     }
3246     if ( bExp == 0x7FF ) {
3247         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3248         if ( ( aExp | aSig ) == 0 ) {
3249             float_raise( float_flag_invalid STATUS_VAR);
3250             return float64_default_nan;
3251         }
3252         return packFloat64( zSign, 0x7FF, 0 );
3253     }
3254     if ( aExp == 0 ) {
3255         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3256         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3257     }
3258     if ( bExp == 0 ) {
3259         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3260         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3261     }
3262     zExp = aExp + bExp - 0x3FF;
3263     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3264     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3265     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3266     zSig0 |= ( zSig1 != 0 );
3267     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3268         zSig0 <<= 1;
3269         --zExp;
3270     }
3271     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3272
3273 }
3274
3275 /*----------------------------------------------------------------------------
3276 | Returns the result of dividing the double-precision floating-point value `a'
3277 | by the corresponding value `b'.  The operation is performed according to
3278 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3279 *----------------------------------------------------------------------------*/
3280
3281 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3282 {
3283     flag aSign, bSign, zSign;
3284     int16 aExp, bExp, zExp;
3285     bits64 aSig, bSig, zSig;
3286     bits64 rem0, rem1;
3287     bits64 term0, term1;
3288     a = float64_squash_input_denormal(a STATUS_VAR);
3289     b = float64_squash_input_denormal(b STATUS_VAR);
3290
3291     aSig = extractFloat64Frac( a );
3292     aExp = extractFloat64Exp( a );
3293     aSign = extractFloat64Sign( a );
3294     bSig = extractFloat64Frac( b );
3295     bExp = extractFloat64Exp( b );
3296     bSign = extractFloat64Sign( b );
3297     zSign = aSign ^ bSign;
3298     if ( aExp == 0x7FF ) {
3299         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3300         if ( bExp == 0x7FF ) {
3301             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3302             float_raise( float_flag_invalid STATUS_VAR);
3303             return float64_default_nan;
3304         }
3305         return packFloat64( zSign, 0x7FF, 0 );
3306     }
3307     if ( bExp == 0x7FF ) {
3308         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3309         return packFloat64( zSign, 0, 0 );
3310     }
3311     if ( bExp == 0 ) {
3312         if ( bSig == 0 ) {
3313             if ( ( aExp | aSig ) == 0 ) {
3314                 float_raise( float_flag_invalid STATUS_VAR);
3315                 return float64_default_nan;
3316             }
3317             float_raise( float_flag_divbyzero STATUS_VAR);
3318             return packFloat64( zSign, 0x7FF, 0 );
3319         }
3320         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3321     }
3322     if ( aExp == 0 ) {
3323         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3324         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3325     }
3326     zExp = aExp - bExp + 0x3FD;
3327     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3328     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3329     if ( bSig <= ( aSig + aSig ) ) {
3330         aSig >>= 1;
3331         ++zExp;
3332     }
3333     zSig = estimateDiv128To64( aSig, 0, bSig );
3334     if ( ( zSig & 0x1FF ) <= 2 ) {
3335         mul64To128( bSig, zSig, &term0, &term1 );
3336         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3337         while ( (sbits64) rem0 < 0 ) {
3338             --zSig;
3339             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3340         }
3341         zSig |= ( rem1 != 0 );
3342     }
3343     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3344
3345 }
3346
3347 /*----------------------------------------------------------------------------
3348 | Returns the remainder of the double-precision floating-point value `a'
3349 | with respect to the corresponding value `b'.  The operation is performed
3350 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3351 *----------------------------------------------------------------------------*/
3352
3353 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3354 {
3355     flag aSign, zSign;
3356     int16 aExp, bExp, expDiff;
3357     bits64 aSig, bSig;
3358     bits64 q, alternateASig;
3359     sbits64 sigMean;
3360
3361     a = float64_squash_input_denormal(a STATUS_VAR);
3362     b = float64_squash_input_denormal(b STATUS_VAR);
3363     aSig = extractFloat64Frac( a );
3364     aExp = extractFloat64Exp( a );
3365     aSign = extractFloat64Sign( a );
3366     bSig = extractFloat64Frac( b );
3367     bExp = extractFloat64Exp( b );
3368     if ( aExp == 0x7FF ) {
3369         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3370             return propagateFloat64NaN( a, b STATUS_VAR );
3371         }
3372         float_raise( float_flag_invalid STATUS_VAR);
3373         return float64_default_nan;
3374     }
3375     if ( bExp == 0x7FF ) {
3376         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3377         return a;
3378     }
3379     if ( bExp == 0 ) {
3380         if ( bSig == 0 ) {
3381             float_raise( float_flag_invalid STATUS_VAR);
3382             return float64_default_nan;
3383         }
3384         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3385     }
3386     if ( aExp == 0 ) {
3387         if ( aSig == 0 ) return a;
3388         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3389     }
3390     expDiff = aExp - bExp;
3391     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3392     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3393     if ( expDiff < 0 ) {
3394         if ( expDiff < -1 ) return a;
3395         aSig >>= 1;
3396     }
3397     q = ( bSig <= aSig );
3398     if ( q ) aSig -= bSig;
3399     expDiff -= 64;
3400     while ( 0 < expDiff ) {
3401         q = estimateDiv128To64( aSig, 0, bSig );
3402         q = ( 2 < q ) ? q - 2 : 0;
3403         aSig = - ( ( bSig>>2 ) * q );
3404         expDiff -= 62;
3405     }
3406     expDiff += 64;
3407     if ( 0 < expDiff ) {
3408         q = estimateDiv128To64( aSig, 0, bSig );
3409         q = ( 2 < q ) ? q - 2 : 0;
3410         q >>= 64 - expDiff;
3411         bSig >>= 2;
3412         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3413     }
3414     else {
3415         aSig >>= 2;
3416         bSig >>= 2;
3417     }
3418     do {
3419         alternateASig = aSig;
3420         ++q;
3421         aSig -= bSig;
3422     } while ( 0 <= (sbits64) aSig );
3423     sigMean = aSig + alternateASig;
3424     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3425         aSig = alternateASig;
3426     }
3427     zSign = ( (sbits64) aSig < 0 );
3428     if ( zSign ) aSig = - aSig;
3429     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3430
3431 }
3432
3433 /*----------------------------------------------------------------------------
3434 | Returns the square root of the double-precision floating-point value `a'.
3435 | The operation is performed according to the IEC/IEEE Standard for Binary
3436 | Floating-Point Arithmetic.
3437 *----------------------------------------------------------------------------*/
3438
3439 float64 float64_sqrt( float64 a STATUS_PARAM )
3440 {
3441     flag aSign;
3442     int16 aExp, zExp;
3443     bits64 aSig, zSig, doubleZSig;
3444     bits64 rem0, rem1, term0, term1;
3445     a = float64_squash_input_denormal(a STATUS_VAR);
3446
3447     aSig = extractFloat64Frac( a );
3448     aExp = extractFloat64Exp( a );
3449     aSign = extractFloat64Sign( a );
3450     if ( aExp == 0x7FF ) {
3451         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3452         if ( ! aSign ) return a;
3453         float_raise( float_flag_invalid STATUS_VAR);
3454         return float64_default_nan;
3455     }
3456     if ( aSign ) {
3457         if ( ( aExp | aSig ) == 0 ) return a;
3458         float_raise( float_flag_invalid STATUS_VAR);
3459         return float64_default_nan;
3460     }
3461     if ( aExp == 0 ) {
3462         if ( aSig == 0 ) return float64_zero;
3463         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3464     }
3465     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3466     aSig |= LIT64( 0x0010000000000000 );
3467     zSig = estimateSqrt32( aExp, aSig>>21 );
3468     aSig <<= 9 - ( aExp & 1 );
3469     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3470     if ( ( zSig & 0x1FF ) <= 5 ) {
3471         doubleZSig = zSig<<1;
3472         mul64To128( zSig, zSig, &term0, &term1 );
3473         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3474         while ( (sbits64) rem0 < 0 ) {
3475             --zSig;
3476             doubleZSig -= 2;
3477             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3478         }
3479         zSig |= ( ( rem0 | rem1 ) != 0 );
3480     }
3481     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3482
3483 }
3484
3485 /*----------------------------------------------------------------------------
3486 | Returns the binary log of the double-precision floating-point value `a'.
3487 | The operation is performed according to the IEC/IEEE Standard for Binary
3488 | Floating-Point Arithmetic.
3489 *----------------------------------------------------------------------------*/
3490 float64 float64_log2( float64 a STATUS_PARAM )
3491 {
3492     flag aSign, zSign;
3493     int16 aExp;
3494     bits64 aSig, aSig0, aSig1, zSig, i;
3495     a = float64_squash_input_denormal(a STATUS_VAR);
3496
3497     aSig = extractFloat64Frac( a );
3498     aExp = extractFloat64Exp( a );
3499     aSign = extractFloat64Sign( a );
3500
3501     if ( aExp == 0 ) {
3502         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3503         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3504     }
3505     if ( aSign ) {
3506         float_raise( float_flag_invalid STATUS_VAR);
3507         return float64_default_nan;
3508     }
3509     if ( aExp == 0x7FF ) {
3510         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3511         return a;
3512     }
3513
3514     aExp -= 0x3FF;
3515     aSig |= LIT64( 0x0010000000000000 );
3516     zSign = aExp < 0;
3517     zSig = (bits64)aExp << 52;
3518     for (i = 1LL << 51; i > 0; i >>= 1) {
3519         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3520         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3521         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3522             aSig >>= 1;
3523             zSig |= i;
3524         }
3525     }
3526
3527     if ( zSign )
3528         zSig = -zSig;
3529     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3530 }
3531
3532 /*----------------------------------------------------------------------------
3533 | Returns 1 if the double-precision floating-point value `a' is equal to the
3534 | corresponding value `b', and 0 otherwise.  The comparison is performed
3535 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3536 *----------------------------------------------------------------------------*/
3537
3538 int float64_eq( float64 a, float64 b STATUS_PARAM )
3539 {
3540     bits64 av, bv;
3541     a = float64_squash_input_denormal(a STATUS_VAR);
3542     b = float64_squash_input_denormal(b STATUS_VAR);
3543
3544     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3545          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3546        ) {
3547         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3548             float_raise( float_flag_invalid STATUS_VAR);
3549         }
3550         return 0;
3551     }
3552     av = float64_val(a);
3553     bv = float64_val(b);
3554     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3555
3556 }
3557
3558 /*----------------------------------------------------------------------------
3559 | Returns 1 if the double-precision floating-point value `a' is less than or
3560 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3561 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3562 | Arithmetic.
3563 *----------------------------------------------------------------------------*/
3564
3565 int float64_le( float64 a, float64 b STATUS_PARAM )
3566 {
3567     flag aSign, bSign;
3568     bits64 av, bv;
3569     a = float64_squash_input_denormal(a STATUS_VAR);
3570     b = float64_squash_input_denormal(b STATUS_VAR);
3571
3572     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3573          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3574        ) {
3575         float_raise( float_flag_invalid STATUS_VAR);
3576         return 0;
3577     }
3578     aSign = extractFloat64Sign( a );
3579     bSign = extractFloat64Sign( b );
3580     av = float64_val(a);
3581     bv = float64_val(b);
3582     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3583     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3584
3585 }
3586
3587 /*----------------------------------------------------------------------------
3588 | Returns 1 if the double-precision floating-point value `a' is less than
3589 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3590 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3591 *----------------------------------------------------------------------------*/
3592
3593 int float64_lt( float64 a, float64 b STATUS_PARAM )
3594 {
3595     flag aSign, bSign;
3596     bits64 av, bv;
3597
3598     a = float64_squash_input_denormal(a STATUS_VAR);
3599     b = float64_squash_input_denormal(b STATUS_VAR);
3600     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3601          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3602        ) {
3603         float_raise( float_flag_invalid STATUS_VAR);
3604         return 0;
3605     }
3606     aSign = extractFloat64Sign( a );
3607     bSign = extractFloat64Sign( b );
3608     av = float64_val(a);
3609     bv = float64_val(b);
3610     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3611     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3612
3613 }
3614
3615 /*----------------------------------------------------------------------------
3616 | Returns 1 if the double-precision floating-point value `a' is equal to the
3617 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3618 | if either operand is a NaN.  Otherwise, the comparison is performed
3619 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3620 *----------------------------------------------------------------------------*/
3621
3622 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3623 {
3624     bits64 av, bv;
3625     a = float64_squash_input_denormal(a STATUS_VAR);
3626     b = float64_squash_input_denormal(b STATUS_VAR);
3627
3628     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3629          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3630        ) {
3631         float_raise( float_flag_invalid STATUS_VAR);
3632         return 0;
3633     }
3634     av = float64_val(a);
3635     bv = float64_val(b);
3636     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3637
3638 }
3639
3640 /*----------------------------------------------------------------------------
3641 | Returns 1 if the double-precision floating-point value `a' is less than or
3642 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3643 | cause an exception.  Otherwise, the comparison is performed according to the
3644 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3645 *----------------------------------------------------------------------------*/
3646
3647 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3648 {
3649     flag aSign, bSign;
3650     bits64 av, bv;
3651     a = float64_squash_input_denormal(a STATUS_VAR);
3652     b = float64_squash_input_denormal(b STATUS_VAR);
3653
3654     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3655          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3656        ) {
3657         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3658             float_raise( float_flag_invalid STATUS_VAR);
3659         }
3660         return 0;
3661     }
3662     aSign = extractFloat64Sign( a );
3663     bSign = extractFloat64Sign( b );
3664     av = float64_val(a);
3665     bv = float64_val(b);
3666     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3667     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3668
3669 }
3670
3671 /*----------------------------------------------------------------------------
3672 | Returns 1 if the double-precision floating-point value `a' is less than
3673 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3674 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3675 | Standard for Binary Floating-Point Arithmetic.
3676 *----------------------------------------------------------------------------*/
3677
3678 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3679 {
3680     flag aSign, bSign;
3681     bits64 av, bv;
3682     a = float64_squash_input_denormal(a STATUS_VAR);
3683     b = float64_squash_input_denormal(b STATUS_VAR);
3684
3685     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3686          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3687        ) {
3688         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3689             float_raise( float_flag_invalid STATUS_VAR);
3690         }
3691         return 0;
3692     }
3693     aSign = extractFloat64Sign( a );
3694     bSign = extractFloat64Sign( b );
3695     av = float64_val(a);
3696     bv = float64_val(b);
3697     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3698     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3699
3700 }
3701
3702 #ifdef FLOATX80
3703
3704 /*----------------------------------------------------------------------------
3705 | Returns the result of converting the extended double-precision floating-
3706 | point value `a' to the 32-bit two's complement integer format.  The
3707 | conversion is performed according to the IEC/IEEE Standard for Binary
3708 | Floating-Point Arithmetic---which means in particular that the conversion
3709 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3710 | largest positive integer is returned.  Otherwise, if the conversion
3711 | overflows, the largest integer with the same sign as `a' is returned.
3712 *----------------------------------------------------------------------------*/
3713
3714 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3715 {
3716     flag aSign;
3717     int32 aExp, shiftCount;
3718     bits64 aSig;
3719
3720     aSig = extractFloatx80Frac( a );
3721     aExp = extractFloatx80Exp( a );
3722     aSign = extractFloatx80Sign( a );
3723     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3724     shiftCount = 0x4037 - aExp;
3725     if ( shiftCount <= 0 ) shiftCount = 1;
3726     shift64RightJamming( aSig, shiftCount, &aSig );
3727     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3728
3729 }
3730
3731 /*----------------------------------------------------------------------------
3732 | Returns the result of converting the extended double-precision floating-
3733 | point value `a' to the 32-bit two's complement integer format.  The
3734 | conversion is performed according to the IEC/IEEE Standard for Binary
3735 | Floating-Point Arithmetic, except that the conversion is always rounded
3736 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3737 | Otherwise, if the conversion overflows, the largest integer with the same
3738 | sign as `a' is returned.
3739 *----------------------------------------------------------------------------*/
3740
3741 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3742 {
3743     flag aSign;
3744     int32 aExp, shiftCount;
3745     bits64 aSig, savedASig;
3746     int32 z;
3747
3748     aSig = extractFloatx80Frac( a );
3749     aExp = extractFloatx80Exp( a );
3750     aSign = extractFloatx80Sign( a );
3751     if ( 0x401E < aExp ) {
3752         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3753         goto invalid;
3754     }
3755     else if ( aExp < 0x3FFF ) {
3756         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3757         return 0;
3758     }
3759     shiftCount = 0x403E - aExp;
3760     savedASig = aSig;
3761     aSig >>= shiftCount;
3762     z = aSig;
3763     if ( aSign ) z = - z;
3764     if ( ( z < 0 ) ^ aSign ) {
3765  invalid:
3766         float_raise( float_flag_invalid STATUS_VAR);
3767         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3768     }
3769     if ( ( aSig<<shiftCount ) != savedASig ) {
3770         STATUS(float_exception_flags) |= float_flag_inexact;
3771     }
3772     return z;
3773
3774 }
3775
3776 /*----------------------------------------------------------------------------
3777 | Returns the result of converting the extended double-precision floating-
3778 | point value `a' to the 64-bit two's complement integer format.  The
3779 | conversion is performed according to the IEC/IEEE Standard for Binary
3780 | Floating-Point Arithmetic---which means in particular that the conversion
3781 | is rounded according to the current rounding mode.  If `a' is a NaN,
3782 | the largest positive integer is returned.  Otherwise, if the conversion
3783 | overflows, the largest integer with the same sign as `a' is returned.
3784 *----------------------------------------------------------------------------*/
3785
3786 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3787 {
3788     flag aSign;
3789     int32 aExp, shiftCount;
3790     bits64 aSig, aSigExtra;
3791
3792     aSig = extractFloatx80Frac( a );
3793     aExp = extractFloatx80Exp( a );
3794     aSign = extractFloatx80Sign( a );
3795     shiftCount = 0x403E - aExp;
3796     if ( shiftCount <= 0 ) {
3797         if ( shiftCount ) {
3798             float_raise( float_flag_invalid STATUS_VAR);
3799             if (    ! aSign
3800                  || (    ( aExp == 0x7FFF )
3801                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3802                ) {
3803                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3804             }
3805             return (sbits64) LIT64( 0x8000000000000000 );
3806         }
3807         aSigExtra = 0;
3808     }
3809     else {
3810         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3811     }
3812     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3813
3814 }
3815
3816 /*----------------------------------------------------------------------------
3817 | Returns the result of converting the extended double-precision floating-
3818 | point value `a' to the 64-bit two's complement integer format.  The
3819 | conversion is performed according to the IEC/IEEE Standard for Binary
3820 | Floating-Point Arithmetic, except that the conversion is always rounded
3821 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3822 | Otherwise, if the conversion overflows, the largest integer with the same
3823 | sign as `a' is returned.
3824 *----------------------------------------------------------------------------*/
3825
3826 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3827 {
3828     flag aSign;
3829     int32 aExp, shiftCount;
3830     bits64 aSig;
3831     int64 z;
3832
3833     aSig = extractFloatx80Frac( a );
3834     aExp = extractFloatx80Exp( a );
3835     aSign = extractFloatx80Sign( a );
3836     shiftCount = aExp - 0x403E;
3837     if ( 0 <= shiftCount ) {
3838         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3839         if ( ( a.high != 0xC03E ) || aSig ) {
3840             float_raise( float_flag_invalid STATUS_VAR);
3841             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3842                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3843             }
3844         }
3845         return (sbits64) LIT64( 0x8000000000000000 );
3846     }
3847     else if ( aExp < 0x3FFF ) {
3848         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3849         return 0;
3850     }
3851     z = aSig>>( - shiftCount );
3852     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3853         STATUS(float_exception_flags) |= float_flag_inexact;
3854     }
3855     if ( aSign ) z = - z;
3856     return z;
3857
3858 }
3859
3860 /*----------------------------------------------------------------------------
3861 | Returns the result of converting the extended double-precision floating-
3862 | point value `a' to the single-precision floating-point format.  The
3863 | conversion is performed according to the IEC/IEEE Standard for Binary
3864 | Floating-Point Arithmetic.
3865 *----------------------------------------------------------------------------*/
3866
3867 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3868 {
3869     flag aSign;
3870     int32 aExp;
3871     bits64 aSig;
3872
3873     aSig = extractFloatx80Frac( a );
3874     aExp = extractFloatx80Exp( a );
3875     aSign = extractFloatx80Sign( a );
3876     if ( aExp == 0x7FFF ) {
3877         if ( (bits64) ( aSig<<1 ) ) {
3878             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3879         }
3880         return packFloat32( aSign, 0xFF, 0 );
3881     }
3882     shift64RightJamming( aSig, 33, &aSig );
3883     if ( aExp || aSig ) aExp -= 0x3F81;
3884     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3885
3886 }
3887
3888 /*----------------------------------------------------------------------------
3889 | Returns the result of converting the extended double-precision floating-
3890 | point value `a' to the double-precision floating-point format.  The
3891 | conversion is performed according to the IEC/IEEE Standard for Binary
3892 | Floating-Point Arithmetic.
3893 *----------------------------------------------------------------------------*/
3894
3895 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3896 {
3897     flag aSign;
3898     int32 aExp;
3899     bits64 aSig, zSig;
3900
3901     aSig = extractFloatx80Frac( a );
3902     aExp = extractFloatx80Exp( a );
3903     aSign = extractFloatx80Sign( a );
3904     if ( aExp == 0x7FFF ) {
3905         if ( (bits64) ( aSig<<1 ) ) {
3906             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3907         }
3908         return packFloat64( aSign, 0x7FF, 0 );
3909     }
3910     shift64RightJamming( aSig, 1, &zSig );
3911     if ( aExp || aSig ) aExp -= 0x3C01;
3912     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3913
3914 }
3915
3916 #ifdef FLOAT128
3917
3918 /*----------------------------------------------------------------------------
3919 | Returns the result of converting the extended double-precision floating-
3920 | point value `a' to the quadruple-precision floating-point format.  The
3921 | conversion is performed according to the IEC/IEEE Standard for Binary
3922 | Floating-Point Arithmetic.
3923 *----------------------------------------------------------------------------*/
3924
3925 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3926 {
3927     flag aSign;
3928     int16 aExp;
3929     bits64 aSig, zSig0, zSig1;
3930
3931     aSig = extractFloatx80Frac( a );
3932     aExp = extractFloatx80Exp( a );
3933     aSign = extractFloatx80Sign( a );
3934     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3935         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3936     }
3937     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3938     return packFloat128( aSign, aExp, zSig0, zSig1 );
3939
3940 }
3941
3942 #endif
3943
3944 /*----------------------------------------------------------------------------
3945 | Rounds the extended double-precision floating-point value `a' to an integer,
3946 | and returns the result as an extended quadruple-precision floating-point
3947 | value.  The operation is performed according to the IEC/IEEE Standard for
3948 | Binary Floating-Point Arithmetic.
3949 *----------------------------------------------------------------------------*/
3950
3951 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3952 {
3953     flag aSign;
3954     int32 aExp;
3955     bits64 lastBitMask, roundBitsMask;
3956     int8 roundingMode;
3957     floatx80 z;
3958
3959     aExp = extractFloatx80Exp( a );
3960     if ( 0x403E <= aExp ) {
3961         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3962             return propagateFloatx80NaN( a, a STATUS_VAR );
3963         }
3964         return a;
3965     }
3966     if ( aExp < 0x3FFF ) {
3967         if (    ( aExp == 0 )
3968              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3969             return a;
3970         }
3971         STATUS(float_exception_flags) |= float_flag_inexact;
3972         aSign = extractFloatx80Sign( a );
3973         switch ( STATUS(float_rounding_mode) ) {
3974          case float_round_nearest_even:
3975             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3976                ) {
3977                 return
3978                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3979             }
3980             break;
3981          case float_round_down:
3982             return
3983                   aSign ?
3984                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3985                 : packFloatx80( 0, 0, 0 );
3986          case float_round_up:
3987             return
3988                   aSign ? packFloatx80( 1, 0, 0 )
3989                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3990         }
3991         return packFloatx80( aSign, 0, 0 );
3992     }
3993     lastBitMask = 1;
3994     lastBitMask <<= 0x403E - aExp;
3995     roundBitsMask = lastBitMask - 1;
3996     z = a;
3997     roundingMode = STATUS(float_rounding_mode);
3998     if ( roundingMode == float_round_nearest_even ) {
3999         z.low += lastBitMask>>1;
4000         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4001     }
4002     else if ( roundingMode != float_round_to_zero ) {
4003         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4004             z.low += roundBitsMask;
4005         }
4006     }
4007     z.low &= ~ roundBitsMask;
4008     if ( z.low == 0 ) {
4009         ++z.high;
4010         z.low = LIT64( 0x8000000000000000 );
4011     }
4012     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4013     return z;
4014
4015 }
4016
4017 /*----------------------------------------------------------------------------
4018 | Returns the result of adding the absolute values of the extended double-
4019 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4020 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4021 | The addition is performed according to the IEC/IEEE Standard for Binary
4022 | Floating-Point Arithmetic.
4023 *----------------------------------------------------------------------------*/
4024
4025 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4026 {
4027     int32 aExp, bExp, zExp;
4028     bits64 aSig, bSig, zSig0, zSig1;
4029     int32 expDiff;
4030
4031     aSig = extractFloatx80Frac( a );
4032     aExp = extractFloatx80Exp( a );
4033     bSig = extractFloatx80Frac( b );
4034     bExp = extractFloatx80Exp( b );
4035     expDiff = aExp - bExp;
4036     if ( 0 < expDiff ) {
4037         if ( aExp == 0x7FFF ) {
4038             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4039             return a;
4040         }
4041         if ( bExp == 0 ) --expDiff;
4042         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4043         zExp = aExp;
4044     }
4045     else if ( expDiff < 0 ) {
4046         if ( bExp == 0x7FFF ) {
4047             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4048             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4049         }
4050         if ( aExp == 0 ) ++expDiff;
4051         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4052         zExp = bExp;
4053     }
4054     else {
4055         if ( aExp == 0x7FFF ) {
4056             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4057                 return propagateFloatx80NaN( a, b STATUS_VAR );
4058             }
4059             return a;
4060         }
4061         zSig1 = 0;
4062         zSig0 = aSig + bSig;
4063         if ( aExp == 0 ) {
4064             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4065             goto roundAndPack;
4066         }
4067         zExp = aExp;
4068         goto shiftRight1;
4069     }
4070     zSig0 = aSig + bSig;
4071     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
4072  shiftRight1:
4073     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4074     zSig0 |= LIT64( 0x8000000000000000 );
4075     ++zExp;
4076  roundAndPack:
4077     return
4078         roundAndPackFloatx80(
4079             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4080
4081 }
4082
4083 /*----------------------------------------------------------------------------
4084 | Returns the result of subtracting the absolute values of the extended
4085 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4086 | difference is negated before being returned.  `zSign' is ignored if the
4087 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4088 | Standard for Binary Floating-Point Arithmetic.
4089 *----------------------------------------------------------------------------*/
4090
4091 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4092 {
4093     int32 aExp, bExp, zExp;
4094     bits64 aSig, bSig, zSig0, zSig1;
4095     int32 expDiff;
4096     floatx80 z;
4097
4098     aSig = extractFloatx80Frac( a );
4099     aExp = extractFloatx80Exp( a );
4100     bSig = extractFloatx80Frac( b );
4101     bExp = extractFloatx80Exp( b );
4102     expDiff = aExp - bExp;
4103     if ( 0 < expDiff ) goto aExpBigger;
4104     if ( expDiff < 0 ) goto bExpBigger;
4105     if ( aExp == 0x7FFF ) {
4106         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4107             return propagateFloatx80NaN( a, b STATUS_VAR );
4108         }
4109         float_raise( float_flag_invalid STATUS_VAR);
4110         z.low = floatx80_default_nan_low;
4111         z.high = floatx80_default_nan_high;
4112         return z;
4113     }
4114     if ( aExp == 0 ) {
4115         aExp = 1;
4116         bExp = 1;
4117     }
4118     zSig1 = 0;
4119     if ( bSig < aSig ) goto aBigger;
4120     if ( aSig < bSig ) goto bBigger;
4121     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4122  bExpBigger:
4123     if ( bExp == 0x7FFF ) {
4124         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4125         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4126     }
4127     if ( aExp == 0 ) ++expDiff;
4128     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4129  bBigger:
4130     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4131     zExp = bExp;
4132     zSign ^= 1;
4133     goto normalizeRoundAndPack;
4134  aExpBigger:
4135     if ( aExp == 0x7FFF ) {
4136         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4137         return a;
4138     }
4139     if ( bExp == 0 ) --expDiff;
4140     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4141  aBigger:
4142     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4143     zExp = aExp;
4144  normalizeRoundAndPack:
4145     return
4146         normalizeRoundAndPackFloatx80(
4147             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4148
4149 }
4150
4151 /*----------------------------------------------------------------------------
4152 | Returns the result of adding the extended double-precision floating-point
4153 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4154 | Standard for Binary Floating-Point Arithmetic.
4155 *----------------------------------------------------------------------------*/
4156
4157 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4158 {
4159     flag aSign, bSign;
4160
4161     aSign = extractFloatx80Sign( a );
4162     bSign = extractFloatx80Sign( b );
4163     if ( aSign == bSign ) {
4164         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4165     }
4166     else {
4167         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4168     }
4169
4170 }
4171
4172 /*----------------------------------------------------------------------------
4173 | Returns the result of subtracting the extended double-precision floating-
4174 | point values `a' and `b'.  The operation is performed according to the
4175 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4176 *----------------------------------------------------------------------------*/
4177
4178 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4179 {
4180     flag aSign, bSign;
4181
4182     aSign = extractFloatx80Sign( a );
4183     bSign = extractFloatx80Sign( b );
4184     if ( aSign == bSign ) {
4185         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4186     }
4187     else {
4188         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4189     }
4190
4191 }
4192
4193 /*----------------------------------------------------------------------------
4194 | Returns the result of multiplying the extended double-precision floating-
4195 | point values `a' and `b'.  The operation is performed according to the
4196 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4197 *----------------------------------------------------------------------------*/
4198
4199 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4200 {
4201     flag aSign, bSign, zSign;
4202     int32 aExp, bExp, zExp;
4203     bits64 aSig, bSig, zSig0, zSig1;
4204     floatx80 z;
4205
4206     aSig = extractFloatx80Frac( a );
4207     aExp = extractFloatx80Exp( a );
4208     aSign = extractFloatx80Sign( a );
4209     bSig = extractFloatx80Frac( b );
4210     bExp = extractFloatx80Exp( b );
4211     bSign = extractFloatx80Sign( b );
4212     zSign = aSign ^ bSign;
4213     if ( aExp == 0x7FFF ) {
4214         if (    (bits64) ( aSig<<1 )
4215              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4216             return propagateFloatx80NaN( a, b STATUS_VAR );
4217         }
4218         if ( ( bExp | bSig ) == 0 ) goto invalid;
4219         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4220     }
4221     if ( bExp == 0x7FFF ) {
4222         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4223         if ( ( aExp | aSig ) == 0 ) {
4224  invalid:
4225             float_raise( float_flag_invalid STATUS_VAR);
4226             z.low = floatx80_default_nan_low;
4227             z.high = floatx80_default_nan_high;
4228             return z;
4229         }
4230         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4231     }
4232     if ( aExp == 0 ) {
4233         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4234         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4235     }
4236     if ( bExp == 0 ) {
4237         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4238         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4239     }
4240     zExp = aExp + bExp - 0x3FFE;
4241     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4242     if ( 0 < (sbits64) zSig0 ) {
4243         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4244         --zExp;
4245     }
4246     return
4247         roundAndPackFloatx80(
4248             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4249
4250 }
4251
4252 /*----------------------------------------------------------------------------
4253 | Returns the result of dividing the extended double-precision floating-point
4254 | value `a' by the corresponding value `b'.  The operation is performed
4255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4256 *----------------------------------------------------------------------------*/
4257
4258 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4259 {
4260     flag aSign, bSign, zSign;
4261     int32 aExp, bExp, zExp;
4262     bits64 aSig, bSig, zSig0, zSig1;
4263     bits64 rem0, rem1, rem2, term0, term1, term2;
4264     floatx80 z;
4265
4266     aSig = extractFloatx80Frac( a );
4267     aExp = extractFloatx80Exp( a );
4268     aSign = extractFloatx80Sign( a );
4269     bSig = extractFloatx80Frac( b );
4270     bExp = extractFloatx80Exp( b );
4271     bSign = extractFloatx80Sign( b );
4272     zSign = aSign ^ bSign;
4273     if ( aExp == 0x7FFF ) {
4274         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4275         if ( bExp == 0x7FFF ) {
4276             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4277             goto invalid;
4278         }
4279         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4280     }
4281     if ( bExp == 0x7FFF ) {
4282         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4283         return packFloatx80( zSign, 0, 0 );
4284     }
4285     if ( bExp == 0 ) {
4286         if ( bSig == 0 ) {
4287             if ( ( aExp | aSig ) == 0 ) {
4288  invalid:
4289                 float_raise( float_flag_invalid STATUS_VAR);
4290                 z.low = floatx80_default_nan_low;
4291                 z.high = floatx80_default_nan_high;
4292                 return z;
4293             }
4294             float_raise( float_flag_divbyzero STATUS_VAR);
4295             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4296         }
4297         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4298     }
4299     if ( aExp == 0 ) {
4300         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4301         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4302     }
4303     zExp = aExp - bExp + 0x3FFE;
4304     rem1 = 0;
4305     if ( bSig <= aSig ) {
4306         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4307         ++zExp;
4308     }
4309     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4310     mul64To128( bSig, zSig0, &term0, &term1 );
4311     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4312     while ( (sbits64) rem0 < 0 ) {
4313         --zSig0;
4314         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4315     }
4316     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4317     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4318         mul64To128( bSig, zSig1, &term1, &term2 );
4319         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4320         while ( (sbits64) rem1 < 0 ) {
4321             --zSig1;
4322             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4323         }
4324         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4325     }
4326     return
4327         roundAndPackFloatx80(
4328             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4329
4330 }
4331
4332 /*----------------------------------------------------------------------------
4333 | Returns the remainder of the extended double-precision floating-point value
4334 | `a' with respect to the corresponding value `b'.  The operation is performed
4335 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4336 *----------------------------------------------------------------------------*/
4337
4338 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4339 {
4340     flag aSign, zSign;
4341     int32 aExp, bExp, expDiff;
4342     bits64 aSig0, aSig1, bSig;
4343     bits64 q, term0, term1, alternateASig0, alternateASig1;
4344     floatx80 z;
4345
4346     aSig0 = extractFloatx80Frac( a );
4347     aExp = extractFloatx80Exp( a );
4348     aSign = extractFloatx80Sign( a );
4349     bSig = extractFloatx80Frac( b );
4350     bExp = extractFloatx80Exp( b );
4351     if ( aExp == 0x7FFF ) {
4352         if (    (bits64) ( aSig0<<1 )
4353              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4354             return propagateFloatx80NaN( a, b STATUS_VAR );
4355         }
4356         goto invalid;
4357     }
4358     if ( bExp == 0x7FFF ) {
4359         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4360         return a;
4361     }
4362     if ( bExp == 0 ) {
4363         if ( bSig == 0 ) {
4364  invalid:
4365             float_raise( float_flag_invalid STATUS_VAR);
4366             z.low = floatx80_default_nan_low;
4367             z.high = floatx80_default_nan_high;
4368             return z;
4369         }
4370         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4371     }
4372     if ( aExp == 0 ) {
4373         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4374         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4375     }
4376     bSig |= LIT64( 0x8000000000000000 );
4377     zSign = aSign;
4378     expDiff = aExp - bExp;
4379     aSig1 = 0;
4380     if ( expDiff < 0 ) {
4381         if ( expDiff < -1 ) return a;
4382         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4383         expDiff = 0;
4384     }
4385     q = ( bSig <= aSig0 );
4386     if ( q ) aSig0 -= bSig;
4387     expDiff -= 64;
4388     while ( 0 < expDiff ) {
4389         q = estimateDiv128To64( aSig0, aSig1, bSig );
4390         q = ( 2 < q ) ? q - 2 : 0;
4391         mul64To128( bSig, q, &term0, &term1 );
4392         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4393         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4394         expDiff -= 62;
4395     }
4396     expDiff += 64;
4397     if ( 0 < expDiff ) {
4398         q = estimateDiv128To64( aSig0, aSig1, bSig );
4399         q = ( 2 < q ) ? q - 2 : 0;
4400         q >>= 64 - expDiff;
4401         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4402         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4403         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4404         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4405             ++q;
4406             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4407         }
4408     }
4409     else {
4410         term1 = 0;
4411         term0 = bSig;
4412     }
4413     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4414     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4415          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4416               && ( q & 1 ) )
4417        ) {
4418         aSig0 = alternateASig0;
4419         aSig1 = alternateASig1;
4420         zSign = ! zSign;
4421     }
4422     return
4423         normalizeRoundAndPackFloatx80(
4424             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4425
4426 }
4427
4428 /*----------------------------------------------------------------------------
4429 | Returns the square root of the extended double-precision floating-point
4430 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4431 | for Binary Floating-Point Arithmetic.
4432 *----------------------------------------------------------------------------*/
4433
4434 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4435 {
4436     flag aSign;
4437     int32 aExp, zExp;
4438     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4439     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4440     floatx80 z;
4441
4442     aSig0 = extractFloatx80Frac( a );
4443     aExp = extractFloatx80Exp( a );
4444     aSign = extractFloatx80Sign( a );
4445     if ( aExp == 0x7FFF ) {
4446         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4447         if ( ! aSign ) return a;
4448         goto invalid;
4449     }
4450     if ( aSign ) {
4451         if ( ( aExp | aSig0 ) == 0 ) return a;
4452  invalid:
4453         float_raise( float_flag_invalid STATUS_VAR);
4454         z.low = floatx80_default_nan_low;
4455         z.high = floatx80_default_nan_high;
4456         return z;
4457     }
4458     if ( aExp == 0 ) {
4459         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4460         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4461     }
4462     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4463     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4464     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4465     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4466     doubleZSig0 = zSig0<<1;
4467     mul64To128( zSig0, zSig0, &term0, &term1 );
4468     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4469     while ( (sbits64) rem0 < 0 ) {
4470         --zSig0;
4471         doubleZSig0 -= 2;
4472         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4473     }
4474     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4475     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4476         if ( zSig1 == 0 ) zSig1 = 1;
4477         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4478         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4479         mul64To128( zSig1, zSig1, &term2, &term3 );
4480         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4481         while ( (sbits64) rem1 < 0 ) {
4482             --zSig1;
4483             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4484             term3 |= 1;
4485             term2 |= doubleZSig0;
4486             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4487         }
4488         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4489     }
4490     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4491     zSig0 |= doubleZSig0;
4492     return
4493         roundAndPackFloatx80(
4494             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4495
4496 }
4497
4498 /*----------------------------------------------------------------------------
4499 | Returns 1 if the extended double-precision floating-point value `a' is
4500 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
4501 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4502 | Arithmetic.
4503 *----------------------------------------------------------------------------*/
4504
4505 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4506 {
4507
4508     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4509               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4510          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4511               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4512        ) {
4513         if (    floatx80_is_signaling_nan( a )
4514              || floatx80_is_signaling_nan( b ) ) {
4515             float_raise( float_flag_invalid STATUS_VAR);
4516         }
4517         return 0;
4518     }
4519     return
4520            ( a.low == b.low )
4521         && (    ( a.high == b.high )
4522              || (    ( a.low == 0 )
4523                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4524            );
4525
4526 }
4527
4528 /*----------------------------------------------------------------------------
4529 | Returns 1 if the extended double-precision floating-point value `a' is
4530 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4531 | comparison is performed according to the IEC/IEEE Standard for Binary
4532 | Floating-Point Arithmetic.
4533 *----------------------------------------------------------------------------*/
4534
4535 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4536 {
4537     flag aSign, bSign;
4538
4539     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4540               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4541          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4542               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4543        ) {
4544         float_raise( float_flag_invalid STATUS_VAR);
4545         return 0;
4546     }
4547     aSign = extractFloatx80Sign( a );
4548     bSign = extractFloatx80Sign( b );
4549     if ( aSign != bSign ) {
4550         return
4551                aSign
4552             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4553                  == 0 );
4554     }
4555     return
4556           aSign ? le128( b.high, b.low, a.high, a.low )
4557         : le128( a.high, a.low, b.high, b.low );
4558
4559 }
4560
4561 /*----------------------------------------------------------------------------
4562 | Returns 1 if the extended double-precision floating-point value `a' is
4563 | less than the corresponding value `b', and 0 otherwise.  The comparison
4564 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4565 | Arithmetic.
4566 *----------------------------------------------------------------------------*/
4567
4568 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4569 {
4570     flag aSign, bSign;
4571
4572     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4573               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4574          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4575               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4576        ) {
4577         float_raise( float_flag_invalid STATUS_VAR);
4578         return 0;
4579     }
4580     aSign = extractFloatx80Sign( a );
4581     bSign = extractFloatx80Sign( b );
4582     if ( aSign != bSign ) {
4583         return
4584                aSign
4585             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4586                  != 0 );
4587     }
4588     return
4589           aSign ? lt128( b.high, b.low, a.high, a.low )
4590         : lt128( a.high, a.low, b.high, b.low );
4591
4592 }
4593
4594 /*----------------------------------------------------------------------------
4595 | Returns 1 if the extended double-precision floating-point value `a' is equal
4596 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4597 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4598 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4599 *----------------------------------------------------------------------------*/
4600
4601 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4602 {
4603
4604     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4605               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4606          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4607               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4608        ) {
4609         float_raise( float_flag_invalid STATUS_VAR);
4610         return 0;
4611     }
4612     return
4613            ( a.low == b.low )
4614         && (    ( a.high == b.high )
4615              || (    ( a.low == 0 )
4616                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4617            );
4618
4619 }
4620
4621 /*----------------------------------------------------------------------------
4622 | Returns 1 if the extended double-precision floating-point value `a' is less
4623 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4624 | do not cause an exception.  Otherwise, the comparison is performed according
4625 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4626 *----------------------------------------------------------------------------*/
4627
4628 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4629 {
4630     flag aSign, bSign;
4631
4632     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4633               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4634          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4635               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4636        ) {
4637         if (    floatx80_is_signaling_nan( a )
4638              || floatx80_is_signaling_nan( b ) ) {
4639             float_raise( float_flag_invalid STATUS_VAR);
4640         }
4641         return 0;
4642     }
4643     aSign = extractFloatx80Sign( a );
4644     bSign = extractFloatx80Sign( b );
4645     if ( aSign != bSign ) {
4646         return
4647                aSign
4648             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4649                  == 0 );
4650     }
4651     return
4652           aSign ? le128( b.high, b.low, a.high, a.low )
4653         : le128( a.high, a.low, b.high, b.low );
4654
4655 }
4656
4657 /*----------------------------------------------------------------------------
4658 | Returns 1 if the extended double-precision floating-point value `a' is less
4659 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4660 | an exception.  Otherwise, the comparison is performed according to the
4661 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4662 *----------------------------------------------------------------------------*/
4663
4664 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4665 {
4666     flag aSign, bSign;
4667
4668     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4669               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4670          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4671               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4672        ) {
4673         if (    floatx80_is_signaling_nan( a )
4674              || floatx80_is_signaling_nan( b ) ) {
4675             float_raise( float_flag_invalid STATUS_VAR);
4676         }
4677         return 0;
4678     }
4679     aSign = extractFloatx80Sign( a );
4680     bSign = extractFloatx80Sign( b );
4681     if ( aSign != bSign ) {
4682         return
4683                aSign
4684             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4685                  != 0 );
4686     }
4687     return
4688           aSign ? lt128( b.high, b.low, a.high, a.low )
4689         : lt128( a.high, a.low, b.high, b.low );
4690
4691 }
4692
4693 #endif
4694
4695 #ifdef FLOAT128
4696
4697 /*----------------------------------------------------------------------------
4698 | Returns the result of converting the quadruple-precision floating-point
4699 | value `a' to the 32-bit two's complement integer format.  The conversion
4700 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4701 | Arithmetic---which means in particular that the conversion is rounded
4702 | according to the current rounding mode.  If `a' is a NaN, the largest
4703 | positive integer is returned.  Otherwise, if the conversion overflows, the
4704 | largest integer with the same sign as `a' is returned.
4705 *----------------------------------------------------------------------------*/
4706
4707 int32 float128_to_int32( float128 a STATUS_PARAM )
4708 {
4709     flag aSign;
4710     int32 aExp, shiftCount;
4711     bits64 aSig0, aSig1;
4712
4713     aSig1 = extractFloat128Frac1( a );
4714     aSig0 = extractFloat128Frac0( a );
4715     aExp = extractFloat128Exp( a );
4716     aSign = extractFloat128Sign( a );
4717     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4718     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4719     aSig0 |= ( aSig1 != 0 );
4720     shiftCount = 0x4028 - aExp;
4721     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4722     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4723
4724 }
4725
4726 /*----------------------------------------------------------------------------
4727 | Returns the result of converting the quadruple-precision floating-point
4728 | value `a' to the 32-bit two's complement integer format.  The conversion
4729 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4730 | Arithmetic, except that the conversion is always rounded toward zero.  If
4731 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4732 | conversion overflows, the largest integer with the same sign as `a' is
4733 | returned.
4734 *----------------------------------------------------------------------------*/
4735
4736 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4737 {
4738     flag aSign;
4739     int32 aExp, shiftCount;
4740     bits64 aSig0, aSig1, savedASig;
4741     int32 z;
4742
4743     aSig1 = extractFloat128Frac1( a );
4744     aSig0 = extractFloat128Frac0( a );
4745     aExp = extractFloat128Exp( a );
4746     aSign = extractFloat128Sign( a );
4747     aSig0 |= ( aSig1 != 0 );
4748     if ( 0x401E < aExp ) {
4749         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4750         goto invalid;
4751     }
4752     else if ( aExp < 0x3FFF ) {
4753         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4754         return 0;
4755     }
4756     aSig0 |= LIT64( 0x0001000000000000 );
4757     shiftCount = 0x402F - aExp;
4758     savedASig = aSig0;
4759     aSig0 >>= shiftCount;
4760     z = aSig0;
4761     if ( aSign ) z = - z;
4762     if ( ( z < 0 ) ^ aSign ) {
4763  invalid:
4764         float_raise( float_flag_invalid STATUS_VAR);
4765         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4766     }
4767     if ( ( aSig0<<shiftCount ) != savedASig ) {
4768         STATUS(float_exception_flags) |= float_flag_inexact;
4769     }
4770     return z;
4771
4772 }
4773
4774 /*----------------------------------------------------------------------------
4775 | Returns the result of converting the quadruple-precision floating-point
4776 | value `a' to the 64-bit two's complement integer format.  The conversion
4777 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4778 | Arithmetic---which means in particular that the conversion is rounded
4779 | according to the current rounding mode.  If `a' is a NaN, the largest
4780 | positive integer is returned.  Otherwise, if the conversion overflows, the
4781 | largest integer with the same sign as `a' is returned.
4782 *----------------------------------------------------------------------------*/
4783
4784 int64 float128_to_int64( float128 a STATUS_PARAM )
4785 {
4786     flag aSign;
4787     int32 aExp, shiftCount;
4788     bits64 aSig0, aSig1;
4789
4790     aSig1 = extractFloat128Frac1( a );
4791     aSig0 = extractFloat128Frac0( a );
4792     aExp = extractFloat128Exp( a );
4793     aSign = extractFloat128Sign( a );
4794     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4795     shiftCount = 0x402F - aExp;
4796     if ( shiftCount <= 0 ) {
4797         if ( 0x403E < aExp ) {
4798             float_raise( float_flag_invalid STATUS_VAR);
4799             if (    ! aSign
4800                  || (    ( aExp == 0x7FFF )
4801                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4802                     )
4803                ) {
4804                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4805             }
4806             return (sbits64) LIT64( 0x8000000000000000 );
4807         }
4808         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4809     }
4810     else {
4811         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4812     }
4813     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4814
4815 }
4816
4817 /*----------------------------------------------------------------------------
4818 | Returns the result of converting the quadruple-precision floating-point
4819 | value `a' to the 64-bit two's complement integer format.  The conversion
4820 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4821 | Arithmetic, except that the conversion is always rounded toward zero.
4822 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4823 | the conversion overflows, the largest integer with the same sign as `a' is
4824 | returned.
4825 *----------------------------------------------------------------------------*/
4826
4827 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4828 {
4829     flag aSign;
4830     int32 aExp, shiftCount;
4831     bits64 aSig0, aSig1;
4832     int64 z;
4833
4834     aSig1 = extractFloat128Frac1( a );
4835     aSig0 = extractFloat128Frac0( a );
4836     aExp = extractFloat128Exp( a );
4837     aSign = extractFloat128Sign( a );
4838     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4839     shiftCount = aExp - 0x402F;
4840     if ( 0 < shiftCount ) {
4841         if ( 0x403E <= aExp ) {
4842             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4843             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4844                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4845                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4846             }
4847             else {
4848                 float_raise( float_flag_invalid STATUS_VAR);
4849                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4850                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4851                 }
4852             }
4853             return (sbits64) LIT64( 0x8000000000000000 );
4854         }
4855         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4856         if ( (bits64) ( aSig1<<shiftCount ) ) {
4857             STATUS(float_exception_flags) |= float_flag_inexact;
4858         }
4859     }
4860     else {
4861         if ( aExp < 0x3FFF ) {
4862             if ( aExp | aSig0 | aSig1 ) {
4863                 STATUS(float_exception_flags) |= float_flag_inexact;
4864             }
4865             return 0;
4866         }
4867         z = aSig0>>( - shiftCount );
4868         if (    aSig1
4869              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4870             STATUS(float_exception_flags) |= float_flag_inexact;
4871         }
4872     }
4873     if ( aSign ) z = - z;
4874     return z;
4875
4876 }
4877
4878 /*----------------------------------------------------------------------------
4879 | Returns the result of converting the quadruple-precision floating-point
4880 | value `a' to the single-precision floating-point format.  The conversion
4881 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4882 | Arithmetic.
4883 *----------------------------------------------------------------------------*/
4884
4885 float32 float128_to_float32( float128 a STATUS_PARAM )
4886 {
4887     flag aSign;
4888     int32 aExp;
4889     bits64 aSig0, aSig1;
4890     bits32 zSig;
4891
4892     aSig1 = extractFloat128Frac1( a );
4893     aSig0 = extractFloat128Frac0( a );
4894     aExp = extractFloat128Exp( a );
4895     aSign = extractFloat128Sign( a );
4896     if ( aExp == 0x7FFF ) {
4897         if ( aSig0 | aSig1 ) {
4898             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4899         }
4900         return packFloat32( aSign, 0xFF, 0 );
4901     }
4902     aSig0 |= ( aSig1 != 0 );
4903     shift64RightJamming( aSig0, 18, &aSig0 );
4904     zSig = aSig0;
4905     if ( aExp || zSig ) {
4906         zSig |= 0x40000000;
4907         aExp -= 0x3F81;
4908     }
4909     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4910
4911 }
4912
4913 /*----------------------------------------------------------------------------
4914 | Returns the result of converting the quadruple-precision floating-point
4915 | value `a' to the double-precision floating-point format.  The conversion
4916 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4917 | Arithmetic.
4918 *----------------------------------------------------------------------------*/
4919
4920 float64 float128_to_float64( float128 a STATUS_PARAM )
4921 {
4922     flag aSign;
4923     int32 aExp;
4924     bits64 aSig0, aSig1;
4925
4926     aSig1 = extractFloat128Frac1( a );
4927     aSig0 = extractFloat128Frac0( a );
4928     aExp = extractFloat128Exp( a );
4929     aSign = extractFloat128Sign( a );
4930     if ( aExp == 0x7FFF ) {
4931         if ( aSig0 | aSig1 ) {
4932             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4933         }
4934         return packFloat64( aSign, 0x7FF, 0 );
4935     }
4936     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4937     aSig0 |= ( aSig1 != 0 );
4938     if ( aExp || aSig0 ) {
4939         aSig0 |= LIT64( 0x4000000000000000 );
4940         aExp -= 0x3C01;
4941     }
4942     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4943
4944 }
4945
4946 #ifdef FLOATX80
4947
4948 /*----------------------------------------------------------------------------
4949 | Returns the result of converting the quadruple-precision floating-point
4950 | value `a' to the extended double-precision floating-point format.  The
4951 | conversion is performed according to the IEC/IEEE Standard for Binary
4952 | Floating-Point Arithmetic.
4953 *----------------------------------------------------------------------------*/
4954
4955 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4956 {
4957     flag aSign;
4958     int32 aExp;
4959     bits64 aSig0, aSig1;
4960
4961     aSig1 = extractFloat128Frac1( a );
4962     aSig0 = extractFloat128Frac0( a );
4963     aExp = extractFloat128Exp( a );
4964     aSign = extractFloat128Sign( a );
4965     if ( aExp == 0x7FFF ) {
4966         if ( aSig0 | aSig1 ) {
4967             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4968         }
4969         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4970     }
4971     if ( aExp == 0 ) {
4972         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4973         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4974     }
4975     else {
4976         aSig0 |= LIT64( 0x0001000000000000 );
4977     }
4978     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4979     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4980
4981 }
4982
4983 #endif
4984
4985 /*----------------------------------------------------------------------------
4986 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4987 | returns the result as a quadruple-precision floating-point value.  The
4988 | operation is performed according to the IEC/IEEE Standard for Binary
4989 | Floating-Point Arithmetic.
4990 *----------------------------------------------------------------------------*/
4991
4992 float128 float128_round_to_int( float128 a STATUS_PARAM )
4993 {
4994     flag aSign;
4995     int32 aExp;
4996     bits64 lastBitMask, roundBitsMask;
4997     int8 roundingMode;
4998     float128 z;
4999
5000     aExp = extractFloat128Exp( a );
5001     if ( 0x402F <= aExp ) {
5002         if ( 0x406F <= aExp ) {
5003             if (    ( aExp == 0x7FFF )
5004                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5005                ) {
5006                 return propagateFloat128NaN( a, a STATUS_VAR );
5007             }
5008             return a;
5009         }
5010         lastBitMask = 1;
5011         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5012         roundBitsMask = lastBitMask - 1;
5013         z = a;
5014         roundingMode = STATUS(float_rounding_mode);
5015         if ( roundingMode == float_round_nearest_even ) {
5016             if ( lastBitMask ) {
5017                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5018                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5019             }
5020             else {
5021                 if ( (sbits64) z.low < 0 ) {
5022                     ++z.high;
5023                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
5024                 }
5025             }
5026         }
5027         else if ( roundingMode != float_round_to_zero ) {
5028             if (   extractFloat128Sign( z )
5029                  ^ ( roundingMode == float_round_up ) ) {
5030                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5031             }
5032         }
5033         z.low &= ~ roundBitsMask;
5034     }
5035     else {
5036         if ( aExp < 0x3FFF ) {
5037             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5038             STATUS(float_exception_flags) |= float_flag_inexact;
5039             aSign = extractFloat128Sign( a );
5040             switch ( STATUS(float_rounding_mode) ) {
5041              case float_round_nearest_even:
5042                 if (    ( aExp == 0x3FFE )
5043                      && (   extractFloat128Frac0( a )
5044                           | extractFloat128Frac1( a ) )
5045                    ) {
5046                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5047                 }
5048                 break;
5049              case float_round_down:
5050                 return
5051                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5052                     : packFloat128( 0, 0, 0, 0 );
5053              case float_round_up:
5054                 return
5055                       aSign ? packFloat128( 1, 0, 0, 0 )
5056                     : packFloat128( 0, 0x3FFF, 0, 0 );
5057             }
5058             return packFloat128( aSign, 0, 0, 0 );
5059         }
5060         lastBitMask = 1;
5061         lastBitMask <<= 0x402F - aExp;
5062         roundBitsMask = lastBitMask - 1;
5063         z.low = 0;
5064         z.high = a.high;
5065         roundingMode = STATUS(float_rounding_mode);
5066         if ( roundingMode == float_round_nearest_even ) {
5067             z.high += lastBitMask>>1;
5068             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5069                 z.high &= ~ lastBitMask;
5070             }
5071         }
5072         else if ( roundingMode != float_round_to_zero ) {
5073             if (   extractFloat128Sign( z )
5074                  ^ ( roundingMode == float_round_up ) ) {
5075                 z.high |= ( a.low != 0 );
5076                 z.high += roundBitsMask;
5077             }
5078         }
5079         z.high &= ~ roundBitsMask;
5080     }
5081     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5082         STATUS(float_exception_flags) |= float_flag_inexact;
5083     }
5084     return z;
5085
5086 }
5087
5088 /*----------------------------------------------------------------------------
5089 | Returns the result of adding the absolute values of the quadruple-precision
5090 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5091 | before being returned.  `zSign' is ignored if the result is a NaN.
5092 | The addition is performed according to the IEC/IEEE Standard for Binary
5093 | Floating-Point Arithmetic.
5094 *----------------------------------------------------------------------------*/
5095
5096 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5097 {
5098     int32 aExp, bExp, zExp;
5099     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5100     int32 expDiff;
5101
5102     aSig1 = extractFloat128Frac1( a );
5103     aSig0 = extractFloat128Frac0( a );
5104     aExp = extractFloat128Exp( a );
5105     bSig1 = extractFloat128Frac1( b );
5106     bSig0 = extractFloat128Frac0( b );
5107     bExp = extractFloat128Exp( b );
5108     expDiff = aExp - bExp;
5109     if ( 0 < expDiff ) {
5110         if ( aExp == 0x7FFF ) {
5111             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5112             return a;
5113         }
5114         if ( bExp == 0 ) {
5115             --expDiff;
5116         }
5117         else {
5118             bSig0 |= LIT64( 0x0001000000000000 );
5119         }
5120         shift128ExtraRightJamming(
5121             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5122         zExp = aExp;
5123     }
5124     else if ( expDiff < 0 ) {
5125         if ( bExp == 0x7FFF ) {
5126             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5127             return packFloat128( zSign, 0x7FFF, 0, 0 );
5128         }
5129         if ( aExp == 0 ) {
5130             ++expDiff;
5131         }
5132         else {
5133             aSig0 |= LIT64( 0x0001000000000000 );
5134         }
5135         shift128ExtraRightJamming(
5136             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5137         zExp = bExp;
5138     }
5139     else {
5140         if ( aExp == 0x7FFF ) {
5141             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5142                 return propagateFloat128NaN( a, b STATUS_VAR );
5143             }
5144             return a;
5145         }
5146         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5147         if ( aExp == 0 ) {
5148             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5149             return packFloat128( zSign, 0, zSig0, zSig1 );
5150         }
5151         zSig2 = 0;
5152         zSig0 |= LIT64( 0x0002000000000000 );
5153         zExp = aExp;
5154         goto shiftRight1;
5155     }
5156     aSig0 |= LIT64( 0x0001000000000000 );
5157     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5158     --zExp;
5159     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5160     ++zExp;
5161  shiftRight1:
5162     shift128ExtraRightJamming(
5163         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5164  roundAndPack:
5165     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5166
5167 }
5168
5169 /*----------------------------------------------------------------------------
5170 | Returns the result of subtracting the absolute values of the quadruple-
5171 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5172 | difference is negated before being returned.  `zSign' is ignored if the
5173 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5174 | Standard for Binary Floating-Point Arithmetic.
5175 *----------------------------------------------------------------------------*/
5176
5177 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5178 {
5179     int32 aExp, bExp, zExp;
5180     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5181     int32 expDiff;
5182     float128 z;
5183
5184     aSig1 = extractFloat128Frac1( a );
5185     aSig0 = extractFloat128Frac0( a );
5186     aExp = extractFloat128Exp( a );
5187     bSig1 = extractFloat128Frac1( b );
5188     bSig0 = extractFloat128Frac0( b );
5189     bExp = extractFloat128Exp( b );
5190     expDiff = aExp - bExp;
5191     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5192     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5193     if ( 0 < expDiff ) goto aExpBigger;
5194     if ( expDiff < 0 ) goto bExpBigger;
5195     if ( aExp == 0x7FFF ) {
5196         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5197             return propagateFloat128NaN( a, b STATUS_VAR );
5198         }
5199         float_raise( float_flag_invalid STATUS_VAR);
5200         z.low = float128_default_nan_low;
5201         z.high = float128_default_nan_high;
5202         return z;
5203     }
5204     if ( aExp == 0 ) {
5205         aExp = 1;
5206         bExp = 1;
5207     }
5208     if ( bSig0 < aSig0 ) goto aBigger;
5209     if ( aSig0 < bSig0 ) goto bBigger;
5210     if ( bSig1 < aSig1 ) goto aBigger;
5211     if ( aSig1 < bSig1 ) goto bBigger;
5212     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5213  bExpBigger:
5214     if ( bExp == 0x7FFF ) {
5215         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5216         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5217     }
5218     if ( aExp == 0 ) {
5219         ++expDiff;
5220     }
5221     else {
5222         aSig0 |= LIT64( 0x4000000000000000 );
5223     }
5224     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5225     bSig0 |= LIT64( 0x4000000000000000 );
5226  bBigger:
5227     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5228     zExp = bExp;
5229     zSign ^= 1;
5230     goto normalizeRoundAndPack;
5231  aExpBigger:
5232     if ( aExp == 0x7FFF ) {
5233         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5234         return a;
5235     }
5236     if ( bExp == 0 ) {
5237         --expDiff;
5238     }
5239     else {
5240         bSig0 |= LIT64( 0x4000000000000000 );
5241     }
5242     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5243     aSig0 |= LIT64( 0x4000000000000000 );
5244  aBigger:
5245     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5246     zExp = aExp;
5247  normalizeRoundAndPack:
5248     --zExp;
5249     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5250
5251 }
5252
5253 /*----------------------------------------------------------------------------
5254 | Returns the result of adding the quadruple-precision floating-point values
5255 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5256 | for Binary Floating-Point Arithmetic.
5257 *----------------------------------------------------------------------------*/
5258
5259 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5260 {
5261     flag aSign, bSign;
5262
5263     aSign = extractFloat128Sign( a );
5264     bSign = extractFloat128Sign( b );
5265     if ( aSign == bSign ) {
5266         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5267     }
5268     else {
5269         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5270     }
5271
5272 }
5273
5274 /*----------------------------------------------------------------------------
5275 | Returns the result of subtracting the quadruple-precision floating-point
5276 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5277 | Standard for Binary Floating-Point Arithmetic.
5278 *----------------------------------------------------------------------------*/
5279
5280 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5281 {
5282     flag aSign, bSign;
5283
5284     aSign = extractFloat128Sign( a );
5285     bSign = extractFloat128Sign( b );
5286     if ( aSign == bSign ) {
5287         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5288     }
5289     else {
5290         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5291     }
5292
5293 }
5294
5295 /*----------------------------------------------------------------------------
5296 | Returns the result of multiplying the quadruple-precision floating-point
5297 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5298 | Standard for Binary Floating-Point Arithmetic.
5299 *----------------------------------------------------------------------------*/
5300
5301 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5302 {
5303     flag aSign, bSign, zSign;
5304     int32 aExp, bExp, zExp;
5305     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5306     float128 z;
5307
5308     aSig1 = extractFloat128Frac1( a );
5309     aSig0 = extractFloat128Frac0( a );
5310     aExp = extractFloat128Exp( a );
5311     aSign = extractFloat128Sign( a );
5312     bSig1 = extractFloat128Frac1( b );
5313     bSig0 = extractFloat128Frac0( b );
5314     bExp = extractFloat128Exp( b );
5315     bSign = extractFloat128Sign( b );
5316     zSign = aSign ^ bSign;
5317     if ( aExp == 0x7FFF ) {
5318         if (    ( aSig0 | aSig1 )
5319              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5320             return propagateFloat128NaN( a, b STATUS_VAR );
5321         }
5322         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5323         return packFloat128( zSign, 0x7FFF, 0, 0 );
5324     }
5325     if ( bExp == 0x7FFF ) {
5326         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5327         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5328  invalid:
5329             float_raise( float_flag_invalid STATUS_VAR);
5330             z.low = float128_default_nan_low;
5331             z.high = float128_default_nan_high;
5332             return z;
5333         }
5334         return packFloat128( zSign, 0x7FFF, 0, 0 );
5335     }
5336     if ( aExp == 0 ) {
5337         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5338         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5339     }
5340     if ( bExp == 0 ) {
5341         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5342         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5343     }
5344     zExp = aExp + bExp - 0x4000;
5345     aSig0 |= LIT64( 0x0001000000000000 );
5346     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5347     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5348     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5349     zSig2 |= ( zSig3 != 0 );
5350     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5351         shift128ExtraRightJamming(
5352             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5353         ++zExp;
5354     }
5355     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5356
5357 }
5358
5359 /*----------------------------------------------------------------------------
5360 | Returns the result of dividing the quadruple-precision floating-point value
5361 | `a' by the corresponding value `b'.  The operation is performed according to
5362 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5363 *----------------------------------------------------------------------------*/
5364
5365 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5366 {
5367     flag aSign, bSign, zSign;
5368     int32 aExp, bExp, zExp;
5369     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5370     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5371     float128 z;
5372
5373     aSig1 = extractFloat128Frac1( a );
5374     aSig0 = extractFloat128Frac0( a );
5375     aExp = extractFloat128Exp( a );
5376     aSign = extractFloat128Sign( a );
5377     bSig1 = extractFloat128Frac1( b );
5378     bSig0 = extractFloat128Frac0( b );
5379     bExp = extractFloat128Exp( b );
5380     bSign = extractFloat128Sign( b );
5381     zSign = aSign ^ bSign;
5382     if ( aExp == 0x7FFF ) {
5383         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5384         if ( bExp == 0x7FFF ) {
5385             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5386             goto invalid;
5387         }
5388         return packFloat128( zSign, 0x7FFF, 0, 0 );
5389     }
5390     if ( bExp == 0x7FFF ) {
5391         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5392         return packFloat128( zSign, 0, 0, 0 );
5393     }
5394     if ( bExp == 0 ) {
5395         if ( ( bSig0 | bSig1 ) == 0 ) {
5396             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5397  invalid:
5398                 float_raise( float_flag_invalid STATUS_VAR);
5399                 z.low = float128_default_nan_low;
5400                 z.high = float128_default_nan_high;
5401                 return z;
5402             }
5403             float_raise( float_flag_divbyzero STATUS_VAR);
5404             return packFloat128( zSign, 0x7FFF, 0, 0 );
5405         }
5406         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5407     }
5408     if ( aExp == 0 ) {
5409         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5410         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5411     }
5412     zExp = aExp - bExp + 0x3FFD;
5413     shortShift128Left(
5414         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5415     shortShift128Left(
5416         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5417     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5418         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5419         ++zExp;
5420     }
5421     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5422     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5423     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5424     while ( (sbits64) rem0 < 0 ) {
5425         --zSig0;
5426         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5427     }
5428     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5429     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5430         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5431         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5432         while ( (sbits64) rem1 < 0 ) {
5433             --zSig1;
5434             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5435         }
5436         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5437     }
5438     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5439     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5440
5441 }
5442
5443 /*----------------------------------------------------------------------------
5444 | Returns the remainder of the quadruple-precision floating-point value `a'
5445 | with respect to the corresponding value `b'.  The operation is performed
5446 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5447 *----------------------------------------------------------------------------*/
5448
5449 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5450 {
5451     flag aSign, zSign;
5452     int32 aExp, bExp, expDiff;
5453     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5454     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5455     sbits64 sigMean0;
5456     float128 z;
5457
5458     aSig1 = extractFloat128Frac1( a );
5459     aSig0 = extractFloat128Frac0( a );
5460     aExp = extractFloat128Exp( a );
5461     aSign = extractFloat128Sign( a );
5462     bSig1 = extractFloat128Frac1( b );
5463     bSig0 = extractFloat128Frac0( b );
5464     bExp = extractFloat128Exp( b );
5465     if ( aExp == 0x7FFF ) {
5466         if (    ( aSig0 | aSig1 )
5467              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5468             return propagateFloat128NaN( a, b STATUS_VAR );
5469         }
5470         goto invalid;
5471     }
5472     if ( bExp == 0x7FFF ) {
5473         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5474         return a;
5475     }
5476     if ( bExp == 0 ) {
5477         if ( ( bSig0 | bSig1 ) == 0 ) {
5478  invalid:
5479             float_raise( float_flag_invalid STATUS_VAR);
5480             z.low = float128_default_nan_low;
5481             z.high = float128_default_nan_high;
5482             return z;
5483         }
5484         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5485     }
5486     if ( aExp == 0 ) {
5487         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5488         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5489     }
5490     expDiff = aExp - bExp;
5491     if ( expDiff < -1 ) return a;
5492     shortShift128Left(
5493         aSig0 | LIT64( 0x0001000000000000 ),
5494         aSig1,
5495         15 - ( expDiff < 0 ),
5496         &aSig0,
5497         &aSig1
5498     );
5499     shortShift128Left(
5500         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5501     q = le128( bSig0, bSig1, aSig0, aSig1 );
5502     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5503     expDiff -= 64;
5504     while ( 0 < expDiff ) {
5505         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5506         q = ( 4 < q ) ? q - 4 : 0;
5507         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5508         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5509         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5510         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5511         expDiff -= 61;
5512     }
5513     if ( -64 < expDiff ) {
5514         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5515         q = ( 4 < q ) ? q - 4 : 0;
5516         q >>= - expDiff;
5517         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5518         expDiff += 52;
5519         if ( expDiff < 0 ) {
5520             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5521         }
5522         else {
5523             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5524         }
5525         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5526         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5527     }
5528     else {
5529         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5530         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5531     }
5532     do {
5533         alternateASig0 = aSig0;
5534         alternateASig1 = aSig1;
5535         ++q;
5536         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5537     } while ( 0 <= (sbits64) aSig0 );
5538     add128(
5539         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5540     if (    ( sigMean0 < 0 )
5541          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5542         aSig0 = alternateASig0;
5543         aSig1 = alternateASig1;
5544     }
5545     zSign = ( (sbits64) aSig0 < 0 );
5546     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5547     return
5548         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5549
5550 }
5551
5552 /*----------------------------------------------------------------------------
5553 | Returns the square root of the quadruple-precision floating-point value `a'.
5554 | The operation is performed according to the IEC/IEEE Standard for Binary
5555 | Floating-Point Arithmetic.
5556 *----------------------------------------------------------------------------*/
5557
5558 float128 float128_sqrt( float128 a STATUS_PARAM )
5559 {
5560     flag aSign;
5561     int32 aExp, zExp;
5562     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5563     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5564     float128 z;
5565
5566     aSig1 = extractFloat128Frac1( a );
5567     aSig0 = extractFloat128Frac0( a );
5568     aExp = extractFloat128Exp( a );
5569     aSign = extractFloat128Sign( a );
5570     if ( aExp == 0x7FFF ) {
5571         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5572         if ( ! aSign ) return a;
5573         goto invalid;
5574     }
5575     if ( aSign ) {
5576         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5577  invalid:
5578         float_raise( float_flag_invalid STATUS_VAR);
5579         z.low = float128_default_nan_low;
5580         z.high = float128_default_nan_high;
5581         return z;
5582     }
5583     if ( aExp == 0 ) {
5584         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5585         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5586     }
5587     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5588     aSig0 |= LIT64( 0x0001000000000000 );
5589     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5590     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5591     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5592     doubleZSig0 = zSig0<<1;
5593     mul64To128( zSig0, zSig0, &term0, &term1 );
5594     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5595     while ( (sbits64) rem0 < 0 ) {
5596         --zSig0;
5597         doubleZSig0 -= 2;
5598         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5599     }
5600     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5601     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5602         if ( zSig1 == 0 ) zSig1 = 1;
5603         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5604         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5605         mul64To128( zSig1, zSig1, &term2, &term3 );
5606         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5607         while ( (sbits64) rem1 < 0 ) {
5608             --zSig1;
5609             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5610             term3 |= 1;
5611             term2 |= doubleZSig0;
5612             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5613         }
5614         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5615     }
5616     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5617     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5618
5619 }
5620
5621 /*----------------------------------------------------------------------------
5622 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5623 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5624 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5625 *----------------------------------------------------------------------------*/
5626
5627 int float128_eq( float128 a, float128 b STATUS_PARAM )
5628 {
5629
5630     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5631               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5632          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5633               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5634        ) {
5635         if (    float128_is_signaling_nan( a )
5636              || float128_is_signaling_nan( b ) ) {
5637             float_raise( float_flag_invalid STATUS_VAR);
5638         }
5639         return 0;
5640     }
5641     return
5642            ( a.low == b.low )
5643         && (    ( a.high == b.high )
5644              || (    ( a.low == 0 )
5645                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5646            );
5647
5648 }
5649
5650 /*----------------------------------------------------------------------------
5651 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5652 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5653 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5654 | Arithmetic.
5655 *----------------------------------------------------------------------------*/
5656
5657 int float128_le( float128 a, float128 b STATUS_PARAM )
5658 {
5659     flag aSign, bSign;
5660
5661     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5662               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5663          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5664               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5665        ) {
5666         float_raise( float_flag_invalid STATUS_VAR);
5667         return 0;
5668     }
5669     aSign = extractFloat128Sign( a );
5670     bSign = extractFloat128Sign( b );
5671     if ( aSign != bSign ) {
5672         return
5673                aSign
5674             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5675                  == 0 );
5676     }
5677     return
5678           aSign ? le128( b.high, b.low, a.high, a.low )
5679         : le128( a.high, a.low, b.high, b.low );
5680
5681 }
5682
5683 /*----------------------------------------------------------------------------
5684 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5685 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5686 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5687 *----------------------------------------------------------------------------*/
5688
5689 int float128_lt( float128 a, float128 b STATUS_PARAM )
5690 {
5691     flag aSign, bSign;
5692
5693     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5694               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5695          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5696               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5697        ) {
5698         float_raise( float_flag_invalid STATUS_VAR);
5699         return 0;
5700     }
5701     aSign = extractFloat128Sign( a );
5702     bSign = extractFloat128Sign( b );
5703     if ( aSign != bSign ) {
5704         return
5705                aSign
5706             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5707                  != 0 );
5708     }
5709     return
5710           aSign ? lt128( b.high, b.low, a.high, a.low )
5711         : lt128( a.high, a.low, b.high, b.low );
5712
5713 }
5714
5715 /*----------------------------------------------------------------------------
5716 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5717 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5718 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5719 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5720 *----------------------------------------------------------------------------*/
5721
5722 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5723 {
5724
5725     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5726               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5727          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5728               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5729        ) {
5730         float_raise( float_flag_invalid STATUS_VAR);
5731         return 0;
5732     }
5733     return
5734            ( a.low == b.low )
5735         && (    ( a.high == b.high )
5736              || (    ( a.low == 0 )
5737                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5738            );
5739
5740 }
5741
5742 /*----------------------------------------------------------------------------
5743 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5744 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5745 | cause an exception.  Otherwise, the comparison is performed according to the
5746 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5747 *----------------------------------------------------------------------------*/
5748
5749 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5750 {
5751     flag aSign, bSign;
5752
5753     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5754               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5755          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5756               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5757        ) {
5758         if (    float128_is_signaling_nan( a )
5759              || float128_is_signaling_nan( b ) ) {
5760             float_raise( float_flag_invalid STATUS_VAR);
5761         }
5762         return 0;
5763     }
5764     aSign = extractFloat128Sign( a );
5765     bSign = extractFloat128Sign( b );
5766     if ( aSign != bSign ) {
5767         return
5768                aSign
5769             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5770                  == 0 );
5771     }
5772     return
5773           aSign ? le128( b.high, b.low, a.high, a.low )
5774         : le128( a.high, a.low, b.high, b.low );
5775
5776 }
5777
5778 /*----------------------------------------------------------------------------
5779 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5780 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5781 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5782 | Standard for Binary Floating-Point Arithmetic.
5783 *----------------------------------------------------------------------------*/
5784
5785 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5786 {
5787     flag aSign, bSign;
5788
5789     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5790               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5791          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5792               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5793        ) {
5794         if (    float128_is_signaling_nan( a )
5795              || float128_is_signaling_nan( b ) ) {
5796             float_raise( float_flag_invalid STATUS_VAR);
5797         }
5798         return 0;
5799     }
5800     aSign = extractFloat128Sign( a );
5801     bSign = extractFloat128Sign( b );
5802     if ( aSign != bSign ) {
5803         return
5804                aSign
5805             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5806                  != 0 );
5807     }
5808     return
5809           aSign ? lt128( b.high, b.low, a.high, a.low )
5810         : lt128( a.high, a.low, b.high, b.low );
5811
5812 }
5813
5814 #endif
5815
5816 /* misc functions */
5817 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5818 {
5819     return int64_to_float32(a STATUS_VAR);
5820 }
5821
5822 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5823 {
5824     return int64_to_float64(a STATUS_VAR);
5825 }
5826
5827 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5828 {
5829     int64_t v;
5830     unsigned int res;
5831
5832     v = float32_to_int64(a STATUS_VAR);
5833     if (v < 0) {
5834         res = 0;
5835         float_raise( float_flag_invalid STATUS_VAR);
5836     } else if (v > 0xffffffff) {
5837         res = 0xffffffff;
5838         float_raise( float_flag_invalid STATUS_VAR);
5839     } else {
5840         res = v;
5841     }
5842     return res;
5843 }
5844
5845 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5846 {
5847     int64_t v;
5848     unsigned int res;
5849
5850     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5851     if (v < 0) {
5852         res = 0;
5853         float_raise( float_flag_invalid STATUS_VAR);
5854     } else if (v > 0xffffffff) {
5855         res = 0xffffffff;
5856         float_raise( float_flag_invalid STATUS_VAR);
5857     } else {
5858         res = v;
5859     }
5860     return res;
5861 }
5862
5863 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5864 {
5865     int64_t v;
5866     unsigned int res;
5867
5868     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5869     if (v < 0) {
5870         res = 0;
5871         float_raise( float_flag_invalid STATUS_VAR);
5872     } else if (v > 0xffff) {
5873         res = 0xffff;
5874         float_raise( float_flag_invalid STATUS_VAR);
5875     } else {
5876         res = v;
5877     }
5878     return res;
5879 }
5880
5881 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5882 {
5883     int64_t v;
5884     unsigned int res;
5885
5886     v = float64_to_int64(a STATUS_VAR);
5887     if (v < 0) {
5888         res = 0;
5889         float_raise( float_flag_invalid STATUS_VAR);
5890     } else if (v > 0xffffffff) {
5891         res = 0xffffffff;
5892         float_raise( float_flag_invalid STATUS_VAR);
5893     } else {
5894         res = v;
5895     }
5896     return res;
5897 }
5898
5899 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5900 {
5901     int64_t v;
5902     unsigned int res;
5903
5904     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5905     if (v < 0) {
5906         res = 0;
5907         float_raise( float_flag_invalid STATUS_VAR);
5908     } else if (v > 0xffffffff) {
5909         res = 0xffffffff;
5910         float_raise( float_flag_invalid STATUS_VAR);
5911     } else {
5912         res = v;
5913     }
5914     return res;
5915 }
5916
5917 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5918 {
5919     int64_t v;
5920     unsigned int res;
5921
5922     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5923     if (v < 0) {
5924         res = 0;
5925         float_raise( float_flag_invalid STATUS_VAR);
5926     } else if (v > 0xffff) {
5927         res = 0xffff;
5928         float_raise( float_flag_invalid STATUS_VAR);
5929     } else {
5930         res = v;
5931     }
5932     return res;
5933 }
5934
5935 /* FIXME: This looks broken.  */
5936 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5937 {
5938     int64_t v;
5939
5940     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5941     v += float64_val(a);
5942     v = float64_to_int64(make_float64(v) STATUS_VAR);
5943
5944     return v - INT64_MIN;
5945 }
5946
5947 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5948 {
5949     int64_t v;
5950
5951     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5952     v += float64_val(a);
5953     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5954
5955     return v - INT64_MIN;
5956 }
5957
5958 #define COMPARE(s, nan_exp)                                                  \
5959 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5960                                       int is_quiet STATUS_PARAM )            \
5961 {                                                                            \
5962     flag aSign, bSign;                                                       \
5963     bits ## s av, bv;                                                        \
5964     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
5965     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
5966                                                                              \
5967     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5968          extractFloat ## s ## Frac( a ) ) ||                                 \
5969         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5970           extractFloat ## s ## Frac( b ) )) {                                \
5971         if (!is_quiet ||                                                     \
5972             float ## s ## _is_signaling_nan( a ) ||                          \
5973             float ## s ## _is_signaling_nan( b ) ) {                         \
5974             float_raise( float_flag_invalid STATUS_VAR);                     \
5975         }                                                                    \
5976         return float_relation_unordered;                                     \
5977     }                                                                        \
5978     aSign = extractFloat ## s ## Sign( a );                                  \
5979     bSign = extractFloat ## s ## Sign( b );                                  \
5980     av = float ## s ## _val(a);                                              \
5981     bv = float ## s ## _val(b);                                              \
5982     if ( aSign != bSign ) {                                                  \
5983         if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5984             /* zero case */                                                  \
5985             return float_relation_equal;                                     \
5986         } else {                                                             \
5987             return 1 - (2 * aSign);                                          \
5988         }                                                                    \
5989     } else {                                                                 \
5990         if (av == bv) {                                                      \
5991             return float_relation_equal;                                     \
5992         } else {                                                             \
5993             return 1 - 2 * (aSign ^ ( av < bv ));                            \
5994         }                                                                    \
5995     }                                                                        \
5996 }                                                                            \
5997                                                                              \
5998 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5999 {                                                                            \
6000     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6001 }                                                                            \
6002                                                                              \
6003 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6004 {                                                                            \
6005     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6006 }
6007
6008 COMPARE(32, 0xff)
6009 COMPARE(64, 0x7ff)
6010
6011 INLINE int float128_compare_internal( float128 a, float128 b,
6012                                       int is_quiet STATUS_PARAM )
6013 {
6014     flag aSign, bSign;
6015
6016     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6017           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6018         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6019           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6020         if (!is_quiet ||
6021             float128_is_signaling_nan( a ) ||
6022             float128_is_signaling_nan( b ) ) {
6023             float_raise( float_flag_invalid STATUS_VAR);
6024         }
6025         return float_relation_unordered;
6026     }
6027     aSign = extractFloat128Sign( a );
6028     bSign = extractFloat128Sign( b );
6029     if ( aSign != bSign ) {
6030         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6031             /* zero case */
6032             return float_relation_equal;
6033         } else {
6034             return 1 - (2 * aSign);
6035         }
6036     } else {
6037         if (a.low == b.low && a.high == b.high) {
6038             return float_relation_equal;
6039         } else {
6040             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6041         }
6042     }
6043 }
6044
6045 int float128_compare( float128 a, float128 b STATUS_PARAM )
6046 {
6047     return float128_compare_internal(a, b, 0 STATUS_VAR);
6048 }
6049
6050 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6051 {
6052     return float128_compare_internal(a, b, 1 STATUS_VAR);
6053 }
6054
6055 /* Multiply A by 2 raised to the power N.  */
6056 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6057 {
6058     flag aSign;
6059     int16 aExp;
6060     bits32 aSig;
6061
6062     a = float32_squash_input_denormal(a STATUS_VAR);
6063     aSig = extractFloat32Frac( a );
6064     aExp = extractFloat32Exp( a );
6065     aSign = extractFloat32Sign( a );
6066
6067     if ( aExp == 0xFF ) {
6068         return a;
6069     }
6070     if ( aExp != 0 )
6071         aSig |= 0x00800000;
6072     else if ( aSig == 0 )
6073         return a;
6074
6075     aExp += n - 1;
6076     aSig <<= 7;
6077     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6078 }
6079
6080 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6081 {
6082     flag aSign;
6083     int16 aExp;
6084     bits64 aSig;
6085
6086     a = float64_squash_input_denormal(a STATUS_VAR);
6087     aSig = extractFloat64Frac( a );
6088     aExp = extractFloat64Exp( a );
6089     aSign = extractFloat64Sign( a );
6090
6091     if ( aExp == 0x7FF ) {
6092         return a;
6093     }
6094     if ( aExp != 0 )
6095         aSig |= LIT64( 0x0010000000000000 );
6096     else if ( aSig == 0 )
6097         return a;
6098
6099     aExp += n - 1;
6100     aSig <<= 10;
6101     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6102 }
6103
6104 #ifdef FLOATX80
6105 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6106 {
6107     flag aSign;
6108     int16 aExp;
6109     bits64 aSig;
6110
6111     aSig = extractFloatx80Frac( a );
6112     aExp = extractFloatx80Exp( a );
6113     aSign = extractFloatx80Sign( a );
6114
6115     if ( aExp == 0x7FF ) {
6116         return a;
6117     }
6118     if (aExp == 0 && aSig == 0)
6119         return a;
6120
6121     aExp += n;
6122     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6123                                           aSign, aExp, aSig, 0 STATUS_VAR );
6124 }
6125 #endif
6126
6127 #ifdef FLOAT128
6128 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6129 {
6130     flag aSign;
6131     int32 aExp;
6132     bits64 aSig0, aSig1;
6133
6134     aSig1 = extractFloat128Frac1( a );
6135     aSig0 = extractFloat128Frac0( a );
6136     aExp = extractFloat128Exp( a );
6137     aSign = extractFloat128Sign( a );
6138     if ( aExp == 0x7FFF ) {
6139         return a;
6140     }
6141     if ( aExp != 0 )
6142         aSig0 |= LIT64( 0x0001000000000000 );
6143     else if ( aSig0 == 0 && aSig1 == 0 )
6144         return a;
6145
6146     aExp += n - 1;
6147     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6148                                           STATUS_VAR );
6149
6150 }
6151 #endif