fpu/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 #include "softfloat.h"
  34
  35 /*----------------------------------------------------------------------------
  36 | Primitive arithmetic functions, including multi-word arithmetic, and
  37 | division and square root approximations.  (Can be specialized to target if
  38 | desired.)
  39 *----------------------------------------------------------------------------*/
  40 #include "softfloat-macros.h"
  41
  42 /*----------------------------------------------------------------------------
  43 | Functions and definitions to determine:  (1) whether tininess for underflow
  44 | is detected before or after rounding by default, (2) what (if anything)
  45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  47 | are propagated from function inputs to output.  These details are target-
  48 | specific.
  49 *----------------------------------------------------------------------------*/
  50 #include "softfloat-specialize.h"
  51
  52 void set_float_rounding_mode(int val STATUS_PARAM)
  53 {
  54     STATUS(float_rounding_mode) = val;
  55 }
  56
  57 void set_float_exception_flags(int val STATUS_PARAM)
  58 {
  59     STATUS(float_exception_flags) = val;
  60 }
  61
  62 #ifdef FLOATX80
  63 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  64 {
  65     STATUS(floatx80_rounding_precision) = val;
  66 }
  67 #endif
  68
  69 /*----------------------------------------------------------------------------
  70 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  71 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  72 | input.  If `zSign' is 1, the input is negated before being converted to an
  73 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  74 | is simply rounded to an integer, with the inexact exception raised if the
  75 | input cannot be represented exactly as an integer.  However, if the fixed-
  76 | point input is too large, the invalid exception is raised and the largest
  77 | positive or negative integer is returned.
  78 *----------------------------------------------------------------------------*/
  79
  80 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
  81 {
  82     int8 roundingMode;
  83     flag roundNearestEven;
  84     int8 roundIncrement, roundBits;
  85     int32 z;
  86
  87     roundingMode = STATUS(float_rounding_mode);
  88     roundNearestEven = ( roundingMode == float_round_nearest_even );
  89     roundIncrement = 0x40;
  90     if ( ! roundNearestEven ) {
  91         if ( roundingMode == float_round_to_zero ) {
  92             roundIncrement = 0;
  93         }
  94         else {
  95             roundIncrement = 0x7F;
  96             if ( zSign ) {
  97                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  98             }
  99             else {
 100                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 101             }
 102         }
 103     }
 104     roundBits = absZ & 0x7F;
 105     absZ = ( absZ + roundIncrement )>>7;
 106     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 107     z = absZ;
 108     if ( zSign ) z = - z;
 109     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 110         float_raise( float_flag_invalid STATUS_VAR);
 111         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 112     }
 113     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 114     return z;
 115
 116 }
 117
 118 /*----------------------------------------------------------------------------
 119 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 120 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 121 | and returns the properly rounded 64-bit integer corresponding to the input.
 122 | If `zSign' is 1, the input is negated before being converted to an integer.
 123 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 124 | the inexact exception raised if the input cannot be represented exactly as
 125 | an integer.  However, if the fixed-point input is too large, the invalid
 126 | exception is raised and the largest positive or negative integer is
 127 | returned.
 128 *----------------------------------------------------------------------------*/
 129
 130 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 131 {
 132     int8 roundingMode;
 133     flag roundNearestEven, increment;
 134     int64 z;
 135
 136     roundingMode = STATUS(float_rounding_mode);
 137     roundNearestEven = ( roundingMode == float_round_nearest_even );
 138     increment = ( (sbits64) absZ1 < 0 );
 139     if ( ! roundNearestEven ) {
 140         if ( roundingMode == float_round_to_zero ) {
 141             increment = 0;
 142         }
 143         else {
 144             if ( zSign ) {
 145                 increment = ( roundingMode == float_round_down ) && absZ1;
 146             }
 147             else {
 148                 increment = ( roundingMode == float_round_up ) && absZ1;
 149             }
 150         }
 151     }
 152     if ( increment ) {
 153         ++absZ0;
 154         if ( absZ0 == 0 ) goto overflow;
 155         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 156     }
 157     z = absZ0;
 158     if ( zSign ) z = - z;
 159     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 160  overflow:
 161         float_raise( float_flag_invalid STATUS_VAR);
 162         return
 163               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 164             : LIT64( 0x7FFFFFFFFFFFFFFF );
 165     }
 166     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 167     return z;
 168
 169 }
 170
 171 /*----------------------------------------------------------------------------
 172 | Returns the fraction bits of the single-precision floating-point value `a'.
 173 *----------------------------------------------------------------------------*/
 174
 175 INLINE bits32 extractFloat32Frac( float32 a )
 176 {
 177
 178     return a & 0x007FFFFF;
 179
 180 }
 181
 182 /*----------------------------------------------------------------------------
 183 | Returns the exponent bits of the single-precision floating-point value `a'.
 184 *----------------------------------------------------------------------------*/
 185
 186 INLINE int16 extractFloat32Exp( float32 a )
 187 {
 188
 189     return ( a>>23 ) & 0xFF;
 190
 191 }
 192
 193 /*----------------------------------------------------------------------------
 194 | Returns the sign bit of the single-precision floating-point value `a'.
 195 *----------------------------------------------------------------------------*/
 196
 197 INLINE flag extractFloat32Sign( float32 a )
 198 {
 199
 200     return a>>31;
 201
 202 }
 203
 204 /*----------------------------------------------------------------------------
 205 | Normalizes the subnormal single-precision floating-point value represented
 206 | by the denormalized significand `aSig'.  The normalized exponent and
 207 | significand are stored at the locations pointed to by `zExpPtr' and
 208 | `zSigPtr', respectively.
 209 *----------------------------------------------------------------------------*/
 210
 211 static void
 212  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 213 {
 214     int8 shiftCount;
 215
 216     shiftCount = countLeadingZeros32( aSig ) - 8;
 217     *zSigPtr = aSig<<shiftCount;
 218     *zExpPtr = 1 - shiftCount;
 219
 220 }
 221
 222 /*----------------------------------------------------------------------------
 223 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 224 | single-precision floating-point value, returning the result.  After being
 225 | shifted into the proper positions, the three fields are simply added
 226 | together to form the result.  This means that any integer portion of `zSig'
 227 | will be added into the exponent.  Since a properly normalized significand
 228 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 229 | than the desired result exponent whenever `zSig' is a complete, normalized
 230 | significand.
 231 *----------------------------------------------------------------------------*/
 232
 233 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 234 {
 235
 236     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 237
 238 }
 239
 240 /*----------------------------------------------------------------------------
 241 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 242 | and significand `zSig', and returns the proper single-precision floating-
 243 | point value corresponding to the abstract input.  Ordinarily, the abstract
 244 | value is simply rounded and packed into the single-precision format, with
 245 | the inexact exception raised if the abstract input cannot be represented
 246 | exactly.  However, if the abstract value is too large, the overflow and
 247 | inexact exceptions are raised and an infinity or maximal finite value is
 248 | returned.  If the abstract value is too small, the input value is rounded to
 249 | a subnormal number, and the underflow and inexact exceptions are raised if
 250 | the abstract input cannot be represented exactly as a subnormal single-
 251 | precision floating-point number.
 252 |     The input significand `zSig' has its binary point between bits 30
 253 | and 29, which is 7 bits to the left of the usual location.  This shifted
 254 | significand must be normalized or smaller.  If `zSig' is not normalized,
 255 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 256 | and it must not require rounding.  In the usual case that `zSig' is
 257 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 258 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 259 | Binary Floating-Point Arithmetic.
 260 *----------------------------------------------------------------------------*/
 261
 262 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 263 {
 264     int8 roundingMode;
 265     flag roundNearestEven;
 266     int8 roundIncrement, roundBits;
 267     flag isTiny;
 268
 269     roundingMode = STATUS(float_rounding_mode);
 270     roundNearestEven = ( roundingMode == float_round_nearest_even );
 271     roundIncrement = 0x40;
 272     if ( ! roundNearestEven ) {
 273         if ( roundingMode == float_round_to_zero ) {
 274             roundIncrement = 0;
 275         }
 276         else {
 277             roundIncrement = 0x7F;
 278             if ( zSign ) {
 279                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 280             }
 281             else {
 282                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 283             }
 284         }
 285     }
 286     roundBits = zSig & 0x7F;
 287     if ( 0xFD <= (bits16) zExp ) {
 288         if (    ( 0xFD < zExp )
 289              || (    ( zExp == 0xFD )
 290                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 291            ) {
 292             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 293             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 294         }
 295         if ( zExp < 0 ) {
 296             isTiny =
 297                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 298                 || ( zExp < -1 )
 299                 || ( zSig + roundIncrement < 0x80000000 );
 300             shift32RightJamming( zSig, - zExp, &zSig );
 301             zExp = 0;
 302             roundBits = zSig & 0x7F;
 303             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 304         }
 305     }
 306     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 307     zSig = ( zSig + roundIncrement )>>7;
 308     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 309     if ( zSig == 0 ) zExp = 0;
 310     return packFloat32( zSign, zExp, zSig );
 311
 312 }
 313
 314 /*----------------------------------------------------------------------------
 315 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 316 | and significand `zSig', and returns the proper single-precision floating-
 317 | point value corresponding to the abstract input.  This routine is just like
 318 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 319 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 320 | floating-point exponent.
 321 *----------------------------------------------------------------------------*/
 322
 323 static float32
 324  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 325 {
 326     int8 shiftCount;
 327
 328     shiftCount = countLeadingZeros32( zSig ) - 1;
 329     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 330
 331 }
 332
 333 /*----------------------------------------------------------------------------
 334 | Returns the fraction bits of the double-precision floating-point value `a'.
 335 *----------------------------------------------------------------------------*/
 336
 337 INLINE bits64 extractFloat64Frac( float64 a )
 338 {
 339
 340     return a & LIT64( 0x000FFFFFFFFFFFFF );
 341
 342 }
 343
 344 /*----------------------------------------------------------------------------
 345 | Returns the exponent bits of the double-precision floating-point value `a'.
 346 *----------------------------------------------------------------------------*/
 347
 348 INLINE int16 extractFloat64Exp( float64 a )
 349 {
 350
 351     return ( a>>52 ) & 0x7FF;
 352
 353 }
 354
 355 /*----------------------------------------------------------------------------
 356 | Returns the sign bit of the double-precision floating-point value `a'.
 357 *----------------------------------------------------------------------------*/
 358
 359 INLINE flag extractFloat64Sign( float64 a )
 360 {
 361
 362     return a>>63;
 363
 364 }
 365
 366 /*----------------------------------------------------------------------------
 367 | Normalizes the subnormal double-precision floating-point value represented
 368 | by the denormalized significand `aSig'.  The normalized exponent and
 369 | significand are stored at the locations pointed to by `zExpPtr' and
 370 | `zSigPtr', respectively.
 371 *----------------------------------------------------------------------------*/
 372
 373 static void
 374  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 375 {
 376     int8 shiftCount;
 377
 378     shiftCount = countLeadingZeros64( aSig ) - 11;
 379     *zSigPtr = aSig<<shiftCount;
 380     *zExpPtr = 1 - shiftCount;
 381
 382 }
 383
 384 /*----------------------------------------------------------------------------
 385 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 386 | double-precision floating-point value, returning the result.  After being
 387 | shifted into the proper positions, the three fields are simply added
 388 | together to form the result.  This means that any integer portion of `zSig'
 389 | will be added into the exponent.  Since a properly normalized significand
 390 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 391 | than the desired result exponent whenever `zSig' is a complete, normalized
 392 | significand.
 393 *----------------------------------------------------------------------------*/
 394
 395 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 396 {
 397
 398     return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
 399
 400 }
 401
 402 /*----------------------------------------------------------------------------
 403 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 404 | and significand `zSig', and returns the proper double-precision floating-
 405 | point value corresponding to the abstract input.  Ordinarily, the abstract
 406 | value is simply rounded and packed into the double-precision format, with
 407 | the inexact exception raised if the abstract input cannot be represented
 408 | exactly.  However, if the abstract value is too large, the overflow and
 409 | inexact exceptions are raised and an infinity or maximal finite value is
 410 | returned.  If the abstract value is too small, the input value is rounded
 411 | to a subnormal number, and the underflow and inexact exceptions are raised
 412 | if the abstract input cannot be represented exactly as a subnormal double-
 413 | precision floating-point number.
 414 |     The input significand `zSig' has its binary point between bits 62
 415 | and 61, which is 10 bits to the left of the usual location.  This shifted
 416 | significand must be normalized or smaller.  If `zSig' is not normalized,
 417 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 418 | and it must not require rounding.  In the usual case that `zSig' is
 419 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 420 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 421 | Binary Floating-Point Arithmetic.
 422 *----------------------------------------------------------------------------*/
 423
 424 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 425 {
 426     int8 roundingMode;
 427     flag roundNearestEven;
 428     int16 roundIncrement, roundBits;
 429     flag isTiny;
 430
 431     roundingMode = STATUS(float_rounding_mode);
 432     roundNearestEven = ( roundingMode == float_round_nearest_even );
 433     roundIncrement = 0x200;
 434     if ( ! roundNearestEven ) {
 435         if ( roundingMode == float_round_to_zero ) {
 436             roundIncrement = 0;
 437         }
 438         else {
 439             roundIncrement = 0x3FF;
 440             if ( zSign ) {
 441                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 442             }
 443             else {
 444                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 445             }
 446         }
 447     }
 448     roundBits = zSig & 0x3FF;
 449     if ( 0x7FD <= (bits16) zExp ) {
 450         if (    ( 0x7FD < zExp )
 451              || (    ( zExp == 0x7FD )
 452                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 453            ) {
 454             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 455             return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
 456         }
 457         if ( zExp < 0 ) {
 458             isTiny =
 459                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 460                 || ( zExp < -1 )
 461                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 462             shift64RightJamming( zSig, - zExp, &zSig );
 463             zExp = 0;
 464             roundBits = zSig & 0x3FF;
 465             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 466         }
 467     }
 468     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 469     zSig = ( zSig + roundIncrement )>>10;
 470     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 471     if ( zSig == 0 ) zExp = 0;
 472     return packFloat64( zSign, zExp, zSig );
 473
 474 }
 475
 476 /*----------------------------------------------------------------------------
 477 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 478 | and significand `zSig', and returns the proper double-precision floating-
 479 | point value corresponding to the abstract input.  This routine is just like
 480 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 481 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 482 | floating-point exponent.
 483 *----------------------------------------------------------------------------*/
 484
 485 static float64
 486  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 487 {
 488     int8 shiftCount;
 489
 490     shiftCount = countLeadingZeros64( zSig ) - 1;
 491     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 492
 493 }
 494
 495 #ifdef FLOATX80
 496
 497 /*----------------------------------------------------------------------------
 498 | Returns the fraction bits of the extended double-precision floating-point
 499 | value `a'.
 500 *----------------------------------------------------------------------------*/
 501
 502 INLINE bits64 extractFloatx80Frac( floatx80 a )
 503 {
 504
 505     return a.low;
 506
 507 }
 508
 509 /*----------------------------------------------------------------------------
 510 | Returns the exponent bits of the extended double-precision floating-point
 511 | value `a'.
 512 *----------------------------------------------------------------------------*/
 513
 514 INLINE int32 extractFloatx80Exp( floatx80 a )
 515 {
 516
 517     return a.high & 0x7FFF;
 518
 519 }
 520
 521 /*----------------------------------------------------------------------------
 522 | Returns the sign bit of the extended double-precision floating-point value
 523 | `a'.
 524 *----------------------------------------------------------------------------*/
 525
 526 INLINE flag extractFloatx80Sign( floatx80 a )
 527 {
 528
 529     return a.high>>15;
 530
 531 }
 532
 533 /*----------------------------------------------------------------------------
 534 | Normalizes the subnormal extended double-precision floating-point value
 535 | represented by the denormalized significand `aSig'.  The normalized exponent
 536 | and significand are stored at the locations pointed to by `zExpPtr' and
 537 | `zSigPtr', respectively.
 538 *----------------------------------------------------------------------------*/
 539
 540 static void
 541  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 542 {
 543     int8 shiftCount;
 544
 545     shiftCount = countLeadingZeros64( aSig );
 546     *zSigPtr = aSig<<shiftCount;
 547     *zExpPtr = 1 - shiftCount;
 548
 549 }
 550
 551 /*----------------------------------------------------------------------------
 552 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 553 | extended double-precision floating-point value, returning the result.
 554 *----------------------------------------------------------------------------*/
 555
 556 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 557 {
 558     floatx80 z;
 559
 560     z.low = zSig;
 561     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 562     return z;
 563
 564 }
 565
 566 /*----------------------------------------------------------------------------
 567 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 568 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 569 | and returns the proper extended double-precision floating-point value
 570 | corresponding to the abstract input.  Ordinarily, the abstract value is
 571 | rounded and packed into the extended double-precision format, with the
 572 | inexact exception raised if the abstract input cannot be represented
 573 | exactly.  However, if the abstract value is too large, the overflow and
 574 | inexact exceptions are raised and an infinity or maximal finite value is
 575 | returned.  If the abstract value is too small, the input value is rounded to
 576 | a subnormal number, and the underflow and inexact exceptions are raised if
 577 | the abstract input cannot be represented exactly as a subnormal extended
 578 | double-precision floating-point number.
 579 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 580 | number of bits as single or double precision, respectively.  Otherwise, the
 581 | result is rounded to the full precision of the extended double-precision
 582 | format.
 583 |     The input significand must be normalized or smaller.  If the input
 584 | significand is not normalized, `zExp' must be 0; in that case, the result
 585 | returned is a subnormal number, and it must not require rounding.  The
 586 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 587 | Floating-Point Arithmetic.
 588 *----------------------------------------------------------------------------*/
 589
 590 static floatx80
 591  roundAndPackFloatx80(
 592      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 593  STATUS_PARAM)
 594 {
 595     int8 roundingMode;
 596     flag roundNearestEven, increment, isTiny;
 597     int64 roundIncrement, roundMask, roundBits;
 598
 599     roundingMode = STATUS(float_rounding_mode);
 600     roundNearestEven = ( roundingMode == float_round_nearest_even );
 601     if ( roundingPrecision == 80 ) goto precision80;
 602     if ( roundingPrecision == 64 ) {
 603         roundIncrement = LIT64( 0x0000000000000400 );
 604         roundMask = LIT64( 0x00000000000007FF );
 605     }
 606     else if ( roundingPrecision == 32 ) {
 607         roundIncrement = LIT64( 0x0000008000000000 );
 608         roundMask = LIT64( 0x000000FFFFFFFFFF );
 609     }
 610     else {
 611         goto precision80;
 612     }
 613     zSig0 |= ( zSig1 != 0 );
 614     if ( ! roundNearestEven ) {
 615         if ( roundingMode == float_round_to_zero ) {
 616             roundIncrement = 0;
 617         }
 618         else {
 619             roundIncrement = roundMask;
 620             if ( zSign ) {
 621                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 622             }
 623             else {
 624                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 625             }
 626         }
 627     }
 628     roundBits = zSig0 & roundMask;
 629     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 630         if (    ( 0x7FFE < zExp )
 631              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 632            ) {
 633             goto overflow;
 634         }
 635         if ( zExp <= 0 ) {
 636             isTiny =
 637                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 638                 || ( zExp < 0 )
 639                 || ( zSig0 <= zSig0 + roundIncrement );
 640             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 641             zExp = 0;
 642             roundBits = zSig0 & roundMask;
 643             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 644             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 645             zSig0 += roundIncrement;
 646             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 647             roundIncrement = roundMask + 1;
 648             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 649                 roundMask |= roundIncrement;
 650             }
 651             zSig0 &= ~ roundMask;
 652             return packFloatx80( zSign, zExp, zSig0 );
 653         }
 654     }
 655     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 656     zSig0 += roundIncrement;
 657     if ( zSig0 < roundIncrement ) {
 658         ++zExp;
 659         zSig0 = LIT64( 0x8000000000000000 );
 660     }
 661     roundIncrement = roundMask + 1;
 662     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 663         roundMask |= roundIncrement;
 664     }
 665     zSig0 &= ~ roundMask;
 666     if ( zSig0 == 0 ) zExp = 0;
 667     return packFloatx80( zSign, zExp, zSig0 );
 668  precision80:
 669     increment = ( (sbits64) zSig1 < 0 );
 670     if ( ! roundNearestEven ) {
 671         if ( roundingMode == float_round_to_zero ) {
 672             increment = 0;
 673         }
 674         else {
 675             if ( zSign ) {
 676                 increment = ( roundingMode == float_round_down ) && zSig1;
 677             }
 678             else {
 679                 increment = ( roundingMode == float_round_up ) && zSig1;
 680             }
 681         }
 682     }
 683     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 684         if (    ( 0x7FFE < zExp )
 685              || (    ( zExp == 0x7FFE )
 686                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 687                   && increment
 688                 )
 689            ) {
 690             roundMask = 0;
 691  overflow:
 692             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 693             if (    ( roundingMode == float_round_to_zero )
 694                  || ( zSign && ( roundingMode == float_round_up ) )
 695                  || ( ! zSign && ( roundingMode == float_round_down ) )
 696                ) {
 697                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 698             }
 699             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 700         }
 701         if ( zExp <= 0 ) {
 702             isTiny =
 703                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 704                 || ( zExp < 0 )
 705                 || ! increment
 706                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 707             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 708             zExp = 0;
 709             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 710             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 711             if ( roundNearestEven ) {
 712                 increment = ( (sbits64) zSig1 < 0 );
 713             }
 714             else {
 715                 if ( zSign ) {
 716                     increment = ( roundingMode == float_round_down ) && zSig1;
 717                 }
 718                 else {
 719                     increment = ( roundingMode == float_round_up ) && zSig1;
 720                 }
 721             }
 722             if ( increment ) {
 723                 ++zSig0;
 724                 zSig0 &=
 725                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 726                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 727             }
 728             return packFloatx80( zSign, zExp, zSig0 );
 729         }
 730     }
 731     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 732     if ( increment ) {
 733         ++zSig0;
 734         if ( zSig0 == 0 ) {
 735             ++zExp;
 736             zSig0 = LIT64( 0x8000000000000000 );
 737         }
 738         else {
 739             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 740         }
 741     }
 742     else {
 743         if ( zSig0 == 0 ) zExp = 0;
 744     }
 745     return packFloatx80( zSign, zExp, zSig0 );
 746
 747 }
 748
 749 /*----------------------------------------------------------------------------
 750 | Takes an abstract floating-point value having sign `zSign', exponent
 751 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 752 | and returns the proper extended double-precision floating-point value
 753 | corresponding to the abstract input.  This routine is just like
 754 | `roundAndPackFloatx80' except that the input significand does not have to be
 755 | normalized.
 756 *----------------------------------------------------------------------------*/
 757
 758 static floatx80
 759  normalizeRoundAndPackFloatx80(
 760      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 761  STATUS_PARAM)
 762 {
 763     int8 shiftCount;
 764
 765     if ( zSig0 == 0 ) {
 766         zSig0 = zSig1;
 767         zSig1 = 0;
 768         zExp -= 64;
 769     }
 770     shiftCount = countLeadingZeros64( zSig0 );
 771     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 772     zExp -= shiftCount;
 773     return
 774         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 775
 776 }
 777
 778 #endif
 779
 780 #ifdef FLOAT128
 781
 782 /*----------------------------------------------------------------------------
 783 | Returns the least-significant 64 fraction bits of the quadruple-precision
 784 | floating-point value `a'.
 785 *----------------------------------------------------------------------------*/
 786
 787 INLINE bits64 extractFloat128Frac1( float128 a )
 788 {
 789
 790     return a.low;
 791
 792 }
 793
 794 /*----------------------------------------------------------------------------
 795 | Returns the most-significant 48 fraction bits of the quadruple-precision
 796 | floating-point value `a'.
 797 *----------------------------------------------------------------------------*/
 798
 799 INLINE bits64 extractFloat128Frac0( float128 a )
 800 {
 801
 802     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 803
 804 }
 805
 806 /*----------------------------------------------------------------------------
 807 | Returns the exponent bits of the quadruple-precision floating-point value
 808 | `a'.
 809 *----------------------------------------------------------------------------*/
 810
 811 INLINE int32 extractFloat128Exp( float128 a )
 812 {
 813
 814     return ( a.high>>48 ) & 0x7FFF;
 815
 816 }
 817
 818 /*----------------------------------------------------------------------------
 819 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 820 *----------------------------------------------------------------------------*/
 821
 822 INLINE flag extractFloat128Sign( float128 a )
 823 {
 824
 825     return a.high>>63;
 826
 827 }
 828
 829 /*----------------------------------------------------------------------------
 830 | Normalizes the subnormal quadruple-precision floating-point value
 831 | represented by the denormalized significand formed by the concatenation of
 832 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 833 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 834 | significand are stored at the location pointed to by `zSig0Ptr', and the
 835 | least significant 64 bits of the normalized significand are stored at the
 836 | location pointed to by `zSig1Ptr'.
 837 *----------------------------------------------------------------------------*/
 838
 839 static void
 840  normalizeFloat128Subnormal(
 841      bits64 aSig0,
 842      bits64 aSig1,
 843      int32 *zExpPtr,
 844      bits64 *zSig0Ptr,
 845      bits64 *zSig1Ptr
 846  )
 847 {
 848     int8 shiftCount;
 849
 850     if ( aSig0 == 0 ) {
 851         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 852         if ( shiftCount < 0 ) {
 853             *zSig0Ptr = aSig1>>( - shiftCount );
 854             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 855         }
 856         else {
 857             *zSig0Ptr = aSig1<<shiftCount;
 858             *zSig1Ptr = 0;
 859         }
 860         *zExpPtr = - shiftCount - 63;
 861     }
 862     else {
 863         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 864         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 865         *zExpPtr = 1 - shiftCount;
 866     }
 867
 868 }
 869
 870 /*----------------------------------------------------------------------------
 871 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 872 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 873 | floating-point value, returning the result.  After being shifted into the
 874 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 875 | added together to form the most significant 32 bits of the result.  This
 876 | means that any integer portion of `zSig0' will be added into the exponent.
 877 | Since a properly normalized significand will have an integer portion equal
 878 | to 1, the `zExp' input should be 1 less than the desired result exponent
 879 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 880 | significand.
 881 *----------------------------------------------------------------------------*/
 882
 883 INLINE float128
 884  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 885 {
 886     float128 z;
 887
 888     z.low = zSig1;
 889     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 890     return z;
 891
 892 }
 893
 894 /*----------------------------------------------------------------------------
 895 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 896 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 897 | and `zSig2', and returns the proper quadruple-precision floating-point value
 898 | corresponding to the abstract input.  Ordinarily, the abstract value is
 899 | simply rounded and packed into the quadruple-precision format, with the
 900 | inexact exception raised if the abstract input cannot be represented
 901 | exactly.  However, if the abstract value is too large, the overflow and
 902 | inexact exceptions are raised and an infinity or maximal finite value is
 903 | returned.  If the abstract value is too small, the input value is rounded to
 904 | a subnormal number, and the underflow and inexact exceptions are raised if
 905 | the abstract input cannot be represented exactly as a subnormal quadruple-
 906 | precision floating-point number.
 907 |     The input significand must be normalized or smaller.  If the input
 908 | significand is not normalized, `zExp' must be 0; in that case, the result
 909 | returned is a subnormal number, and it must not require rounding.  In the
 910 | usual case that the input significand is normalized, `zExp' must be 1 less
 911 | than the ``true'' floating-point exponent.  The handling of underflow and
 912 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 913 *----------------------------------------------------------------------------*/
 914
 915 static float128
 916  roundAndPackFloat128(
 917      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 918 {
 919     int8 roundingMode;
 920     flag roundNearestEven, increment, isTiny;
 921
 922     roundingMode = STATUS(float_rounding_mode);
 923     roundNearestEven = ( roundingMode == float_round_nearest_even );
 924     increment = ( (sbits64) zSig2 < 0 );
 925     if ( ! roundNearestEven ) {
 926         if ( roundingMode == float_round_to_zero ) {
 927             increment = 0;
 928         }
 929         else {
 930             if ( zSign ) {
 931                 increment = ( roundingMode == float_round_down ) && zSig2;
 932             }
 933             else {
 934                 increment = ( roundingMode == float_round_up ) && zSig2;
 935             }
 936         }
 937     }
 938     if ( 0x7FFD <= (bits32) zExp ) {
 939         if (    ( 0x7FFD < zExp )
 940              || (    ( zExp == 0x7FFD )
 941                   && eq128(
 942                          LIT64( 0x0001FFFFFFFFFFFF ),
 943                          LIT64( 0xFFFFFFFFFFFFFFFF ),
 944                          zSig0,
 945                          zSig1
 946                      )
 947                   && increment
 948                 )
 949            ) {
 950             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 951             if (    ( roundingMode == float_round_to_zero )
 952                  || ( zSign && ( roundingMode == float_round_up ) )
 953                  || ( ! zSign && ( roundingMode == float_round_down ) )
 954                ) {
 955                 return
 956                     packFloat128(
 957                         zSign,
 958                         0x7FFE,
 959                         LIT64( 0x0000FFFFFFFFFFFF ),
 960                         LIT64( 0xFFFFFFFFFFFFFFFF )
 961                     );
 962             }
 963             return packFloat128( zSign, 0x7FFF, 0, 0 );
 964         }
 965         if ( zExp < 0 ) {
 966             isTiny =
 967                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 968                 || ( zExp < -1 )
 969                 || ! increment
 970                 || lt128(
 971                        zSig0,
 972                        zSig1,
 973                        LIT64( 0x0001FFFFFFFFFFFF ),
 974                        LIT64( 0xFFFFFFFFFFFFFFFF )
 975                    );
 976             shift128ExtraRightJamming(
 977                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 978             zExp = 0;
 979             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
 980             if ( roundNearestEven ) {
 981                 increment = ( (sbits64) zSig2 < 0 );
 982             }
 983             else {
 984                 if ( zSign ) {
 985                     increment = ( roundingMode == float_round_down ) && zSig2;
 986                 }
 987                 else {
 988                     increment = ( roundingMode == float_round_up ) && zSig2;
 989                 }
 990             }
 991         }
 992     }
 993     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
 994     if ( increment ) {
 995         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
 996         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
 997     }
 998     else {
 999         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1000     }
1001     return packFloat128( zSign, zExp, zSig0, zSig1 );
1002
1003 }
1004
1005 /*----------------------------------------------------------------------------
1006 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1007 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1008 | returns the proper quadruple-precision floating-point value corresponding
1009 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1010 | except that the input significand has fewer bits and does not have to be
1011 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1012 | point exponent.
1013 *----------------------------------------------------------------------------*/
1014
1015 static float128
1016  normalizeRoundAndPackFloat128(
1017      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1018 {
1019     int8 shiftCount;
1020     bits64 zSig2;
1021
1022     if ( zSig0 == 0 ) {
1023         zSig0 = zSig1;
1024         zSig1 = 0;
1025         zExp -= 64;
1026     }
1027     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1028     if ( 0 <= shiftCount ) {
1029         zSig2 = 0;
1030         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1031     }
1032     else {
1033         shift128ExtraRightJamming(
1034             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1035     }
1036     zExp -= shiftCount;
1037     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1038
1039 }
1040
1041 #endif
1042
1043 /*----------------------------------------------------------------------------
1044 | Returns the result of converting the 32-bit two's complement integer `a'
1045 | to the single-precision floating-point format.  The conversion is performed
1046 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1047 *----------------------------------------------------------------------------*/
1048
1049 float32 int32_to_float32( int32 a STATUS_PARAM )
1050 {
1051     flag zSign;
1052
1053     if ( a == 0 ) return 0;
1054     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1055     zSign = ( a < 0 );
1056     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1057
1058 }
1059
1060 /*----------------------------------------------------------------------------
1061 | Returns the result of converting the 32-bit two's complement integer `a'
1062 | to the double-precision floating-point format.  The conversion is performed
1063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1064 *----------------------------------------------------------------------------*/
1065
1066 float64 int32_to_float64( int32 a STATUS_PARAM )
1067 {
1068     flag zSign;
1069     uint32 absA;
1070     int8 shiftCount;
1071     bits64 zSig;
1072
1073     if ( a == 0 ) return 0;
1074     zSign = ( a < 0 );
1075     absA = zSign ? - a : a;
1076     shiftCount = countLeadingZeros32( absA ) + 21;
1077     zSig = absA;
1078     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1079
1080 }
1081
1082 #ifdef FLOATX80
1083
1084 /*----------------------------------------------------------------------------
1085 | Returns the result of converting the 32-bit two's complement integer `a'
1086 | to the extended double-precision floating-point format.  The conversion
1087 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1088 | Arithmetic.
1089 *----------------------------------------------------------------------------*/
1090
1091 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1092 {
1093     flag zSign;
1094     uint32 absA;
1095     int8 shiftCount;
1096     bits64 zSig;
1097
1098     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1099     zSign = ( a < 0 );
1100     absA = zSign ? - a : a;
1101     shiftCount = countLeadingZeros32( absA ) + 32;
1102     zSig = absA;
1103     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1104
1105 }
1106
1107 #endif
1108
1109 #ifdef FLOAT128
1110
1111 /*----------------------------------------------------------------------------
1112 | Returns the result of converting the 32-bit two's complement integer `a' to
1113 | the quadruple-precision floating-point format.  The conversion is performed
1114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 *----------------------------------------------------------------------------*/
1116
1117 float128 int32_to_float128( int32 a STATUS_PARAM )
1118 {
1119     flag zSign;
1120     uint32 absA;
1121     int8 shiftCount;
1122     bits64 zSig0;
1123
1124     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1125     zSign = ( a < 0 );
1126     absA = zSign ? - a : a;
1127     shiftCount = countLeadingZeros32( absA ) + 17;
1128     zSig0 = absA;
1129     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1130
1131 }
1132
1133 #endif
1134
1135 /*----------------------------------------------------------------------------
1136 | Returns the result of converting the 64-bit two's complement integer `a'
1137 | to the single-precision floating-point format.  The conversion is performed
1138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1140
1141 float32 int64_to_float32( int64 a STATUS_PARAM )
1142 {
1143     flag zSign;
1144     uint64 absA;
1145     int8 shiftCount;
1146
1147     if ( a == 0 ) return 0;
1148     zSign = ( a < 0 );
1149     absA = zSign ? - a : a;
1150     shiftCount = countLeadingZeros64( absA ) - 40;
1151     if ( 0 <= shiftCount ) {
1152         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1153     }
1154     else {
1155         shiftCount += 7;
1156         if ( shiftCount < 0 ) {
1157             shift64RightJamming( absA, - shiftCount, &absA );
1158         }
1159         else {
1160             absA <<= shiftCount;
1161         }
1162         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1163     }
1164
1165 }
1166
1167 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1168 {
1169     int8 shiftCount;
1170
1171     if ( a == 0 ) return 0;
1172     shiftCount = countLeadingZeros64( a ) - 40;
1173     if ( 0 <= shiftCount ) {
1174         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1175     }
1176     else {
1177         shiftCount += 7;
1178         if ( shiftCount < 0 ) {
1179             shift64RightJamming( a, - shiftCount, &a );
1180         }
1181         else {
1182             a <<= shiftCount;
1183         }
1184         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1185     }
1186 }
1187
1188 /*----------------------------------------------------------------------------
1189 | Returns the result of converting the 64-bit two's complement integer `a'
1190 | to the double-precision floating-point format.  The conversion is performed
1191 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1192 *----------------------------------------------------------------------------*/
1193
1194 float64 int64_to_float64( int64 a STATUS_PARAM )
1195 {
1196     flag zSign;
1197
1198     if ( a == 0 ) return 0;
1199     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1200         return packFloat64( 1, 0x43E, 0 );
1201     }
1202     zSign = ( a < 0 );
1203     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1204
1205 }
1206
1207 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1208 {
1209     if ( a == 0 ) return 0;
1210     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1211
1212 }
1213
1214 #ifdef FLOATX80
1215
1216 /*----------------------------------------------------------------------------
1217 | Returns the result of converting the 64-bit two's complement integer `a'
1218 | to the extended double-precision floating-point format.  The conversion
1219 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1220 | Arithmetic.
1221 *----------------------------------------------------------------------------*/
1222
1223 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1224 {
1225     flag zSign;
1226     uint64 absA;
1227     int8 shiftCount;
1228
1229     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1230     zSign = ( a < 0 );
1231     absA = zSign ? - a : a;
1232     shiftCount = countLeadingZeros64( absA );
1233     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1234
1235 }
1236
1237 #endif
1238
1239 #ifdef FLOAT128
1240
1241 /*----------------------------------------------------------------------------
1242 | Returns the result of converting the 64-bit two's complement integer `a' to
1243 | the quadruple-precision floating-point format.  The conversion is performed
1244 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1245 *----------------------------------------------------------------------------*/
1246
1247 float128 int64_to_float128( int64 a STATUS_PARAM )
1248 {
1249     flag zSign;
1250     uint64 absA;
1251     int8 shiftCount;
1252     int32 zExp;
1253     bits64 zSig0, zSig1;
1254
1255     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1256     zSign = ( a < 0 );
1257     absA = zSign ? - a : a;
1258     shiftCount = countLeadingZeros64( absA ) + 49;
1259     zExp = 0x406E - shiftCount;
1260     if ( 64 <= shiftCount ) {
1261         zSig1 = 0;
1262         zSig0 = absA;
1263         shiftCount -= 64;
1264     }
1265     else {
1266         zSig1 = absA;
1267         zSig0 = 0;
1268     }
1269     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1270     return packFloat128( zSign, zExp, zSig0, zSig1 );
1271
1272 }
1273
1274 #endif
1275
1276 /*----------------------------------------------------------------------------
1277 | Returns the result of converting the single-precision floating-point value
1278 | `a' to the 32-bit two's complement integer format.  The conversion is
1279 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1280 | Arithmetic---which means in particular that the conversion is rounded
1281 | according to the current rounding mode.  If `a' is a NaN, the largest
1282 | positive integer is returned.  Otherwise, if the conversion overflows, the
1283 | largest integer with the same sign as `a' is returned.
1284 *----------------------------------------------------------------------------*/
1285
1286 int32 float32_to_int32( float32 a STATUS_PARAM )
1287 {
1288     flag aSign;
1289     int16 aExp, shiftCount;
1290     bits32 aSig;
1291     bits64 aSig64;
1292
1293     aSig = extractFloat32Frac( a );
1294     aExp = extractFloat32Exp( a );
1295     aSign = extractFloat32Sign( a );
1296     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1297     if ( aExp ) aSig |= 0x00800000;
1298     shiftCount = 0xAF - aExp;
1299     aSig64 = aSig;
1300     aSig64 <<= 32;
1301     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1302     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1303
1304 }
1305
1306 /*----------------------------------------------------------------------------
1307 | Returns the result of converting the single-precision floating-point value
1308 | `a' to the 32-bit two's complement integer format.  The conversion is
1309 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1310 | Arithmetic, except that the conversion is always rounded toward zero.
1311 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1312 | the conversion overflows, the largest integer with the same sign as `a' is
1313 | returned.
1314 *----------------------------------------------------------------------------*/
1315
1316 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1317 {
1318     flag aSign;
1319     int16 aExp, shiftCount;
1320     bits32 aSig;
1321     int32 z;
1322
1323     aSig = extractFloat32Frac( a );
1324     aExp = extractFloat32Exp( a );
1325     aSign = extractFloat32Sign( a );
1326     shiftCount = aExp - 0x9E;
1327     if ( 0 <= shiftCount ) {
1328         if ( a != 0xCF000000 ) {
1329             float_raise( float_flag_invalid STATUS_VAR);
1330             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1331         }
1332         return (sbits32) 0x80000000;
1333     }
1334     else if ( aExp <= 0x7E ) {
1335         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1336         return 0;
1337     }
1338     aSig = ( aSig | 0x00800000 )<<8;
1339     z = aSig>>( - shiftCount );
1340     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1341         STATUS(float_exception_flags) |= float_flag_inexact;
1342     }
1343     if ( aSign ) z = - z;
1344     return z;
1345
1346 }
1347
1348 /*----------------------------------------------------------------------------
1349 | Returns the result of converting the single-precision floating-point value
1350 | `a' to the 64-bit two's complement integer format.  The conversion is
1351 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1352 | Arithmetic---which means in particular that the conversion is rounded
1353 | according to the current rounding mode.  If `a' is a NaN, the largest
1354 | positive integer is returned.  Otherwise, if the conversion overflows, the
1355 | largest integer with the same sign as `a' is returned.
1356 *----------------------------------------------------------------------------*/
1357
1358 int64 float32_to_int64( float32 a STATUS_PARAM )
1359 {
1360     flag aSign;
1361     int16 aExp, shiftCount;
1362     bits32 aSig;
1363     bits64 aSig64, aSigExtra;
1364
1365     aSig = extractFloat32Frac( a );
1366     aExp = extractFloat32Exp( a );
1367     aSign = extractFloat32Sign( a );
1368     shiftCount = 0xBE - aExp;
1369     if ( shiftCount < 0 ) {
1370         float_raise( float_flag_invalid STATUS_VAR);
1371         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1372             return LIT64( 0x7FFFFFFFFFFFFFFF );
1373         }
1374         return (sbits64) LIT64( 0x8000000000000000 );
1375     }
1376     if ( aExp ) aSig |= 0x00800000;
1377     aSig64 = aSig;
1378     aSig64 <<= 40;
1379     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1380     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1381
1382 }
1383
1384 /*----------------------------------------------------------------------------
1385 | Returns the result of converting the single-precision floating-point value
1386 | `a' to the 64-bit two's complement integer format.  The conversion is
1387 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1388 | Arithmetic, except that the conversion is always rounded toward zero.  If
1389 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1390 | conversion overflows, the largest integer with the same sign as `a' is
1391 | returned.
1392 *----------------------------------------------------------------------------*/
1393
1394 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1395 {
1396     flag aSign;
1397     int16 aExp, shiftCount;
1398     bits32 aSig;
1399     bits64 aSig64;
1400     int64 z;
1401
1402     aSig = extractFloat32Frac( a );
1403     aExp = extractFloat32Exp( a );
1404     aSign = extractFloat32Sign( a );
1405     shiftCount = aExp - 0xBE;
1406     if ( 0 <= shiftCount ) {
1407         if ( a != 0xDF000000 ) {
1408             float_raise( float_flag_invalid STATUS_VAR);
1409             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1410                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1411             }
1412         }
1413         return (sbits64) LIT64( 0x8000000000000000 );
1414     }
1415     else if ( aExp <= 0x7E ) {
1416         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1417         return 0;
1418     }
1419     aSig64 = aSig | 0x00800000;
1420     aSig64 <<= 40;
1421     z = aSig64>>( - shiftCount );
1422     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1423         STATUS(float_exception_flags) |= float_flag_inexact;
1424     }
1425     if ( aSign ) z = - z;
1426     return z;
1427
1428 }
1429
1430 /*----------------------------------------------------------------------------
1431 | Returns the result of converting the single-precision floating-point value
1432 | `a' to the double-precision floating-point format.  The conversion is
1433 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1434 | Arithmetic.
1435 *----------------------------------------------------------------------------*/
1436
1437 float64 float32_to_float64( float32 a STATUS_PARAM )
1438 {
1439     flag aSign;
1440     int16 aExp;
1441     bits32 aSig;
1442
1443     aSig = extractFloat32Frac( a );
1444     aExp = extractFloat32Exp( a );
1445     aSign = extractFloat32Sign( a );
1446     if ( aExp == 0xFF ) {
1447         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1448         return packFloat64( aSign, 0x7FF, 0 );
1449     }
1450     if ( aExp == 0 ) {
1451         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1452         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1453         --aExp;
1454     }
1455     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1456
1457 }
1458
1459 #ifdef FLOATX80
1460
1461 /*----------------------------------------------------------------------------
1462 | Returns the result of converting the single-precision floating-point value
1463 | `a' to the extended double-precision floating-point format.  The conversion
1464 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1465 | Arithmetic.
1466 *----------------------------------------------------------------------------*/
1467
1468 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1469 {
1470     flag aSign;
1471     int16 aExp;
1472     bits32 aSig;
1473
1474     aSig = extractFloat32Frac( a );
1475     aExp = extractFloat32Exp( a );
1476     aSign = extractFloat32Sign( a );
1477     if ( aExp == 0xFF ) {
1478         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1479         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1480     }
1481     if ( aExp == 0 ) {
1482         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1483         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1484     }
1485     aSig |= 0x00800000;
1486     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1487
1488 }
1489
1490 #endif
1491
1492 #ifdef FLOAT128
1493
1494 /*----------------------------------------------------------------------------
1495 | Returns the result of converting the single-precision floating-point value
1496 | `a' to the double-precision floating-point format.  The conversion is
1497 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1498 | Arithmetic.
1499 *----------------------------------------------------------------------------*/
1500
1501 float128 float32_to_float128( float32 a STATUS_PARAM )
1502 {
1503     flag aSign;
1504     int16 aExp;
1505     bits32 aSig;
1506
1507     aSig = extractFloat32Frac( a );
1508     aExp = extractFloat32Exp( a );
1509     aSign = extractFloat32Sign( a );
1510     if ( aExp == 0xFF ) {
1511         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1512         return packFloat128( aSign, 0x7FFF, 0, 0 );
1513     }
1514     if ( aExp == 0 ) {
1515         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1516         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1517         --aExp;
1518     }
1519     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1520
1521 }
1522
1523 #endif
1524
1525 /*----------------------------------------------------------------------------
1526 | Rounds the single-precision floating-point value `a' to an integer, and
1527 | returns the result as a single-precision floating-point value.  The
1528 | operation is performed according to the IEC/IEEE Standard for Binary
1529 | Floating-Point Arithmetic.
1530 *----------------------------------------------------------------------------*/
1531
1532 float32 float32_round_to_int( float32 a STATUS_PARAM)
1533 {
1534     flag aSign;
1535     int16 aExp;
1536     bits32 lastBitMask, roundBitsMask;
1537     int8 roundingMode;
1538     float32 z;
1539
1540     aExp = extractFloat32Exp( a );
1541     if ( 0x96 <= aExp ) {
1542         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1543             return propagateFloat32NaN( a, a STATUS_VAR );
1544         }
1545         return a;
1546     }
1547     if ( aExp <= 0x7E ) {
1548         if ( (bits32) ( a<<1 ) == 0 ) return a;
1549         STATUS(float_exception_flags) |= float_flag_inexact;
1550         aSign = extractFloat32Sign( a );
1551         switch ( STATUS(float_rounding_mode) ) {
1552          case float_round_nearest_even:
1553             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1554                 return packFloat32( aSign, 0x7F, 0 );
1555             }
1556             break;
1557          case float_round_down:
1558             return aSign ? 0xBF800000 : 0;
1559          case float_round_up:
1560             return aSign ? 0x80000000 : 0x3F800000;
1561         }
1562         return packFloat32( aSign, 0, 0 );
1563     }
1564     lastBitMask = 1;
1565     lastBitMask <<= 0x96 - aExp;
1566     roundBitsMask = lastBitMask - 1;
1567     z = a;
1568     roundingMode = STATUS(float_rounding_mode);
1569     if ( roundingMode == float_round_nearest_even ) {
1570         z += lastBitMask>>1;
1571         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1572     }
1573     else if ( roundingMode != float_round_to_zero ) {
1574         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1575             z += roundBitsMask;
1576         }
1577     }
1578     z &= ~ roundBitsMask;
1579     if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
1580     return z;
1581
1582 }
1583
1584 /*----------------------------------------------------------------------------
1585 | Returns the result of adding the absolute values of the single-precision
1586 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1587 | before being returned.  `zSign' is ignored if the result is a NaN.
1588 | The addition is performed according to the IEC/IEEE Standard for Binary
1589 | Floating-Point Arithmetic.
1590 *----------------------------------------------------------------------------*/
1591
1592 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1593 {
1594     int16 aExp, bExp, zExp;
1595     bits32 aSig, bSig, zSig;
1596     int16 expDiff;
1597
1598     aSig = extractFloat32Frac( a );
1599     aExp = extractFloat32Exp( a );
1600     bSig = extractFloat32Frac( b );
1601     bExp = extractFloat32Exp( b );
1602     expDiff = aExp - bExp;
1603     aSig <<= 6;
1604     bSig <<= 6;
1605     if ( 0 < expDiff ) {
1606         if ( aExp == 0xFF ) {
1607             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1608             return a;
1609         }
1610         if ( bExp == 0 ) {
1611             --expDiff;
1612         }
1613         else {
1614             bSig |= 0x20000000;
1615         }
1616         shift32RightJamming( bSig, expDiff, &bSig );
1617         zExp = aExp;
1618     }
1619     else if ( expDiff < 0 ) {
1620         if ( bExp == 0xFF ) {
1621             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1622             return packFloat32( zSign, 0xFF, 0 );
1623         }
1624         if ( aExp == 0 ) {
1625             ++expDiff;
1626         }
1627         else {
1628             aSig |= 0x20000000;
1629         }
1630         shift32RightJamming( aSig, - expDiff, &aSig );
1631         zExp = bExp;
1632     }
1633     else {
1634         if ( aExp == 0xFF ) {
1635             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1636             return a;
1637         }
1638         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1639         zSig = 0x40000000 + aSig + bSig;
1640         zExp = aExp;
1641         goto roundAndPack;
1642     }
1643     aSig |= 0x20000000;
1644     zSig = ( aSig + bSig )<<1;
1645     --zExp;
1646     if ( (sbits32) zSig < 0 ) {
1647         zSig = aSig + bSig;
1648         ++zExp;
1649     }
1650  roundAndPack:
1651     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1652
1653 }
1654
1655 /*----------------------------------------------------------------------------
1656 | Returns the result of subtracting the absolute values of the single-
1657 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1658 | difference is negated before being returned.  `zSign' is ignored if the
1659 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1660 | Standard for Binary Floating-Point Arithmetic.
1661 *----------------------------------------------------------------------------*/
1662
1663 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1664 {
1665     int16 aExp, bExp, zExp;
1666     bits32 aSig, bSig, zSig;
1667     int16 expDiff;
1668
1669     aSig = extractFloat32Frac( a );
1670     aExp = extractFloat32Exp( a );
1671     bSig = extractFloat32Frac( b );
1672     bExp = extractFloat32Exp( b );
1673     expDiff = aExp - bExp;
1674     aSig <<= 7;
1675     bSig <<= 7;
1676     if ( 0 < expDiff ) goto aExpBigger;
1677     if ( expDiff < 0 ) goto bExpBigger;
1678     if ( aExp == 0xFF ) {
1679         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1680         float_raise( float_flag_invalid STATUS_VAR);
1681         return float32_default_nan;
1682     }
1683     if ( aExp == 0 ) {
1684         aExp = 1;
1685         bExp = 1;
1686     }
1687     if ( bSig < aSig ) goto aBigger;
1688     if ( aSig < bSig ) goto bBigger;
1689     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1690  bExpBigger:
1691     if ( bExp == 0xFF ) {
1692         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1693         return packFloat32( zSign ^ 1, 0xFF, 0 );
1694     }
1695     if ( aExp == 0 ) {
1696         ++expDiff;
1697     }
1698     else {
1699         aSig |= 0x40000000;
1700     }
1701     shift32RightJamming( aSig, - expDiff, &aSig );
1702     bSig |= 0x40000000;
1703  bBigger:
1704     zSig = bSig - aSig;
1705     zExp = bExp;
1706     zSign ^= 1;
1707     goto normalizeRoundAndPack;
1708  aExpBigger:
1709     if ( aExp == 0xFF ) {
1710         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1711         return a;
1712     }
1713     if ( bExp == 0 ) {
1714         --expDiff;
1715     }
1716     else {
1717         bSig |= 0x40000000;
1718     }
1719     shift32RightJamming( bSig, expDiff, &bSig );
1720     aSig |= 0x40000000;
1721  aBigger:
1722     zSig = aSig - bSig;
1723     zExp = aExp;
1724  normalizeRoundAndPack:
1725     --zExp;
1726     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1727
1728 }
1729
1730 /*----------------------------------------------------------------------------
1731 | Returns the result of adding the single-precision floating-point values `a'
1732 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1733 | Binary Floating-Point Arithmetic.
1734 *----------------------------------------------------------------------------*/
1735
1736 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1737 {
1738     flag aSign, bSign;
1739
1740     aSign = extractFloat32Sign( a );
1741     bSign = extractFloat32Sign( b );
1742     if ( aSign == bSign ) {
1743         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1744     }
1745     else {
1746         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1747     }
1748
1749 }
1750
1751 /*----------------------------------------------------------------------------
1752 | Returns the result of subtracting the single-precision floating-point values
1753 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1754 | for Binary Floating-Point Arithmetic.
1755 *----------------------------------------------------------------------------*/
1756
1757 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1758 {
1759     flag aSign, bSign;
1760
1761     aSign = extractFloat32Sign( a );
1762     bSign = extractFloat32Sign( b );
1763     if ( aSign == bSign ) {
1764         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1765     }
1766     else {
1767         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1768     }
1769
1770 }
1771
1772 /*----------------------------------------------------------------------------
1773 | Returns the result of multiplying the single-precision floating-point values
1774 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1775 | for Binary Floating-Point Arithmetic.
1776 *----------------------------------------------------------------------------*/
1777
1778 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1779 {
1780     flag aSign, bSign, zSign;
1781     int16 aExp, bExp, zExp;
1782     bits32 aSig, bSig;
1783     bits64 zSig64;
1784     bits32 zSig;
1785
1786     aSig = extractFloat32Frac( a );
1787     aExp = extractFloat32Exp( a );
1788     aSign = extractFloat32Sign( a );
1789     bSig = extractFloat32Frac( b );
1790     bExp = extractFloat32Exp( b );
1791     bSign = extractFloat32Sign( b );
1792     zSign = aSign ^ bSign;
1793     if ( aExp == 0xFF ) {
1794         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1795             return propagateFloat32NaN( a, b STATUS_VAR );
1796         }
1797         if ( ( bExp | bSig ) == 0 ) {
1798             float_raise( float_flag_invalid STATUS_VAR);
1799             return float32_default_nan;
1800         }
1801         return packFloat32( zSign, 0xFF, 0 );
1802     }
1803     if ( bExp == 0xFF ) {
1804         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1805         if ( ( aExp | aSig ) == 0 ) {
1806             float_raise( float_flag_invalid STATUS_VAR);
1807             return float32_default_nan;
1808         }
1809         return packFloat32( zSign, 0xFF, 0 );
1810     }
1811     if ( aExp == 0 ) {
1812         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1813         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1814     }
1815     if ( bExp == 0 ) {
1816         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1817         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1818     }
1819     zExp = aExp + bExp - 0x7F;
1820     aSig = ( aSig | 0x00800000 )<<7;
1821     bSig = ( bSig | 0x00800000 )<<8;
1822     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1823     zSig = zSig64;
1824     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1825         zSig <<= 1;
1826         --zExp;
1827     }
1828     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1829
1830 }
1831
1832 /*----------------------------------------------------------------------------
1833 | Returns the result of dividing the single-precision floating-point value `a'
1834 | by the corresponding value `b'.  The operation is performed according to the
1835 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1836 *----------------------------------------------------------------------------*/
1837
1838 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1839 {
1840     flag aSign, bSign, zSign;
1841     int16 aExp, bExp, zExp;
1842     bits32 aSig, bSig, zSig;
1843
1844     aSig = extractFloat32Frac( a );
1845     aExp = extractFloat32Exp( a );
1846     aSign = extractFloat32Sign( a );
1847     bSig = extractFloat32Frac( b );
1848     bExp = extractFloat32Exp( b );
1849     bSign = extractFloat32Sign( b );
1850     zSign = aSign ^ bSign;
1851     if ( aExp == 0xFF ) {
1852         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1853         if ( bExp == 0xFF ) {
1854             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1855             float_raise( float_flag_invalid STATUS_VAR);
1856             return float32_default_nan;
1857         }
1858         return packFloat32( zSign, 0xFF, 0 );
1859     }
1860     if ( bExp == 0xFF ) {
1861         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1862         return packFloat32( zSign, 0, 0 );
1863     }
1864     if ( bExp == 0 ) {
1865         if ( bSig == 0 ) {
1866             if ( ( aExp | aSig ) == 0 ) {
1867                 float_raise( float_flag_invalid STATUS_VAR);
1868                 return float32_default_nan;
1869             }
1870             float_raise( float_flag_divbyzero STATUS_VAR);
1871             return packFloat32( zSign, 0xFF, 0 );
1872         }
1873         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1874     }
1875     if ( aExp == 0 ) {
1876         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1877         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1878     }
1879     zExp = aExp - bExp + 0x7D;
1880     aSig = ( aSig | 0x00800000 )<<7;
1881     bSig = ( bSig | 0x00800000 )<<8;
1882     if ( bSig <= ( aSig + aSig ) ) {
1883         aSig >>= 1;
1884         ++zExp;
1885     }
1886     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1887     if ( ( zSig & 0x3F ) == 0 ) {
1888         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1889     }
1890     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1891
1892 }
1893
1894 /*----------------------------------------------------------------------------
1895 | Returns the remainder of the single-precision floating-point value `a'
1896 | with respect to the corresponding value `b'.  The operation is performed
1897 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1898 *----------------------------------------------------------------------------*/
1899
1900 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1901 {
1902     flag aSign, bSign, zSign;
1903     int16 aExp, bExp, expDiff;
1904     bits32 aSig, bSig;
1905     bits32 q;
1906     bits64 aSig64, bSig64, q64;
1907     bits32 alternateASig;
1908     sbits32 sigMean;
1909
1910     aSig = extractFloat32Frac( a );
1911     aExp = extractFloat32Exp( a );
1912     aSign = extractFloat32Sign( a );
1913     bSig = extractFloat32Frac( b );
1914     bExp = extractFloat32Exp( b );
1915     bSign = extractFloat32Sign( b );
1916     if ( aExp == 0xFF ) {
1917         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1918             return propagateFloat32NaN( a, b STATUS_VAR );
1919         }
1920         float_raise( float_flag_invalid STATUS_VAR);
1921         return float32_default_nan;
1922     }
1923     if ( bExp == 0xFF ) {
1924         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1925         return a;
1926     }
1927     if ( bExp == 0 ) {
1928         if ( bSig == 0 ) {
1929             float_raise( float_flag_invalid STATUS_VAR);
1930             return float32_default_nan;
1931         }
1932         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1933     }
1934     if ( aExp == 0 ) {
1935         if ( aSig == 0 ) return a;
1936         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1937     }
1938     expDiff = aExp - bExp;
1939     aSig |= 0x00800000;
1940     bSig |= 0x00800000;
1941     if ( expDiff < 32 ) {
1942         aSig <<= 8;
1943         bSig <<= 8;
1944         if ( expDiff < 0 ) {
1945             if ( expDiff < -1 ) return a;
1946             aSig >>= 1;
1947         }
1948         q = ( bSig <= aSig );
1949         if ( q ) aSig -= bSig;
1950         if ( 0 < expDiff ) {
1951             q = ( ( (bits64) aSig )<<32 ) / bSig;
1952             q >>= 32 - expDiff;
1953             bSig >>= 2;
1954             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1955         }
1956         else {
1957             aSig >>= 2;
1958             bSig >>= 2;
1959         }
1960     }
1961     else {
1962         if ( bSig <= aSig ) aSig -= bSig;
1963         aSig64 = ( (bits64) aSig )<<40;
1964         bSig64 = ( (bits64) bSig )<<40;
1965         expDiff -= 64;
1966         while ( 0 < expDiff ) {
1967             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1968             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1969             aSig64 = - ( ( bSig * q64 )<<38 );
1970             expDiff -= 62;
1971         }
1972         expDiff += 64;
1973         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1974         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1975         q = q64>>( 64 - expDiff );
1976         bSig <<= 6;
1977         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1978     }
1979     do {
1980         alternateASig = aSig;
1981         ++q;
1982         aSig -= bSig;
1983     } while ( 0 <= (sbits32) aSig );
1984     sigMean = aSig + alternateASig;
1985     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1986         aSig = alternateASig;
1987     }
1988     zSign = ( (sbits32) aSig < 0 );
1989     if ( zSign ) aSig = - aSig;
1990     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1991
1992 }
1993
1994 /*----------------------------------------------------------------------------
1995 | Returns the square root of the single-precision floating-point value `a'.
1996 | The operation is performed according to the IEC/IEEE Standard for Binary
1997 | Floating-Point Arithmetic.
1998 *----------------------------------------------------------------------------*/
1999
2000 float32 float32_sqrt( float32 a STATUS_PARAM )
2001 {
2002     flag aSign;
2003     int16 aExp, zExp;
2004     bits32 aSig, zSig;
2005     bits64 rem, term;
2006
2007     aSig = extractFloat32Frac( a );
2008     aExp = extractFloat32Exp( a );
2009     aSign = extractFloat32Sign( a );
2010     if ( aExp == 0xFF ) {
2011         if ( aSig ) return propagateFloat32NaN( a, 0 STATUS_VAR );
2012         if ( ! aSign ) return a;
2013         float_raise( float_flag_invalid STATUS_VAR);
2014         return float32_default_nan;
2015     }
2016     if ( aSign ) {
2017         if ( ( aExp | aSig ) == 0 ) return a;
2018         float_raise( float_flag_invalid STATUS_VAR);
2019         return float32_default_nan;
2020     }
2021     if ( aExp == 0 ) {
2022         if ( aSig == 0 ) return 0;
2023         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2024     }
2025     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2026     aSig = ( aSig | 0x00800000 )<<8;
2027     zSig = estimateSqrt32( aExp, aSig ) + 2;
2028     if ( ( zSig & 0x7F ) <= 5 ) {
2029         if ( zSig < 2 ) {
2030             zSig = 0x7FFFFFFF;
2031             goto roundAndPack;
2032         }
2033         aSig >>= aExp & 1;
2034         term = ( (bits64) zSig ) * zSig;
2035         rem = ( ( (bits64) aSig )<<32 ) - term;
2036         while ( (sbits64) rem < 0 ) {
2037             --zSig;
2038             rem += ( ( (bits64) zSig )<<1 ) | 1;
2039         }
2040         zSig |= ( rem != 0 );
2041     }
2042     shift32RightJamming( zSig, 1, &zSig );
2043  roundAndPack:
2044     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2045
2046 }
2047
2048 /*----------------------------------------------------------------------------
2049 | Returns 1 if the single-precision floating-point value `a' is equal to
2050 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2051 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2052 *----------------------------------------------------------------------------*/
2053
2054 int float32_eq( float32 a, float32 b STATUS_PARAM )
2055 {
2056
2057     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2058          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2059        ) {
2060         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2061             float_raise( float_flag_invalid STATUS_VAR);
2062         }
2063         return 0;
2064     }
2065     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2066
2067 }
2068
2069 /*----------------------------------------------------------------------------
2070 | Returns 1 if the single-precision floating-point value `a' is less than
2071 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2072 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2073 | Arithmetic.
2074 *----------------------------------------------------------------------------*/
2075
2076 int float32_le( float32 a, float32 b STATUS_PARAM )
2077 {
2078     flag aSign, bSign;
2079
2080     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2081          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2082        ) {
2083         float_raise( float_flag_invalid STATUS_VAR);
2084         return 0;
2085     }
2086     aSign = extractFloat32Sign( a );
2087     bSign = extractFloat32Sign( b );
2088     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2089     return ( a == b ) || ( aSign ^ ( a < b ) );
2090
2091 }
2092
2093 /*----------------------------------------------------------------------------
2094 | Returns 1 if the single-precision floating-point value `a' is less than
2095 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2096 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2097 *----------------------------------------------------------------------------*/
2098
2099 int float32_lt( float32 a, float32 b STATUS_PARAM )
2100 {
2101     flag aSign, bSign;
2102
2103     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2104          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2105        ) {
2106         float_raise( float_flag_invalid STATUS_VAR);
2107         return 0;
2108     }
2109     aSign = extractFloat32Sign( a );
2110     bSign = extractFloat32Sign( b );
2111     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2112     return ( a != b ) && ( aSign ^ ( a < b ) );
2113
2114 }
2115
2116 /*----------------------------------------------------------------------------
2117 | Returns 1 if the single-precision floating-point value `a' is equal to
2118 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2119 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2120 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2121 *----------------------------------------------------------------------------*/
2122
2123 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2124 {
2125
2126     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2127          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2128        ) {
2129         float_raise( float_flag_invalid STATUS_VAR);
2130         return 0;
2131     }
2132     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2133
2134 }
2135
2136 /*----------------------------------------------------------------------------
2137 | Returns 1 if the single-precision floating-point value `a' is less than or
2138 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2139 | cause an exception.  Otherwise, the comparison is performed according to the
2140 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2141 *----------------------------------------------------------------------------*/
2142
2143 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2144 {
2145     flag aSign, bSign;
2146
2147     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2148          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2149        ) {
2150         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2151             float_raise( float_flag_invalid STATUS_VAR);
2152         }
2153         return 0;
2154     }
2155     aSign = extractFloat32Sign( a );
2156     bSign = extractFloat32Sign( b );
2157     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2158     return ( a == b ) || ( aSign ^ ( a < b ) );
2159
2160 }
2161
2162 /*----------------------------------------------------------------------------
2163 | Returns 1 if the single-precision floating-point value `a' is less than
2164 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2165 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2166 | Standard for Binary Floating-Point Arithmetic.
2167 *----------------------------------------------------------------------------*/
2168
2169 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2170 {
2171     flag aSign, bSign;
2172
2173     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2174          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2175        ) {
2176         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2177             float_raise( float_flag_invalid STATUS_VAR);
2178         }
2179         return 0;
2180     }
2181     aSign = extractFloat32Sign( a );
2182     bSign = extractFloat32Sign( b );
2183     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2184     return ( a != b ) && ( aSign ^ ( a < b ) );
2185
2186 }
2187
2188 /*----------------------------------------------------------------------------
2189 | Returns the result of converting the double-precision floating-point value
2190 | `a' to the 32-bit two's complement integer format.  The conversion is
2191 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2192 | Arithmetic---which means in particular that the conversion is rounded
2193 | according to the current rounding mode.  If `a' is a NaN, the largest
2194 | positive integer is returned.  Otherwise, if the conversion overflows, the
2195 | largest integer with the same sign as `a' is returned.
2196 *----------------------------------------------------------------------------*/
2197
2198 int32 float64_to_int32( float64 a STATUS_PARAM )
2199 {
2200     flag aSign;
2201     int16 aExp, shiftCount;
2202     bits64 aSig;
2203
2204     aSig = extractFloat64Frac( a );
2205     aExp = extractFloat64Exp( a );
2206     aSign = extractFloat64Sign( a );
2207     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2208     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2209     shiftCount = 0x42C - aExp;
2210     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2211     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2212
2213 }
2214
2215 /*----------------------------------------------------------------------------
2216 | Returns the result of converting the double-precision floating-point value
2217 | `a' to the 32-bit two's complement integer format.  The conversion is
2218 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2219 | Arithmetic, except that the conversion is always rounded toward zero.
2220 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2221 | the conversion overflows, the largest integer with the same sign as `a' is
2222 | returned.
2223 *----------------------------------------------------------------------------*/
2224
2225 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2226 {
2227     flag aSign;
2228     int16 aExp, shiftCount;
2229     bits64 aSig, savedASig;
2230     int32 z;
2231
2232     aSig = extractFloat64Frac( a );
2233     aExp = extractFloat64Exp( a );
2234     aSign = extractFloat64Sign( a );
2235     if ( 0x41E < aExp ) {
2236         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2237         goto invalid;
2238     }
2239     else if ( aExp < 0x3FF ) {
2240         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2241         return 0;
2242     }
2243     aSig |= LIT64( 0x0010000000000000 );
2244     shiftCount = 0x433 - aExp;
2245     savedASig = aSig;
2246     aSig >>= shiftCount;
2247     z = aSig;
2248     if ( aSign ) z = - z;
2249     if ( ( z < 0 ) ^ aSign ) {
2250  invalid:
2251         float_raise( float_flag_invalid STATUS_VAR);
2252         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2253     }
2254     if ( ( aSig<<shiftCount ) != savedASig ) {
2255         STATUS(float_exception_flags) |= float_flag_inexact;
2256     }
2257     return z;
2258
2259 }
2260
2261 /*----------------------------------------------------------------------------
2262 | Returns the result of converting the double-precision floating-point value
2263 | `a' to the 64-bit two's complement integer format.  The conversion is
2264 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2265 | Arithmetic---which means in particular that the conversion is rounded
2266 | according to the current rounding mode.  If `a' is a NaN, the largest
2267 | positive integer is returned.  Otherwise, if the conversion overflows, the
2268 | largest integer with the same sign as `a' is returned.
2269 *----------------------------------------------------------------------------*/
2270
2271 int64 float64_to_int64( float64 a STATUS_PARAM )
2272 {
2273     flag aSign;
2274     int16 aExp, shiftCount;
2275     bits64 aSig, aSigExtra;
2276
2277     aSig = extractFloat64Frac( a );
2278     aExp = extractFloat64Exp( a );
2279     aSign = extractFloat64Sign( a );
2280     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2281     shiftCount = 0x433 - aExp;
2282     if ( shiftCount <= 0 ) {
2283         if ( 0x43E < aExp ) {
2284             float_raise( float_flag_invalid STATUS_VAR);
2285             if (    ! aSign
2286                  || (    ( aExp == 0x7FF )
2287                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2288                ) {
2289                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2290             }
2291             return (sbits64) LIT64( 0x8000000000000000 );
2292         }
2293         aSigExtra = 0;
2294         aSig <<= - shiftCount;
2295     }
2296     else {
2297         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2298     }
2299     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2300
2301 }
2302
2303 /*----------------------------------------------------------------------------
2304 | Returns the result of converting the double-precision floating-point value
2305 | `a' to the 64-bit two's complement integer format.  The conversion is
2306 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2307 | Arithmetic, except that the conversion is always rounded toward zero.
2308 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2309 | the conversion overflows, the largest integer with the same sign as `a' is
2310 | returned.
2311 *----------------------------------------------------------------------------*/
2312
2313 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2314 {
2315     flag aSign;
2316     int16 aExp, shiftCount;
2317     bits64 aSig;
2318     int64 z;
2319
2320     aSig = extractFloat64Frac( a );
2321     aExp = extractFloat64Exp( a );
2322     aSign = extractFloat64Sign( a );
2323     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2324     shiftCount = aExp - 0x433;
2325     if ( 0 <= shiftCount ) {
2326         if ( 0x43E <= aExp ) {
2327             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2328                 float_raise( float_flag_invalid STATUS_VAR);
2329                 if (    ! aSign
2330                      || (    ( aExp == 0x7FF )
2331                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2332                    ) {
2333                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2334                 }
2335             }
2336             return (sbits64) LIT64( 0x8000000000000000 );
2337         }
2338         z = aSig<<shiftCount;
2339     }
2340     else {
2341         if ( aExp < 0x3FE ) {
2342             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2343             return 0;
2344         }
2345         z = aSig>>( - shiftCount );
2346         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2347             STATUS(float_exception_flags) |= float_flag_inexact;
2348         }
2349     }
2350     if ( aSign ) z = - z;
2351     return z;
2352
2353 }
2354
2355 /*----------------------------------------------------------------------------
2356 | Returns the result of converting the double-precision floating-point value
2357 | `a' to the single-precision floating-point format.  The conversion is
2358 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2359 | Arithmetic.
2360 *----------------------------------------------------------------------------*/
2361
2362 float32 float64_to_float32( float64 a STATUS_PARAM )
2363 {
2364     flag aSign;
2365     int16 aExp;
2366     bits64 aSig;
2367     bits32 zSig;
2368
2369     aSig = extractFloat64Frac( a );
2370     aExp = extractFloat64Exp( a );
2371     aSign = extractFloat64Sign( a );
2372     if ( aExp == 0x7FF ) {
2373         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2374         return packFloat32( aSign, 0xFF, 0 );
2375     }
2376     shift64RightJamming( aSig, 22, &aSig );
2377     zSig = aSig;
2378     if ( aExp || zSig ) {
2379         zSig |= 0x40000000;
2380         aExp -= 0x381;
2381     }
2382     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2383
2384 }
2385
2386 #ifdef FLOATX80
2387
2388 /*----------------------------------------------------------------------------
2389 | Returns the result of converting the double-precision floating-point value
2390 | `a' to the extended double-precision floating-point format.  The conversion
2391 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2392 | Arithmetic.
2393 *----------------------------------------------------------------------------*/
2394
2395 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2396 {
2397     flag aSign;
2398     int16 aExp;
2399     bits64 aSig;
2400
2401     aSig = extractFloat64Frac( a );
2402     aExp = extractFloat64Exp( a );
2403     aSign = extractFloat64Sign( a );
2404     if ( aExp == 0x7FF ) {
2405         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2406         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2407     }
2408     if ( aExp == 0 ) {
2409         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2410         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2411     }
2412     return
2413         packFloatx80(
2414             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2415
2416 }
2417
2418 #endif
2419
2420 #ifdef FLOAT128
2421
2422 /*----------------------------------------------------------------------------
2423 | Returns the result of converting the double-precision floating-point value
2424 | `a' to the quadruple-precision floating-point format.  The conversion is
2425 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2426 | Arithmetic.
2427 *----------------------------------------------------------------------------*/
2428
2429 float128 float64_to_float128( float64 a STATUS_PARAM )
2430 {
2431     flag aSign;
2432     int16 aExp;
2433     bits64 aSig, zSig0, zSig1;
2434
2435     aSig = extractFloat64Frac( a );
2436     aExp = extractFloat64Exp( a );
2437     aSign = extractFloat64Sign( a );
2438     if ( aExp == 0x7FF ) {
2439         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2440         return packFloat128( aSign, 0x7FFF, 0, 0 );
2441     }
2442     if ( aExp == 0 ) {
2443         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2444         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2445         --aExp;
2446     }
2447     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2448     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2449
2450 }
2451
2452 #endif
2453
2454 /*----------------------------------------------------------------------------
2455 | Rounds the double-precision floating-point value `a' to an integer, and
2456 | returns the result as a double-precision floating-point value.  The
2457 | operation is performed according to the IEC/IEEE Standard for Binary
2458 | Floating-Point Arithmetic.
2459 *----------------------------------------------------------------------------*/
2460
2461 float64 float64_round_to_int( float64 a STATUS_PARAM )
2462 {
2463     flag aSign;
2464     int16 aExp;
2465     bits64 lastBitMask, roundBitsMask;
2466     int8 roundingMode;
2467     float64 z;
2468
2469     aExp = extractFloat64Exp( a );
2470     if ( 0x433 <= aExp ) {
2471         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2472             return propagateFloat64NaN( a, a STATUS_VAR );
2473         }
2474         return a;
2475     }
2476     if ( aExp < 0x3FF ) {
2477         if ( (bits64) ( a<<1 ) == 0 ) return a;
2478         STATUS(float_exception_flags) |= float_flag_inexact;
2479         aSign = extractFloat64Sign( a );
2480         switch ( STATUS(float_rounding_mode) ) {
2481          case float_round_nearest_even:
2482             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2483                 return packFloat64( aSign, 0x3FF, 0 );
2484             }
2485             break;
2486          case float_round_down:
2487             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2488          case float_round_up:
2489             return
2490             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2491         }
2492         return packFloat64( aSign, 0, 0 );
2493     }
2494     lastBitMask = 1;
2495     lastBitMask <<= 0x433 - aExp;
2496     roundBitsMask = lastBitMask - 1;
2497     z = a;
2498     roundingMode = STATUS(float_rounding_mode);
2499     if ( roundingMode == float_round_nearest_even ) {
2500         z += lastBitMask>>1;
2501         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2502     }
2503     else if ( roundingMode != float_round_to_zero ) {
2504         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2505             z += roundBitsMask;
2506         }
2507     }
2508     z &= ~ roundBitsMask;
2509     if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
2510     return z;
2511
2512 }
2513
2514 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2515 {
2516     int oldmode;
2517     float64 res;
2518     oldmode = STATUS(float_rounding_mode);
2519     STATUS(float_rounding_mode) = float_round_to_zero;
2520     res = float64_round_to_int(a STATUS_VAR);
2521     STATUS(float_rounding_mode) = oldmode;
2522     return res;
2523 }
2524
2525 /*----------------------------------------------------------------------------
2526 | Returns the result of adding the absolute values of the double-precision
2527 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2528 | before being returned.  `zSign' is ignored if the result is a NaN.
2529 | The addition is performed according to the IEC/IEEE Standard for Binary
2530 | Floating-Point Arithmetic.
2531 *----------------------------------------------------------------------------*/
2532
2533 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2534 {
2535     int16 aExp, bExp, zExp;
2536     bits64 aSig, bSig, zSig;
2537     int16 expDiff;
2538
2539     aSig = extractFloat64Frac( a );
2540     aExp = extractFloat64Exp( a );
2541     bSig = extractFloat64Frac( b );
2542     bExp = extractFloat64Exp( b );
2543     expDiff = aExp - bExp;
2544     aSig <<= 9;
2545     bSig <<= 9;
2546     if ( 0 < expDiff ) {
2547         if ( aExp == 0x7FF ) {
2548             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2549             return a;
2550         }
2551         if ( bExp == 0 ) {
2552             --expDiff;
2553         }
2554         else {
2555             bSig |= LIT64( 0x2000000000000000 );
2556         }
2557         shift64RightJamming( bSig, expDiff, &bSig );
2558         zExp = aExp;
2559     }
2560     else if ( expDiff < 0 ) {
2561         if ( bExp == 0x7FF ) {
2562             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2563             return packFloat64( zSign, 0x7FF, 0 );
2564         }
2565         if ( aExp == 0 ) {
2566             ++expDiff;
2567         }
2568         else {
2569             aSig |= LIT64( 0x2000000000000000 );
2570         }
2571         shift64RightJamming( aSig, - expDiff, &aSig );
2572         zExp = bExp;
2573     }
2574     else {
2575         if ( aExp == 0x7FF ) {
2576             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2577             return a;
2578         }
2579         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2580         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2581         zExp = aExp;
2582         goto roundAndPack;
2583     }
2584     aSig |= LIT64( 0x2000000000000000 );
2585     zSig = ( aSig + bSig )<<1;
2586     --zExp;
2587     if ( (sbits64) zSig < 0 ) {
2588         zSig = aSig + bSig;
2589         ++zExp;
2590     }
2591  roundAndPack:
2592     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2593
2594 }
2595
2596 /*----------------------------------------------------------------------------
2597 | Returns the result of subtracting the absolute values of the double-
2598 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2599 | difference is negated before being returned.  `zSign' is ignored if the
2600 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2601 | Standard for Binary Floating-Point Arithmetic.
2602 *----------------------------------------------------------------------------*/
2603
2604 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2605 {
2606     int16 aExp, bExp, zExp;
2607     bits64 aSig, bSig, zSig;
2608     int16 expDiff;
2609
2610     aSig = extractFloat64Frac( a );
2611     aExp = extractFloat64Exp( a );
2612     bSig = extractFloat64Frac( b );
2613     bExp = extractFloat64Exp( b );
2614     expDiff = aExp - bExp;
2615     aSig <<= 10;
2616     bSig <<= 10;
2617     if ( 0 < expDiff ) goto aExpBigger;
2618     if ( expDiff < 0 ) goto bExpBigger;
2619     if ( aExp == 0x7FF ) {
2620         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2621         float_raise( float_flag_invalid STATUS_VAR);
2622         return float64_default_nan;
2623     }
2624     if ( aExp == 0 ) {
2625         aExp = 1;
2626         bExp = 1;
2627     }
2628     if ( bSig < aSig ) goto aBigger;
2629     if ( aSig < bSig ) goto bBigger;
2630     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2631  bExpBigger:
2632     if ( bExp == 0x7FF ) {
2633         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2634         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2635     }
2636     if ( aExp == 0 ) {
2637         ++expDiff;
2638     }
2639     else {
2640         aSig |= LIT64( 0x4000000000000000 );
2641     }
2642     shift64RightJamming( aSig, - expDiff, &aSig );
2643     bSig |= LIT64( 0x4000000000000000 );
2644  bBigger:
2645     zSig = bSig - aSig;
2646     zExp = bExp;
2647     zSign ^= 1;
2648     goto normalizeRoundAndPack;
2649  aExpBigger:
2650     if ( aExp == 0x7FF ) {
2651         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2652         return a;
2653     }
2654     if ( bExp == 0 ) {
2655         --expDiff;
2656     }
2657     else {
2658         bSig |= LIT64( 0x4000000000000000 );
2659     }
2660     shift64RightJamming( bSig, expDiff, &bSig );
2661     aSig |= LIT64( 0x4000000000000000 );
2662  aBigger:
2663     zSig = aSig - bSig;
2664     zExp = aExp;
2665  normalizeRoundAndPack:
2666     --zExp;
2667     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2668
2669 }
2670
2671 /*----------------------------------------------------------------------------
2672 | Returns the result of adding the double-precision floating-point values `a'
2673 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2674 | Binary Floating-Point Arithmetic.
2675 *----------------------------------------------------------------------------*/
2676
2677 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2678 {
2679     flag aSign, bSign;
2680
2681     aSign = extractFloat64Sign( a );
2682     bSign = extractFloat64Sign( b );
2683     if ( aSign == bSign ) {
2684         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2685     }
2686     else {
2687         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2688     }
2689
2690 }
2691
2692 /*----------------------------------------------------------------------------
2693 | Returns the result of subtracting the double-precision floating-point values
2694 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2695 | for Binary Floating-Point Arithmetic.
2696 *----------------------------------------------------------------------------*/
2697
2698 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2699 {
2700     flag aSign, bSign;
2701
2702     aSign = extractFloat64Sign( a );
2703     bSign = extractFloat64Sign( b );
2704     if ( aSign == bSign ) {
2705         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2706     }
2707     else {
2708         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2709     }
2710
2711 }
2712
2713 /*----------------------------------------------------------------------------
2714 | Returns the result of multiplying the double-precision floating-point values
2715 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2716 | for Binary Floating-Point Arithmetic.
2717 *----------------------------------------------------------------------------*/
2718
2719 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2720 {
2721     flag aSign, bSign, zSign;
2722     int16 aExp, bExp, zExp;
2723     bits64 aSig, bSig, zSig0, zSig1;
2724
2725     aSig = extractFloat64Frac( a );
2726     aExp = extractFloat64Exp( a );
2727     aSign = extractFloat64Sign( a );
2728     bSig = extractFloat64Frac( b );
2729     bExp = extractFloat64Exp( b );
2730     bSign = extractFloat64Sign( b );
2731     zSign = aSign ^ bSign;
2732     if ( aExp == 0x7FF ) {
2733         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2734             return propagateFloat64NaN( a, b STATUS_VAR );
2735         }
2736         if ( ( bExp | bSig ) == 0 ) {
2737             float_raise( float_flag_invalid STATUS_VAR);
2738             return float64_default_nan;
2739         }
2740         return packFloat64( zSign, 0x7FF, 0 );
2741     }
2742     if ( bExp == 0x7FF ) {
2743         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2744         if ( ( aExp | aSig ) == 0 ) {
2745             float_raise( float_flag_invalid STATUS_VAR);
2746             return float64_default_nan;
2747         }
2748         return packFloat64( zSign, 0x7FF, 0 );
2749     }
2750     if ( aExp == 0 ) {
2751         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2752         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2753     }
2754     if ( bExp == 0 ) {
2755         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2756         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2757     }
2758     zExp = aExp + bExp - 0x3FF;
2759     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2760     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2761     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2762     zSig0 |= ( zSig1 != 0 );
2763     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2764         zSig0 <<= 1;
2765         --zExp;
2766     }
2767     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2768
2769 }
2770
2771 /*----------------------------------------------------------------------------
2772 | Returns the result of dividing the double-precision floating-point value `a'
2773 | by the corresponding value `b'.  The operation is performed according to
2774 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2775 *----------------------------------------------------------------------------*/
2776
2777 float64 float64_div( float64 a, float64 b STATUS_PARAM )
2778 {
2779     flag aSign, bSign, zSign;
2780     int16 aExp, bExp, zExp;
2781     bits64 aSig, bSig, zSig;
2782     bits64 rem0, rem1;
2783     bits64 term0, term1;
2784
2785     aSig = extractFloat64Frac( a );
2786     aExp = extractFloat64Exp( a );
2787     aSign = extractFloat64Sign( a );
2788     bSig = extractFloat64Frac( b );
2789     bExp = extractFloat64Exp( b );
2790     bSign = extractFloat64Sign( b );
2791     zSign = aSign ^ bSign;
2792     if ( aExp == 0x7FF ) {
2793         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2794         if ( bExp == 0x7FF ) {
2795             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2796             float_raise( float_flag_invalid STATUS_VAR);
2797             return float64_default_nan;
2798         }
2799         return packFloat64( zSign, 0x7FF, 0 );
2800     }
2801     if ( bExp == 0x7FF ) {
2802         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2803         return packFloat64( zSign, 0, 0 );
2804     }
2805     if ( bExp == 0 ) {
2806         if ( bSig == 0 ) {
2807             if ( ( aExp | aSig ) == 0 ) {
2808                 float_raise( float_flag_invalid STATUS_VAR);
2809                 return float64_default_nan;
2810             }
2811             float_raise( float_flag_divbyzero STATUS_VAR);
2812             return packFloat64( zSign, 0x7FF, 0 );
2813         }
2814         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2815     }
2816     if ( aExp == 0 ) {
2817         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2818         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2819     }
2820     zExp = aExp - bExp + 0x3FD;
2821     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2822     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2823     if ( bSig <= ( aSig + aSig ) ) {
2824         aSig >>= 1;
2825         ++zExp;
2826     }
2827     zSig = estimateDiv128To64( aSig, 0, bSig );
2828     if ( ( zSig & 0x1FF ) <= 2 ) {
2829         mul64To128( bSig, zSig, &term0, &term1 );
2830         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2831         while ( (sbits64) rem0 < 0 ) {
2832             --zSig;
2833             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2834         }
2835         zSig |= ( rem1 != 0 );
2836     }
2837     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2838
2839 }
2840
2841 /*----------------------------------------------------------------------------
2842 | Returns the remainder of the double-precision floating-point value `a'
2843 | with respect to the corresponding value `b'.  The operation is performed
2844 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2845 *----------------------------------------------------------------------------*/
2846
2847 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2848 {
2849     flag aSign, bSign, zSign;
2850     int16 aExp, bExp, expDiff;
2851     bits64 aSig, bSig;
2852     bits64 q, alternateASig;
2853     sbits64 sigMean;
2854
2855     aSig = extractFloat64Frac( a );
2856     aExp = extractFloat64Exp( a );
2857     aSign = extractFloat64Sign( a );
2858     bSig = extractFloat64Frac( b );
2859     bExp = extractFloat64Exp( b );
2860     bSign = extractFloat64Sign( b );
2861     if ( aExp == 0x7FF ) {
2862         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2863             return propagateFloat64NaN( a, b STATUS_VAR );
2864         }
2865         float_raise( float_flag_invalid STATUS_VAR);
2866         return float64_default_nan;
2867     }
2868     if ( bExp == 0x7FF ) {
2869         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2870         return a;
2871     }
2872     if ( bExp == 0 ) {
2873         if ( bSig == 0 ) {
2874             float_raise( float_flag_invalid STATUS_VAR);
2875             return float64_default_nan;
2876         }
2877         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2878     }
2879     if ( aExp == 0 ) {
2880         if ( aSig == 0 ) return a;
2881         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2882     }
2883     expDiff = aExp - bExp;
2884     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2885     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2886     if ( expDiff < 0 ) {
2887         if ( expDiff < -1 ) return a;
2888         aSig >>= 1;
2889     }
2890     q = ( bSig <= aSig );
2891     if ( q ) aSig -= bSig;
2892     expDiff -= 64;
2893     while ( 0 < expDiff ) {
2894         q = estimateDiv128To64( aSig, 0, bSig );
2895         q = ( 2 < q ) ? q - 2 : 0;
2896         aSig = - ( ( bSig>>2 ) * q );
2897         expDiff -= 62;
2898     }
2899     expDiff += 64;
2900     if ( 0 < expDiff ) {
2901         q = estimateDiv128To64( aSig, 0, bSig );
2902         q = ( 2 < q ) ? q - 2 : 0;
2903         q >>= 64 - expDiff;
2904         bSig >>= 2;
2905         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2906     }
2907     else {
2908         aSig >>= 2;
2909         bSig >>= 2;
2910     }
2911     do {
2912         alternateASig = aSig;
2913         ++q;
2914         aSig -= bSig;
2915     } while ( 0 <= (sbits64) aSig );
2916     sigMean = aSig + alternateASig;
2917     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2918         aSig = alternateASig;
2919     }
2920     zSign = ( (sbits64) aSig < 0 );
2921     if ( zSign ) aSig = - aSig;
2922     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2923
2924 }
2925
2926 /*----------------------------------------------------------------------------
2927 | Returns the square root of the double-precision floating-point value `a'.
2928 | The operation is performed according to the IEC/IEEE Standard for Binary
2929 | Floating-Point Arithmetic.
2930 *----------------------------------------------------------------------------*/
2931
2932 float64 float64_sqrt( float64 a STATUS_PARAM )
2933 {
2934     flag aSign;
2935     int16 aExp, zExp;
2936     bits64 aSig, zSig, doubleZSig;
2937     bits64 rem0, rem1, term0, term1;
2938
2939     aSig = extractFloat64Frac( a );
2940     aExp = extractFloat64Exp( a );
2941     aSign = extractFloat64Sign( a );
2942     if ( aExp == 0x7FF ) {
2943         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2944         if ( ! aSign ) return a;
2945         float_raise( float_flag_invalid STATUS_VAR);
2946         return float64_default_nan;
2947     }
2948     if ( aSign ) {
2949         if ( ( aExp | aSig ) == 0 ) return a;
2950         float_raise( float_flag_invalid STATUS_VAR);
2951         return float64_default_nan;
2952     }
2953     if ( aExp == 0 ) {
2954         if ( aSig == 0 ) return 0;
2955         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2956     }
2957     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2958     aSig |= LIT64( 0x0010000000000000 );
2959     zSig = estimateSqrt32( aExp, aSig>>21 );
2960     aSig <<= 9 - ( aExp & 1 );
2961     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2962     if ( ( zSig & 0x1FF ) <= 5 ) {
2963         doubleZSig = zSig<<1;
2964         mul64To128( zSig, zSig, &term0, &term1 );
2965         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2966         while ( (sbits64) rem0 < 0 ) {
2967             --zSig;
2968             doubleZSig -= 2;
2969             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2970         }
2971         zSig |= ( ( rem0 | rem1 ) != 0 );
2972     }
2973     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2974
2975 }
2976
2977 /*----------------------------------------------------------------------------
2978 | Returns 1 if the double-precision floating-point value `a' is equal to the
2979 | corresponding value `b', and 0 otherwise.  The comparison is performed
2980 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2981 *----------------------------------------------------------------------------*/
2982
2983 int float64_eq( float64 a, float64 b STATUS_PARAM )
2984 {
2985
2986     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2987          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2988        ) {
2989         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2990             float_raise( float_flag_invalid STATUS_VAR);
2991         }
2992         return 0;
2993     }
2994     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2995
2996 }
2997
2998 /*----------------------------------------------------------------------------
2999 | Returns 1 if the double-precision floating-point value `a' is less than or
3000 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3001 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3002 | Arithmetic.
3003 *----------------------------------------------------------------------------*/
3004
3005 int float64_le( float64 a, float64 b STATUS_PARAM )
3006 {
3007     flag aSign, bSign;
3008
3009     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3010          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3011        ) {
3012         float_raise( float_flag_invalid STATUS_VAR);
3013         return 0;
3014     }
3015     aSign = extractFloat64Sign( a );
3016     bSign = extractFloat64Sign( b );
3017     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3018     return ( a == b ) || ( aSign ^ ( a < b ) );
3019
3020 }
3021
3022 /*----------------------------------------------------------------------------
3023 | Returns 1 if the double-precision floating-point value `a' is less than
3024 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3025 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3026 *----------------------------------------------------------------------------*/
3027
3028 int float64_lt( float64 a, float64 b STATUS_PARAM )
3029 {
3030     flag aSign, bSign;
3031
3032     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3033          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3034        ) {
3035         float_raise( float_flag_invalid STATUS_VAR);
3036         return 0;
3037     }
3038     aSign = extractFloat64Sign( a );
3039     bSign = extractFloat64Sign( b );
3040     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3041     return ( a != b ) && ( aSign ^ ( a < b ) );
3042
3043 }
3044
3045 /*----------------------------------------------------------------------------
3046 | Returns 1 if the double-precision floating-point value `a' is equal to the
3047 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3048 | if either operand is a NaN.  Otherwise, the comparison is performed
3049 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3050 *----------------------------------------------------------------------------*/
3051
3052 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3053 {
3054
3055     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3056          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3057        ) {
3058         float_raise( float_flag_invalid STATUS_VAR);
3059         return 0;
3060     }
3061     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3062
3063 }
3064
3065 /*----------------------------------------------------------------------------
3066 | Returns 1 if the double-precision floating-point value `a' is less than or
3067 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3068 | cause an exception.  Otherwise, the comparison is performed according to the
3069 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3070 *----------------------------------------------------------------------------*/
3071
3072 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3073 {
3074     flag aSign, bSign;
3075
3076     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3077          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3078        ) {
3079         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3080             float_raise( float_flag_invalid STATUS_VAR);
3081         }
3082         return 0;
3083     }
3084     aSign = extractFloat64Sign( a );
3085     bSign = extractFloat64Sign( b );
3086     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3087     return ( a == b ) || ( aSign ^ ( a < b ) );
3088
3089 }
3090
3091 /*----------------------------------------------------------------------------
3092 | Returns 1 if the double-precision floating-point value `a' is less than
3093 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3094 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3095 | Standard for Binary Floating-Point Arithmetic.
3096 *----------------------------------------------------------------------------*/
3097
3098 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3099 {
3100     flag aSign, bSign;
3101
3102     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3103          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3104        ) {
3105         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3106             float_raise( float_flag_invalid STATUS_VAR);
3107         }
3108         return 0;
3109     }
3110     aSign = extractFloat64Sign( a );
3111     bSign = extractFloat64Sign( b );
3112     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3113     return ( a != b ) && ( aSign ^ ( a < b ) );
3114
3115 }
3116
3117 #ifdef FLOATX80
3118
3119 /*----------------------------------------------------------------------------
3120 | Returns the result of converting the extended double-precision floating-
3121 | point value `a' to the 32-bit two's complement integer format.  The
3122 | conversion is performed according to the IEC/IEEE Standard for Binary
3123 | Floating-Point Arithmetic---which means in particular that the conversion
3124 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3125 | largest positive integer is returned.  Otherwise, if the conversion
3126 | overflows, the largest integer with the same sign as `a' is returned.
3127 *----------------------------------------------------------------------------*/
3128
3129 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3130 {
3131     flag aSign;
3132     int32 aExp, shiftCount;
3133     bits64 aSig;
3134
3135     aSig = extractFloatx80Frac( a );
3136     aExp = extractFloatx80Exp( a );
3137     aSign = extractFloatx80Sign( a );
3138     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3139     shiftCount = 0x4037 - aExp;
3140     if ( shiftCount <= 0 ) shiftCount = 1;
3141     shift64RightJamming( aSig, shiftCount, &aSig );
3142     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3143
3144 }
3145
3146 /*----------------------------------------------------------------------------
3147 | Returns the result of converting the extended double-precision floating-
3148 | point value `a' to the 32-bit two's complement integer format.  The
3149 | conversion is performed according to the IEC/IEEE Standard for Binary
3150 | Floating-Point Arithmetic, except that the conversion is always rounded
3151 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3152 | Otherwise, if the conversion overflows, the largest integer with the same
3153 | sign as `a' is returned.
3154 *----------------------------------------------------------------------------*/
3155
3156 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3157 {
3158     flag aSign;
3159     int32 aExp, shiftCount;
3160     bits64 aSig, savedASig;
3161     int32 z;
3162
3163     aSig = extractFloatx80Frac( a );
3164     aExp = extractFloatx80Exp( a );
3165     aSign = extractFloatx80Sign( a );
3166     if ( 0x401E < aExp ) {
3167         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3168         goto invalid;
3169     }
3170     else if ( aExp < 0x3FFF ) {
3171         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3172         return 0;
3173     }
3174     shiftCount = 0x403E - aExp;
3175     savedASig = aSig;
3176     aSig >>= shiftCount;
3177     z = aSig;
3178     if ( aSign ) z = - z;
3179     if ( ( z < 0 ) ^ aSign ) {
3180  invalid:
3181         float_raise( float_flag_invalid STATUS_VAR);
3182         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3183     }
3184     if ( ( aSig<<shiftCount ) != savedASig ) {
3185         STATUS(float_exception_flags) |= float_flag_inexact;
3186     }
3187     return z;
3188
3189 }
3190
3191 /*----------------------------------------------------------------------------
3192 | Returns the result of converting the extended double-precision floating-
3193 | point value `a' to the 64-bit two's complement integer format.  The
3194 | conversion is performed according to the IEC/IEEE Standard for Binary
3195 | Floating-Point Arithmetic---which means in particular that the conversion
3196 | is rounded according to the current rounding mode.  If `a' is a NaN,
3197 | the largest positive integer is returned.  Otherwise, if the conversion
3198 | overflows, the largest integer with the same sign as `a' is returned.
3199 *----------------------------------------------------------------------------*/
3200
3201 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3202 {
3203     flag aSign;
3204     int32 aExp, shiftCount;
3205     bits64 aSig, aSigExtra;
3206
3207     aSig = extractFloatx80Frac( a );
3208     aExp = extractFloatx80Exp( a );
3209     aSign = extractFloatx80Sign( a );
3210     shiftCount = 0x403E - aExp;
3211     if ( shiftCount <= 0 ) {
3212         if ( shiftCount ) {
3213             float_raise( float_flag_invalid STATUS_VAR);
3214             if (    ! aSign
3215                  || (    ( aExp == 0x7FFF )
3216                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3217                ) {
3218                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3219             }
3220             return (sbits64) LIT64( 0x8000000000000000 );
3221         }
3222         aSigExtra = 0;
3223     }
3224     else {
3225         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3226     }
3227     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3228
3229 }
3230
3231 /*----------------------------------------------------------------------------
3232 | Returns the result of converting the extended double-precision floating-
3233 | point value `a' to the 64-bit two's complement integer format.  The
3234 | conversion is performed according to the IEC/IEEE Standard for Binary
3235 | Floating-Point Arithmetic, except that the conversion is always rounded
3236 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3237 | Otherwise, if the conversion overflows, the largest integer with the same
3238 | sign as `a' is returned.
3239 *----------------------------------------------------------------------------*/
3240
3241 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3242 {
3243     flag aSign;
3244     int32 aExp, shiftCount;
3245     bits64 aSig;
3246     int64 z;
3247
3248     aSig = extractFloatx80Frac( a );
3249     aExp = extractFloatx80Exp( a );
3250     aSign = extractFloatx80Sign( a );
3251     shiftCount = aExp - 0x403E;
3252     if ( 0 <= shiftCount ) {
3253         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3254         if ( ( a.high != 0xC03E ) || aSig ) {
3255             float_raise( float_flag_invalid STATUS_VAR);
3256             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3257                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3258             }
3259         }
3260         return (sbits64) LIT64( 0x8000000000000000 );
3261     }
3262     else if ( aExp < 0x3FFF ) {
3263         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3264         return 0;
3265     }
3266     z = aSig>>( - shiftCount );
3267     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3268         STATUS(float_exception_flags) |= float_flag_inexact;
3269     }
3270     if ( aSign ) z = - z;
3271     return z;
3272
3273 }
3274
3275 /*----------------------------------------------------------------------------
3276 | Returns the result of converting the extended double-precision floating-
3277 | point value `a' to the single-precision floating-point format.  The
3278 | conversion is performed according to the IEC/IEEE Standard for Binary
3279 | Floating-Point Arithmetic.
3280 *----------------------------------------------------------------------------*/
3281
3282 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3283 {
3284     flag aSign;
3285     int32 aExp;
3286     bits64 aSig;
3287
3288     aSig = extractFloatx80Frac( a );
3289     aExp = extractFloatx80Exp( a );
3290     aSign = extractFloatx80Sign( a );
3291     if ( aExp == 0x7FFF ) {
3292         if ( (bits64) ( aSig<<1 ) ) {
3293             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3294         }
3295         return packFloat32( aSign, 0xFF, 0 );
3296     }
3297     shift64RightJamming( aSig, 33, &aSig );
3298     if ( aExp || aSig ) aExp -= 0x3F81;
3299     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3300
3301 }
3302
3303 /*----------------------------------------------------------------------------
3304 | Returns the result of converting the extended double-precision floating-
3305 | point value `a' to the double-precision floating-point format.  The
3306 | conversion is performed according to the IEC/IEEE Standard for Binary
3307 | Floating-Point Arithmetic.
3308 *----------------------------------------------------------------------------*/
3309
3310 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3311 {
3312     flag aSign;
3313     int32 aExp;
3314     bits64 aSig, zSig;
3315
3316     aSig = extractFloatx80Frac( a );
3317     aExp = extractFloatx80Exp( a );
3318     aSign = extractFloatx80Sign( a );
3319     if ( aExp == 0x7FFF ) {
3320         if ( (bits64) ( aSig<<1 ) ) {
3321             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3322         }
3323         return packFloat64( aSign, 0x7FF, 0 );
3324     }
3325     shift64RightJamming( aSig, 1, &zSig );
3326     if ( aExp || aSig ) aExp -= 0x3C01;
3327     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3328
3329 }
3330
3331 #ifdef FLOAT128
3332
3333 /*----------------------------------------------------------------------------
3334 | Returns the result of converting the extended double-precision floating-
3335 | point value `a' to the quadruple-precision floating-point format.  The
3336 | conversion is performed according to the IEC/IEEE Standard for Binary
3337 | Floating-Point Arithmetic.
3338 *----------------------------------------------------------------------------*/
3339
3340 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3341 {
3342     flag aSign;
3343     int16 aExp;
3344     bits64 aSig, zSig0, zSig1;
3345
3346     aSig = extractFloatx80Frac( a );
3347     aExp = extractFloatx80Exp( a );
3348     aSign = extractFloatx80Sign( a );
3349     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3350         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3351     }
3352     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3353     return packFloat128( aSign, aExp, zSig0, zSig1 );
3354
3355 }
3356
3357 #endif
3358
3359 /*----------------------------------------------------------------------------
3360 | Rounds the extended double-precision floating-point value `a' to an integer,
3361 | and returns the result as an extended quadruple-precision floating-point
3362 | value.  The operation is performed according to the IEC/IEEE Standard for
3363 | Binary Floating-Point Arithmetic.
3364 *----------------------------------------------------------------------------*/
3365
3366 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3367 {
3368     flag aSign;
3369     int32 aExp;
3370     bits64 lastBitMask, roundBitsMask;
3371     int8 roundingMode;
3372     floatx80 z;
3373
3374     aExp = extractFloatx80Exp( a );
3375     if ( 0x403E <= aExp ) {
3376         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3377             return propagateFloatx80NaN( a, a STATUS_VAR );
3378         }
3379         return a;
3380     }
3381     if ( aExp < 0x3FFF ) {
3382         if (    ( aExp == 0 )
3383              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3384             return a;
3385         }
3386         STATUS(float_exception_flags) |= float_flag_inexact;
3387         aSign = extractFloatx80Sign( a );
3388         switch ( STATUS(float_rounding_mode) ) {
3389          case float_round_nearest_even:
3390             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3391                ) {
3392                 return
3393                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3394             }
3395             break;
3396          case float_round_down:
3397             return
3398                   aSign ?
3399                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3400                 : packFloatx80( 0, 0, 0 );
3401          case float_round_up:
3402             return
3403                   aSign ? packFloatx80( 1, 0, 0 )
3404                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3405         }
3406         return packFloatx80( aSign, 0, 0 );
3407     }
3408     lastBitMask = 1;
3409     lastBitMask <<= 0x403E - aExp;
3410     roundBitsMask = lastBitMask - 1;
3411     z = a;
3412     roundingMode = STATUS(float_rounding_mode);
3413     if ( roundingMode == float_round_nearest_even ) {
3414         z.low += lastBitMask>>1;
3415         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3416     }
3417     else if ( roundingMode != float_round_to_zero ) {
3418         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3419             z.low += roundBitsMask;
3420         }
3421     }
3422     z.low &= ~ roundBitsMask;
3423     if ( z.low == 0 ) {
3424         ++z.high;
3425         z.low = LIT64( 0x8000000000000000 );
3426     }
3427     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3428     return z;
3429
3430 }
3431
3432 /*----------------------------------------------------------------------------
3433 | Returns the result of adding the absolute values of the extended double-
3434 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3435 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3436 | The addition is performed according to the IEC/IEEE Standard for Binary
3437 | Floating-Point Arithmetic.
3438 *----------------------------------------------------------------------------*/
3439
3440 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3441 {
3442     int32 aExp, bExp, zExp;
3443     bits64 aSig, bSig, zSig0, zSig1;
3444     int32 expDiff;
3445
3446     aSig = extractFloatx80Frac( a );
3447     aExp = extractFloatx80Exp( a );
3448     bSig = extractFloatx80Frac( b );
3449     bExp = extractFloatx80Exp( b );
3450     expDiff = aExp - bExp;
3451     if ( 0 < expDiff ) {
3452         if ( aExp == 0x7FFF ) {
3453             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3454             return a;
3455         }
3456         if ( bExp == 0 ) --expDiff;
3457         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3458         zExp = aExp;
3459     }
3460     else if ( expDiff < 0 ) {
3461         if ( bExp == 0x7FFF ) {
3462             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3463             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3464         }
3465         if ( aExp == 0 ) ++expDiff;
3466         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3467         zExp = bExp;
3468     }
3469     else {
3470         if ( aExp == 0x7FFF ) {
3471             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3472                 return propagateFloatx80NaN( a, b STATUS_VAR );
3473             }
3474             return a;
3475         }
3476         zSig1 = 0;
3477         zSig0 = aSig + bSig;
3478         if ( aExp == 0 ) {
3479             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3480             goto roundAndPack;
3481         }
3482         zExp = aExp;
3483         goto shiftRight1;
3484     }
3485     zSig0 = aSig + bSig;
3486     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3487  shiftRight1:
3488     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3489     zSig0 |= LIT64( 0x8000000000000000 );
3490     ++zExp;
3491  roundAndPack:
3492     return
3493         roundAndPackFloatx80(
3494             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3495
3496 }
3497
3498 /*----------------------------------------------------------------------------
3499 | Returns the result of subtracting the absolute values of the extended
3500 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3501 | difference is negated before being returned.  `zSign' is ignored if the
3502 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3503 | Standard for Binary Floating-Point Arithmetic.
3504 *----------------------------------------------------------------------------*/
3505
3506 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3507 {
3508     int32 aExp, bExp, zExp;
3509     bits64 aSig, bSig, zSig0, zSig1;
3510     int32 expDiff;
3511     floatx80 z;
3512
3513     aSig = extractFloatx80Frac( a );
3514     aExp = extractFloatx80Exp( a );
3515     bSig = extractFloatx80Frac( b );
3516     bExp = extractFloatx80Exp( b );
3517     expDiff = aExp - bExp;
3518     if ( 0 < expDiff ) goto aExpBigger;
3519     if ( expDiff < 0 ) goto bExpBigger;
3520     if ( aExp == 0x7FFF ) {
3521         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3522             return propagateFloatx80NaN( a, b STATUS_VAR );
3523         }
3524         float_raise( float_flag_invalid STATUS_VAR);
3525         z.low = floatx80_default_nan_low;
3526         z.high = floatx80_default_nan_high;
3527         return z;
3528     }
3529     if ( aExp == 0 ) {
3530         aExp = 1;
3531         bExp = 1;
3532     }
3533     zSig1 = 0;
3534     if ( bSig < aSig ) goto aBigger;
3535     if ( aSig < bSig ) goto bBigger;
3536     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3537  bExpBigger:
3538     if ( bExp == 0x7FFF ) {
3539         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3540         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3541     }
3542     if ( aExp == 0 ) ++expDiff;
3543     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3544  bBigger:
3545     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3546     zExp = bExp;
3547     zSign ^= 1;
3548     goto normalizeRoundAndPack;
3549  aExpBigger:
3550     if ( aExp == 0x7FFF ) {
3551         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3552         return a;
3553     }
3554     if ( bExp == 0 ) --expDiff;
3555     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3556  aBigger:
3557     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3558     zExp = aExp;
3559  normalizeRoundAndPack:
3560     return
3561         normalizeRoundAndPackFloatx80(
3562             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3563
3564 }
3565
3566 /*----------------------------------------------------------------------------
3567 | Returns the result of adding the extended double-precision floating-point
3568 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3569 | Standard for Binary Floating-Point Arithmetic.
3570 *----------------------------------------------------------------------------*/
3571
3572 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3573 {
3574     flag aSign, bSign;
3575
3576     aSign = extractFloatx80Sign( a );
3577     bSign = extractFloatx80Sign( b );
3578     if ( aSign == bSign ) {
3579         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3580     }
3581     else {
3582         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3583     }
3584
3585 }
3586
3587 /*----------------------------------------------------------------------------
3588 | Returns the result of subtracting the extended double-precision floating-
3589 | point values `a' and `b'.  The operation is performed according to the
3590 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3591 *----------------------------------------------------------------------------*/
3592
3593 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3594 {
3595     flag aSign, bSign;
3596
3597     aSign = extractFloatx80Sign( a );
3598     bSign = extractFloatx80Sign( b );
3599     if ( aSign == bSign ) {
3600         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3601     }
3602     else {
3603         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3604     }
3605
3606 }
3607
3608 /*----------------------------------------------------------------------------
3609 | Returns the result of multiplying the extended double-precision floating-
3610 | point values `a' and `b'.  The operation is performed according to the
3611 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3612 *----------------------------------------------------------------------------*/
3613
3614 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3615 {
3616     flag aSign, bSign, zSign;
3617     int32 aExp, bExp, zExp;
3618     bits64 aSig, bSig, zSig0, zSig1;
3619     floatx80 z;
3620
3621     aSig = extractFloatx80Frac( a );
3622     aExp = extractFloatx80Exp( a );
3623     aSign = extractFloatx80Sign( a );
3624     bSig = extractFloatx80Frac( b );
3625     bExp = extractFloatx80Exp( b );
3626     bSign = extractFloatx80Sign( b );
3627     zSign = aSign ^ bSign;
3628     if ( aExp == 0x7FFF ) {
3629         if (    (bits64) ( aSig<<1 )
3630              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3631             return propagateFloatx80NaN( a, b STATUS_VAR );
3632         }
3633         if ( ( bExp | bSig ) == 0 ) goto invalid;
3634         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3635     }
3636     if ( bExp == 0x7FFF ) {
3637         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3638         if ( ( aExp | aSig ) == 0 ) {
3639  invalid:
3640             float_raise( float_flag_invalid STATUS_VAR);
3641             z.low = floatx80_default_nan_low;
3642             z.high = floatx80_default_nan_high;
3643             return z;
3644         }
3645         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3646     }
3647     if ( aExp == 0 ) {
3648         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3649         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3650     }
3651     if ( bExp == 0 ) {
3652         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3653         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3654     }
3655     zExp = aExp + bExp - 0x3FFE;
3656     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3657     if ( 0 < (sbits64) zSig0 ) {
3658         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3659         --zExp;
3660     }
3661     return
3662         roundAndPackFloatx80(
3663             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3664
3665 }
3666
3667 /*----------------------------------------------------------------------------
3668 | Returns the result of dividing the extended double-precision floating-point
3669 | value `a' by the corresponding value `b'.  The operation is performed
3670 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3671 *----------------------------------------------------------------------------*/
3672
3673 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3674 {
3675     flag aSign, bSign, zSign;
3676     int32 aExp, bExp, zExp;
3677     bits64 aSig, bSig, zSig0, zSig1;
3678     bits64 rem0, rem1, rem2, term0, term1, term2;
3679     floatx80 z;
3680
3681     aSig = extractFloatx80Frac( a );
3682     aExp = extractFloatx80Exp( a );
3683     aSign = extractFloatx80Sign( a );
3684     bSig = extractFloatx80Frac( b );
3685     bExp = extractFloatx80Exp( b );
3686     bSign = extractFloatx80Sign( b );
3687     zSign = aSign ^ bSign;
3688     if ( aExp == 0x7FFF ) {
3689         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3690         if ( bExp == 0x7FFF ) {
3691             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3692             goto invalid;
3693         }
3694         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3695     }
3696     if ( bExp == 0x7FFF ) {
3697         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3698         return packFloatx80( zSign, 0, 0 );
3699     }
3700     if ( bExp == 0 ) {
3701         if ( bSig == 0 ) {
3702             if ( ( aExp | aSig ) == 0 ) {
3703  invalid:
3704                 float_raise( float_flag_invalid STATUS_VAR);
3705                 z.low = floatx80_default_nan_low;
3706                 z.high = floatx80_default_nan_high;
3707                 return z;
3708             }
3709             float_raise( float_flag_divbyzero STATUS_VAR);
3710             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3711         }
3712         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3713     }
3714     if ( aExp == 0 ) {
3715         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3716         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3717     }
3718     zExp = aExp - bExp + 0x3FFE;
3719     rem1 = 0;
3720     if ( bSig <= aSig ) {
3721         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3722         ++zExp;
3723     }
3724     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3725     mul64To128( bSig, zSig0, &term0, &term1 );
3726     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3727     while ( (sbits64) rem0 < 0 ) {
3728         --zSig0;
3729         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3730     }
3731     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3732     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3733         mul64To128( bSig, zSig1, &term1, &term2 );
3734         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3735         while ( (sbits64) rem1 < 0 ) {
3736             --zSig1;
3737             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3738         }
3739         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3740     }
3741     return
3742         roundAndPackFloatx80(
3743             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3744
3745 }
3746
3747 /*----------------------------------------------------------------------------
3748 | Returns the remainder of the extended double-precision floating-point value
3749 | `a' with respect to the corresponding value `b'.  The operation is performed
3750 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3751 *----------------------------------------------------------------------------*/
3752
3753 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3754 {
3755     flag aSign, bSign, zSign;
3756     int32 aExp, bExp, expDiff;
3757     bits64 aSig0, aSig1, bSig;
3758     bits64 q, term0, term1, alternateASig0, alternateASig1;
3759     floatx80 z;
3760
3761     aSig0 = extractFloatx80Frac( a );
3762     aExp = extractFloatx80Exp( a );
3763     aSign = extractFloatx80Sign( a );
3764     bSig = extractFloatx80Frac( b );
3765     bExp = extractFloatx80Exp( b );
3766     bSign = extractFloatx80Sign( b );
3767     if ( aExp == 0x7FFF ) {
3768         if (    (bits64) ( aSig0<<1 )
3769              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3770             return propagateFloatx80NaN( a, b STATUS_VAR );
3771         }
3772         goto invalid;
3773     }
3774     if ( bExp == 0x7FFF ) {
3775         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3776         return a;
3777     }
3778     if ( bExp == 0 ) {
3779         if ( bSig == 0 ) {
3780  invalid:
3781             float_raise( float_flag_invalid STATUS_VAR);
3782             z.low = floatx80_default_nan_low;
3783             z.high = floatx80_default_nan_high;
3784             return z;
3785         }
3786         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3787     }
3788     if ( aExp == 0 ) {
3789         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3790         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3791     }
3792     bSig |= LIT64( 0x8000000000000000 );
3793     zSign = aSign;
3794     expDiff = aExp - bExp;
3795     aSig1 = 0;
3796     if ( expDiff < 0 ) {
3797         if ( expDiff < -1 ) return a;
3798         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3799         expDiff = 0;
3800     }
3801     q = ( bSig <= aSig0 );
3802     if ( q ) aSig0 -= bSig;
3803     expDiff -= 64;
3804     while ( 0 < expDiff ) {
3805         q = estimateDiv128To64( aSig0, aSig1, bSig );
3806         q = ( 2 < q ) ? q - 2 : 0;
3807         mul64To128( bSig, q, &term0, &term1 );
3808         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3809         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3810         expDiff -= 62;
3811     }
3812     expDiff += 64;
3813     if ( 0 < expDiff ) {
3814         q = estimateDiv128To64( aSig0, aSig1, bSig );
3815         q = ( 2 < q ) ? q - 2 : 0;
3816         q >>= 64 - expDiff;
3817         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3818         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3819         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3820         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3821             ++q;
3822             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3823         }
3824     }
3825     else {
3826         term1 = 0;
3827         term0 = bSig;
3828     }
3829     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3830     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3831          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3832               && ( q & 1 ) )
3833        ) {
3834         aSig0 = alternateASig0;
3835         aSig1 = alternateASig1;
3836         zSign = ! zSign;
3837     }
3838     return
3839         normalizeRoundAndPackFloatx80(
3840             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3841
3842 }
3843
3844 /*----------------------------------------------------------------------------
3845 | Returns the square root of the extended double-precision floating-point
3846 | value `a'.  The operation is performed according to the IEC/IEEE Standard
3847 | for Binary Floating-Point Arithmetic.
3848 *----------------------------------------------------------------------------*/
3849
3850 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3851 {
3852     flag aSign;
3853     int32 aExp, zExp;
3854     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3855     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3856     floatx80 z;
3857
3858     aSig0 = extractFloatx80Frac( a );
3859     aExp = extractFloatx80Exp( a );
3860     aSign = extractFloatx80Sign( a );
3861     if ( aExp == 0x7FFF ) {
3862         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3863         if ( ! aSign ) return a;
3864         goto invalid;
3865     }
3866     if ( aSign ) {
3867         if ( ( aExp | aSig0 ) == 0 ) return a;
3868  invalid:
3869         float_raise( float_flag_invalid STATUS_VAR);
3870         z.low = floatx80_default_nan_low;
3871         z.high = floatx80_default_nan_high;
3872         return z;
3873     }
3874     if ( aExp == 0 ) {
3875         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3876         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3877     }
3878     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3879     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3880     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3881     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3882     doubleZSig0 = zSig0<<1;
3883     mul64To128( zSig0, zSig0, &term0, &term1 );
3884     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3885     while ( (sbits64) rem0 < 0 ) {
3886         --zSig0;
3887         doubleZSig0 -= 2;
3888         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3889     }
3890     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3891     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3892         if ( zSig1 == 0 ) zSig1 = 1;
3893         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3894         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3895         mul64To128( zSig1, zSig1, &term2, &term3 );
3896         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3897         while ( (sbits64) rem1 < 0 ) {
3898             --zSig1;
3899             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3900             term3 |= 1;
3901             term2 |= doubleZSig0;
3902             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3903         }
3904         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3905     }
3906     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3907     zSig0 |= doubleZSig0;
3908     return
3909         roundAndPackFloatx80(
3910             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3911
3912 }
3913
3914 /*----------------------------------------------------------------------------
3915 | Returns 1 if the extended double-precision floating-point value `a' is
3916 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3917 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3918 | Arithmetic.
3919 *----------------------------------------------------------------------------*/
3920
3921 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3922 {
3923
3924     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3925               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3926          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3927               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3928        ) {
3929         if (    floatx80_is_signaling_nan( a )
3930              || floatx80_is_signaling_nan( b ) ) {
3931             float_raise( float_flag_invalid STATUS_VAR);
3932         }
3933         return 0;
3934     }
3935     return
3936            ( a.low == b.low )
3937         && (    ( a.high == b.high )
3938              || (    ( a.low == 0 )
3939                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3940            );
3941
3942 }
3943
3944 /*----------------------------------------------------------------------------
3945 | Returns 1 if the extended double-precision floating-point value `a' is
3946 | less than or equal to the corresponding value `b', and 0 otherwise.  The
3947 | comparison is performed according to the IEC/IEEE Standard for Binary
3948 | Floating-Point Arithmetic.
3949 *----------------------------------------------------------------------------*/
3950
3951 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3952 {
3953     flag aSign, bSign;
3954
3955     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3956               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3957          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3958               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3959        ) {
3960         float_raise( float_flag_invalid STATUS_VAR);
3961         return 0;
3962     }
3963     aSign = extractFloatx80Sign( a );
3964     bSign = extractFloatx80Sign( b );
3965     if ( aSign != bSign ) {
3966         return
3967                aSign
3968             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3969                  == 0 );
3970     }
3971     return
3972           aSign ? le128( b.high, b.low, a.high, a.low )
3973         : le128( a.high, a.low, b.high, b.low );
3974
3975 }
3976
3977 /*----------------------------------------------------------------------------
3978 | Returns 1 if the extended double-precision floating-point value `a' is
3979 | less than the corresponding value `b', and 0 otherwise.  The comparison
3980 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3981 | Arithmetic.
3982 *----------------------------------------------------------------------------*/
3983
3984 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
3985 {
3986     flag aSign, bSign;
3987
3988     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3989               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3990          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3991               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3992        ) {
3993         float_raise( float_flag_invalid STATUS_VAR);
3994         return 0;
3995     }
3996     aSign = extractFloatx80Sign( a );
3997     bSign = extractFloatx80Sign( b );
3998     if ( aSign != bSign ) {
3999         return
4000                aSign
4001             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4002                  != 0 );
4003     }
4004     return
4005           aSign ? lt128( b.high, b.low, a.high, a.low )
4006         : lt128( a.high, a.low, b.high, b.low );
4007
4008 }
4009
4010 /*----------------------------------------------------------------------------
4011 | Returns 1 if the extended double-precision floating-point value `a' is equal
4012 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4013 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4014 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4015 *----------------------------------------------------------------------------*/
4016
4017 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4018 {
4019
4020     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4021               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4022          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4023               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4024        ) {
4025         float_raise( float_flag_invalid STATUS_VAR);
4026         return 0;
4027     }
4028     return
4029            ( a.low == b.low )
4030         && (    ( a.high == b.high )
4031              || (    ( a.low == 0 )
4032                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4033            );
4034
4035 }
4036
4037 /*----------------------------------------------------------------------------
4038 | Returns 1 if the extended double-precision floating-point value `a' is less
4039 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4040 | do not cause an exception.  Otherwise, the comparison is performed according
4041 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4042 *----------------------------------------------------------------------------*/
4043
4044 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4045 {
4046     flag aSign, bSign;
4047
4048     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4049               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4050          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4051               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4052        ) {
4053         if (    floatx80_is_signaling_nan( a )
4054              || floatx80_is_signaling_nan( b ) ) {
4055             float_raise( float_flag_invalid STATUS_VAR);
4056         }
4057         return 0;
4058     }
4059     aSign = extractFloatx80Sign( a );
4060     bSign = extractFloatx80Sign( b );
4061     if ( aSign != bSign ) {
4062         return
4063                aSign
4064             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4065                  == 0 );
4066     }
4067     return
4068           aSign ? le128( b.high, b.low, a.high, a.low )
4069         : le128( a.high, a.low, b.high, b.low );
4070
4071 }
4072
4073 /*----------------------------------------------------------------------------
4074 | Returns 1 if the extended double-precision floating-point value `a' is less
4075 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4076 | an exception.  Otherwise, the comparison is performed according to the
4077 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4078 *----------------------------------------------------------------------------*/
4079
4080 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4081 {
4082     flag aSign, bSign;
4083
4084     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4085               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4086          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4087               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4088        ) {
4089         if (    floatx80_is_signaling_nan( a )
4090              || floatx80_is_signaling_nan( b ) ) {
4091             float_raise( float_flag_invalid STATUS_VAR);
4092         }
4093         return 0;
4094     }
4095     aSign = extractFloatx80Sign( a );
4096     bSign = extractFloatx80Sign( b );
4097     if ( aSign != bSign ) {
4098         return
4099                aSign
4100             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4101                  != 0 );
4102     }
4103     return
4104           aSign ? lt128( b.high, b.low, a.high, a.low )
4105         : lt128( a.high, a.low, b.high, b.low );
4106
4107 }
4108
4109 #endif
4110
4111 #ifdef FLOAT128
4112
4113 /*----------------------------------------------------------------------------
4114 | Returns the result of converting the quadruple-precision floating-point
4115 | value `a' to the 32-bit two's complement integer format.  The conversion
4116 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4117 | Arithmetic---which means in particular that the conversion is rounded
4118 | according to the current rounding mode.  If `a' is a NaN, the largest
4119 | positive integer is returned.  Otherwise, if the conversion overflows, the
4120 | largest integer with the same sign as `a' is returned.
4121 *----------------------------------------------------------------------------*/
4122
4123 int32 float128_to_int32( float128 a STATUS_PARAM )
4124 {
4125     flag aSign;
4126     int32 aExp, shiftCount;
4127     bits64 aSig0, aSig1;
4128
4129     aSig1 = extractFloat128Frac1( a );
4130     aSig0 = extractFloat128Frac0( a );
4131     aExp = extractFloat128Exp( a );
4132     aSign = extractFloat128Sign( a );
4133     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4134     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4135     aSig0 |= ( aSig1 != 0 );
4136     shiftCount = 0x4028 - aExp;
4137     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4138     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4139
4140 }
4141
4142 /*----------------------------------------------------------------------------
4143 | Returns the result of converting the quadruple-precision floating-point
4144 | value `a' to the 32-bit two's complement integer format.  The conversion
4145 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4146 | Arithmetic, except that the conversion is always rounded toward zero.  If
4147 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4148 | conversion overflows, the largest integer with the same sign as `a' is
4149 | returned.
4150 *----------------------------------------------------------------------------*/
4151
4152 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4153 {
4154     flag aSign;
4155     int32 aExp, shiftCount;
4156     bits64 aSig0, aSig1, savedASig;
4157     int32 z;
4158
4159     aSig1 = extractFloat128Frac1( a );
4160     aSig0 = extractFloat128Frac0( a );
4161     aExp = extractFloat128Exp( a );
4162     aSign = extractFloat128Sign( a );
4163     aSig0 |= ( aSig1 != 0 );
4164     if ( 0x401E < aExp ) {
4165         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4166         goto invalid;
4167     }
4168     else if ( aExp < 0x3FFF ) {
4169         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4170         return 0;
4171     }
4172     aSig0 |= LIT64( 0x0001000000000000 );
4173     shiftCount = 0x402F - aExp;
4174     savedASig = aSig0;
4175     aSig0 >>= shiftCount;
4176     z = aSig0;
4177     if ( aSign ) z = - z;
4178     if ( ( z < 0 ) ^ aSign ) {
4179  invalid:
4180         float_raise( float_flag_invalid STATUS_VAR);
4181         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4182     }
4183     if ( ( aSig0<<shiftCount ) != savedASig ) {
4184         STATUS(float_exception_flags) |= float_flag_inexact;
4185     }
4186     return z;
4187
4188 }
4189
4190 /*----------------------------------------------------------------------------
4191 | Returns the result of converting the quadruple-precision floating-point
4192 | value `a' to the 64-bit two's complement integer format.  The conversion
4193 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4194 | Arithmetic---which means in particular that the conversion is rounded
4195 | according to the current rounding mode.  If `a' is a NaN, the largest
4196 | positive integer is returned.  Otherwise, if the conversion overflows, the
4197 | largest integer with the same sign as `a' is returned.
4198 *----------------------------------------------------------------------------*/
4199
4200 int64 float128_to_int64( float128 a STATUS_PARAM )
4201 {
4202     flag aSign;
4203     int32 aExp, shiftCount;
4204     bits64 aSig0, aSig1;
4205
4206     aSig1 = extractFloat128Frac1( a );
4207     aSig0 = extractFloat128Frac0( a );
4208     aExp = extractFloat128Exp( a );
4209     aSign = extractFloat128Sign( a );
4210     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4211     shiftCount = 0x402F - aExp;
4212     if ( shiftCount <= 0 ) {
4213         if ( 0x403E < aExp ) {
4214             float_raise( float_flag_invalid STATUS_VAR);
4215             if (    ! aSign
4216                  || (    ( aExp == 0x7FFF )
4217                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4218                     )
4219                ) {
4220                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4221             }
4222             return (sbits64) LIT64( 0x8000000000000000 );
4223         }
4224         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4225     }
4226     else {
4227         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4228     }
4229     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4230
4231 }
4232
4233 /*----------------------------------------------------------------------------
4234 | Returns the result of converting the quadruple-precision floating-point
4235 | value `a' to the 64-bit two's complement integer format.  The conversion
4236 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4237 | Arithmetic, except that the conversion is always rounded toward zero.
4238 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4239 | the conversion overflows, the largest integer with the same sign as `a' is
4240 | returned.
4241 *----------------------------------------------------------------------------*/
4242
4243 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4244 {
4245     flag aSign;
4246     int32 aExp, shiftCount;
4247     bits64 aSig0, aSig1;
4248     int64 z;
4249
4250     aSig1 = extractFloat128Frac1( a );
4251     aSig0 = extractFloat128Frac0( a );
4252     aExp = extractFloat128Exp( a );
4253     aSign = extractFloat128Sign( a );
4254     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4255     shiftCount = aExp - 0x402F;
4256     if ( 0 < shiftCount ) {
4257         if ( 0x403E <= aExp ) {
4258             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4259             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4260                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4261                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4262             }
4263             else {
4264                 float_raise( float_flag_invalid STATUS_VAR);
4265                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4266                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4267                 }
4268             }
4269             return (sbits64) LIT64( 0x8000000000000000 );
4270         }
4271         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4272         if ( (bits64) ( aSig1<<shiftCount ) ) {
4273             STATUS(float_exception_flags) |= float_flag_inexact;
4274         }
4275     }
4276     else {
4277         if ( aExp < 0x3FFF ) {
4278             if ( aExp | aSig0 | aSig1 ) {
4279                 STATUS(float_exception_flags) |= float_flag_inexact;
4280             }
4281             return 0;
4282         }
4283         z = aSig0>>( - shiftCount );
4284         if (    aSig1
4285              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4286             STATUS(float_exception_flags) |= float_flag_inexact;
4287         }
4288     }
4289     if ( aSign ) z = - z;
4290     return z;
4291
4292 }
4293
4294 /*----------------------------------------------------------------------------
4295 | Returns the result of converting the quadruple-precision floating-point
4296 | value `a' to the single-precision floating-point format.  The conversion
4297 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4298 | Arithmetic.
4299 *----------------------------------------------------------------------------*/
4300
4301 float32 float128_to_float32( float128 a STATUS_PARAM )
4302 {
4303     flag aSign;
4304     int32 aExp;
4305     bits64 aSig0, aSig1;
4306     bits32 zSig;
4307
4308     aSig1 = extractFloat128Frac1( a );
4309     aSig0 = extractFloat128Frac0( a );
4310     aExp = extractFloat128Exp( a );
4311     aSign = extractFloat128Sign( a );
4312     if ( aExp == 0x7FFF ) {
4313         if ( aSig0 | aSig1 ) {
4314             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4315         }
4316         return packFloat32( aSign, 0xFF, 0 );
4317     }
4318     aSig0 |= ( aSig1 != 0 );
4319     shift64RightJamming( aSig0, 18, &aSig0 );
4320     zSig = aSig0;
4321     if ( aExp || zSig ) {
4322         zSig |= 0x40000000;
4323         aExp -= 0x3F81;
4324     }
4325     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4326
4327 }
4328
4329 /*----------------------------------------------------------------------------
4330 | Returns the result of converting the quadruple-precision floating-point
4331 | value `a' to the double-precision floating-point format.  The conversion
4332 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4333 | Arithmetic.
4334 *----------------------------------------------------------------------------*/
4335
4336 float64 float128_to_float64( float128 a STATUS_PARAM )
4337 {
4338     flag aSign;
4339     int32 aExp;
4340     bits64 aSig0, aSig1;
4341
4342     aSig1 = extractFloat128Frac1( a );
4343     aSig0 = extractFloat128Frac0( a );
4344     aExp = extractFloat128Exp( a );
4345     aSign = extractFloat128Sign( a );
4346     if ( aExp == 0x7FFF ) {
4347         if ( aSig0 | aSig1 ) {
4348             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4349         }
4350         return packFloat64( aSign, 0x7FF, 0 );
4351     }
4352     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4353     aSig0 |= ( aSig1 != 0 );
4354     if ( aExp || aSig0 ) {
4355         aSig0 |= LIT64( 0x4000000000000000 );
4356         aExp -= 0x3C01;
4357     }
4358     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4359
4360 }
4361
4362 #ifdef FLOATX80
4363
4364 /*----------------------------------------------------------------------------
4365 | Returns the result of converting the quadruple-precision floating-point
4366 | value `a' to the extended double-precision floating-point format.  The
4367 | conversion is performed according to the IEC/IEEE Standard for Binary
4368 | Floating-Point Arithmetic.
4369 *----------------------------------------------------------------------------*/
4370
4371 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4372 {
4373     flag aSign;
4374     int32 aExp;
4375     bits64 aSig0, aSig1;
4376
4377     aSig1 = extractFloat128Frac1( a );
4378     aSig0 = extractFloat128Frac0( a );
4379     aExp = extractFloat128Exp( a );
4380     aSign = extractFloat128Sign( a );
4381     if ( aExp == 0x7FFF ) {
4382         if ( aSig0 | aSig1 ) {
4383             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4384         }
4385         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4386     }
4387     if ( aExp == 0 ) {
4388         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4389         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4390     }
4391     else {
4392         aSig0 |= LIT64( 0x0001000000000000 );
4393     }
4394     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4395     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4396
4397 }
4398
4399 #endif
4400
4401 /*----------------------------------------------------------------------------
4402 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4403 | returns the result as a quadruple-precision floating-point value.  The
4404 | operation is performed according to the IEC/IEEE Standard for Binary
4405 | Floating-Point Arithmetic.
4406 *----------------------------------------------------------------------------*/
4407
4408 float128 float128_round_to_int( float128 a STATUS_PARAM )
4409 {
4410     flag aSign;
4411     int32 aExp;
4412     bits64 lastBitMask, roundBitsMask;
4413     int8 roundingMode;
4414     float128 z;
4415
4416     aExp = extractFloat128Exp( a );
4417     if ( 0x402F <= aExp ) {
4418         if ( 0x406F <= aExp ) {
4419             if (    ( aExp == 0x7FFF )
4420                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4421                ) {
4422                 return propagateFloat128NaN( a, a STATUS_VAR );
4423             }
4424             return a;
4425         }
4426         lastBitMask = 1;
4427         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4428         roundBitsMask = lastBitMask - 1;
4429         z = a;
4430         roundingMode = STATUS(float_rounding_mode);
4431         if ( roundingMode == float_round_nearest_even ) {
4432             if ( lastBitMask ) {
4433                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4434                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4435             }
4436             else {
4437                 if ( (sbits64) z.low < 0 ) {
4438                     ++z.high;
4439                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4440                 }
4441             }
4442         }
4443         else if ( roundingMode != float_round_to_zero ) {
4444             if (   extractFloat128Sign( z )
4445                  ^ ( roundingMode == float_round_up ) ) {
4446                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4447             }
4448         }
4449         z.low &= ~ roundBitsMask;
4450     }
4451     else {
4452         if ( aExp < 0x3FFF ) {
4453             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4454             STATUS(float_exception_flags) |= float_flag_inexact;
4455             aSign = extractFloat128Sign( a );
4456             switch ( STATUS(float_rounding_mode) ) {
4457              case float_round_nearest_even:
4458                 if (    ( aExp == 0x3FFE )
4459                      && (   extractFloat128Frac0( a )
4460                           | extractFloat128Frac1( a ) )
4461                    ) {
4462                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4463                 }
4464                 break;
4465              case float_round_down:
4466                 return
4467                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4468                     : packFloat128( 0, 0, 0, 0 );
4469              case float_round_up:
4470                 return
4471                       aSign ? packFloat128( 1, 0, 0, 0 )
4472                     : packFloat128( 0, 0x3FFF, 0, 0 );
4473             }
4474             return packFloat128( aSign, 0, 0, 0 );
4475         }
4476         lastBitMask = 1;
4477         lastBitMask <<= 0x402F - aExp;
4478         roundBitsMask = lastBitMask - 1;
4479         z.low = 0;
4480         z.high = a.high;
4481         roundingMode = STATUS(float_rounding_mode);
4482         if ( roundingMode == float_round_nearest_even ) {
4483             z.high += lastBitMask>>1;
4484             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4485                 z.high &= ~ lastBitMask;
4486             }
4487         }
4488         else if ( roundingMode != float_round_to_zero ) {
4489             if (   extractFloat128Sign( z )
4490                  ^ ( roundingMode == float_round_up ) ) {
4491                 z.high |= ( a.low != 0 );
4492                 z.high += roundBitsMask;
4493             }
4494         }
4495         z.high &= ~ roundBitsMask;
4496     }
4497     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4498         STATUS(float_exception_flags) |= float_flag_inexact;
4499     }
4500     return z;
4501
4502 }
4503
4504 /*----------------------------------------------------------------------------
4505 | Returns the result of adding the absolute values of the quadruple-precision
4506 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4507 | before being returned.  `zSign' is ignored if the result is a NaN.
4508 | The addition is performed according to the IEC/IEEE Standard for Binary
4509 | Floating-Point Arithmetic.
4510 *----------------------------------------------------------------------------*/
4511
4512 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4513 {
4514     int32 aExp, bExp, zExp;
4515     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4516     int32 expDiff;
4517
4518     aSig1 = extractFloat128Frac1( a );
4519     aSig0 = extractFloat128Frac0( a );
4520     aExp = extractFloat128Exp( a );
4521     bSig1 = extractFloat128Frac1( b );
4522     bSig0 = extractFloat128Frac0( b );
4523     bExp = extractFloat128Exp( b );
4524     expDiff = aExp - bExp;
4525     if ( 0 < expDiff ) {
4526         if ( aExp == 0x7FFF ) {
4527             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4528             return a;
4529         }
4530         if ( bExp == 0 ) {
4531             --expDiff;
4532         }
4533         else {
4534             bSig0 |= LIT64( 0x0001000000000000 );
4535         }
4536         shift128ExtraRightJamming(
4537             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4538         zExp = aExp;
4539     }
4540     else if ( expDiff < 0 ) {
4541         if ( bExp == 0x7FFF ) {
4542             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4543             return packFloat128( zSign, 0x7FFF, 0, 0 );
4544         }
4545         if ( aExp == 0 ) {
4546             ++expDiff;
4547         }
4548         else {
4549             aSig0 |= LIT64( 0x0001000000000000 );
4550         }
4551         shift128ExtraRightJamming(
4552             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4553         zExp = bExp;
4554     }
4555     else {
4556         if ( aExp == 0x7FFF ) {
4557             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4558                 return propagateFloat128NaN( a, b STATUS_VAR );
4559             }
4560             return a;
4561         }
4562         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4563         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4564         zSig2 = 0;
4565         zSig0 |= LIT64( 0x0002000000000000 );
4566         zExp = aExp;
4567         goto shiftRight1;
4568     }
4569     aSig0 |= LIT64( 0x0001000000000000 );
4570     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4571     --zExp;
4572     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4573     ++zExp;
4574  shiftRight1:
4575     shift128ExtraRightJamming(
4576         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4577  roundAndPack:
4578     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4579
4580 }
4581
4582 /*----------------------------------------------------------------------------
4583 | Returns the result of subtracting the absolute values of the quadruple-
4584 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
4585 | difference is negated before being returned.  `zSign' is ignored if the
4586 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4587 | Standard for Binary Floating-Point Arithmetic.
4588 *----------------------------------------------------------------------------*/
4589
4590 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4591 {
4592     int32 aExp, bExp, zExp;
4593     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4594     int32 expDiff;
4595     float128 z;
4596
4597     aSig1 = extractFloat128Frac1( a );
4598     aSig0 = extractFloat128Frac0( a );
4599     aExp = extractFloat128Exp( a );
4600     bSig1 = extractFloat128Frac1( b );
4601     bSig0 = extractFloat128Frac0( b );
4602     bExp = extractFloat128Exp( b );
4603     expDiff = aExp - bExp;
4604     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4605     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4606     if ( 0 < expDiff ) goto aExpBigger;
4607     if ( expDiff < 0 ) goto bExpBigger;
4608     if ( aExp == 0x7FFF ) {
4609         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4610             return propagateFloat128NaN( a, b STATUS_VAR );
4611         }
4612         float_raise( float_flag_invalid STATUS_VAR);
4613         z.low = float128_default_nan_low;
4614         z.high = float128_default_nan_high;
4615         return z;
4616     }
4617     if ( aExp == 0 ) {
4618         aExp = 1;
4619         bExp = 1;
4620     }
4621     if ( bSig0 < aSig0 ) goto aBigger;
4622     if ( aSig0 < bSig0 ) goto bBigger;
4623     if ( bSig1 < aSig1 ) goto aBigger;
4624     if ( aSig1 < bSig1 ) goto bBigger;
4625     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4626  bExpBigger:
4627     if ( bExp == 0x7FFF ) {
4628         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4629         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4630     }
4631     if ( aExp == 0 ) {
4632         ++expDiff;
4633     }
4634     else {
4635         aSig0 |= LIT64( 0x4000000000000000 );
4636     }
4637     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4638     bSig0 |= LIT64( 0x4000000000000000 );
4639  bBigger:
4640     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4641     zExp = bExp;
4642     zSign ^= 1;
4643     goto normalizeRoundAndPack;
4644  aExpBigger:
4645     if ( aExp == 0x7FFF ) {
4646         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4647         return a;
4648     }
4649     if ( bExp == 0 ) {
4650         --expDiff;
4651     }
4652     else {
4653         bSig0 |= LIT64( 0x4000000000000000 );
4654     }
4655     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4656     aSig0 |= LIT64( 0x4000000000000000 );
4657  aBigger:
4658     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4659     zExp = aExp;
4660  normalizeRoundAndPack:
4661     --zExp;
4662     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4663
4664 }
4665
4666 /*----------------------------------------------------------------------------
4667 | Returns the result of adding the quadruple-precision floating-point values
4668 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4669 | for Binary Floating-Point Arithmetic.
4670 *----------------------------------------------------------------------------*/
4671
4672 float128 float128_add( float128 a, float128 b STATUS_PARAM )
4673 {
4674     flag aSign, bSign;
4675
4676     aSign = extractFloat128Sign( a );
4677     bSign = extractFloat128Sign( b );
4678     if ( aSign == bSign ) {
4679         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4680     }
4681     else {
4682         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4683     }
4684
4685 }
4686
4687 /*----------------------------------------------------------------------------
4688 | Returns the result of subtracting the quadruple-precision floating-point
4689 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4690 | Standard for Binary Floating-Point Arithmetic.
4691 *----------------------------------------------------------------------------*/
4692
4693 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4694 {
4695     flag aSign, bSign;
4696
4697     aSign = extractFloat128Sign( a );
4698     bSign = extractFloat128Sign( b );
4699     if ( aSign == bSign ) {
4700         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4701     }
4702     else {
4703         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4704     }
4705
4706 }
4707
4708 /*----------------------------------------------------------------------------
4709 | Returns the result of multiplying the quadruple-precision floating-point
4710 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4711 | Standard for Binary Floating-Point Arithmetic.
4712 *----------------------------------------------------------------------------*/
4713
4714 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4715 {
4716     flag aSign, bSign, zSign;
4717     int32 aExp, bExp, zExp;
4718     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4719     float128 z;
4720
4721     aSig1 = extractFloat128Frac1( a );
4722     aSig0 = extractFloat128Frac0( a );
4723     aExp = extractFloat128Exp( a );
4724     aSign = extractFloat128Sign( a );
4725     bSig1 = extractFloat128Frac1( b );
4726     bSig0 = extractFloat128Frac0( b );
4727     bExp = extractFloat128Exp( b );
4728     bSign = extractFloat128Sign( b );
4729     zSign = aSign ^ bSign;
4730     if ( aExp == 0x7FFF ) {
4731         if (    ( aSig0 | aSig1 )
4732              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4733             return propagateFloat128NaN( a, b STATUS_VAR );
4734         }
4735         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4736         return packFloat128( zSign, 0x7FFF, 0, 0 );
4737     }
4738     if ( bExp == 0x7FFF ) {
4739         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4740         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4741  invalid:
4742             float_raise( float_flag_invalid STATUS_VAR);
4743             z.low = float128_default_nan_low;
4744             z.high = float128_default_nan_high;
4745             return z;
4746         }
4747         return packFloat128( zSign, 0x7FFF, 0, 0 );
4748     }
4749     if ( aExp == 0 ) {
4750         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4751         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4752     }
4753     if ( bExp == 0 ) {
4754         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4755         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4756     }
4757     zExp = aExp + bExp - 0x4000;
4758     aSig0 |= LIT64( 0x0001000000000000 );
4759     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4760     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4761     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4762     zSig2 |= ( zSig3 != 0 );
4763     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4764         shift128ExtraRightJamming(
4765             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4766         ++zExp;
4767     }
4768     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4769
4770 }
4771
4772 /*----------------------------------------------------------------------------
4773 | Returns the result of dividing the quadruple-precision floating-point value
4774 | `a' by the corresponding value `b'.  The operation is performed according to
4775 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4776 *----------------------------------------------------------------------------*/
4777
4778 float128 float128_div( float128 a, float128 b STATUS_PARAM )
4779 {
4780     flag aSign, bSign, zSign;
4781     int32 aExp, bExp, zExp;
4782     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4783     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4784     float128 z;
4785
4786     aSig1 = extractFloat128Frac1( a );
4787     aSig0 = extractFloat128Frac0( a );
4788     aExp = extractFloat128Exp( a );
4789     aSign = extractFloat128Sign( a );
4790     bSig1 = extractFloat128Frac1( b );
4791     bSig0 = extractFloat128Frac0( b );
4792     bExp = extractFloat128Exp( b );
4793     bSign = extractFloat128Sign( b );
4794     zSign = aSign ^ bSign;
4795     if ( aExp == 0x7FFF ) {
4796         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4797         if ( bExp == 0x7FFF ) {
4798             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4799             goto invalid;
4800         }
4801         return packFloat128( zSign, 0x7FFF, 0, 0 );
4802     }
4803     if ( bExp == 0x7FFF ) {
4804         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4805         return packFloat128( zSign, 0, 0, 0 );
4806     }
4807     if ( bExp == 0 ) {
4808         if ( ( bSig0 | bSig1 ) == 0 ) {
4809             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4810  invalid:
4811                 float_raise( float_flag_invalid STATUS_VAR);
4812                 z.low = float128_default_nan_low;
4813                 z.high = float128_default_nan_high;
4814                 return z;
4815             }
4816             float_raise( float_flag_divbyzero STATUS_VAR);
4817             return packFloat128( zSign, 0x7FFF, 0, 0 );
4818         }
4819         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4820     }
4821     if ( aExp == 0 ) {
4822         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4823         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4824     }
4825     zExp = aExp - bExp + 0x3FFD;
4826     shortShift128Left(
4827         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4828     shortShift128Left(
4829         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4830     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4831         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4832         ++zExp;
4833     }
4834     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4835     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4836     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4837     while ( (sbits64) rem0 < 0 ) {
4838         --zSig0;
4839         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4840     }
4841     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4842     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4843         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4844         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4845         while ( (sbits64) rem1 < 0 ) {
4846             --zSig1;
4847             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4848         }
4849         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4850     }
4851     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4852     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4853
4854 }
4855
4856 /*----------------------------------------------------------------------------
4857 | Returns the remainder of the quadruple-precision floating-point value `a'
4858 | with respect to the corresponding value `b'.  The operation is performed
4859 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4860 *----------------------------------------------------------------------------*/
4861
4862 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4863 {
4864     flag aSign, bSign, zSign;
4865     int32 aExp, bExp, expDiff;
4866     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4867     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4868     sbits64 sigMean0;
4869     float128 z;
4870
4871     aSig1 = extractFloat128Frac1( a );
4872     aSig0 = extractFloat128Frac0( a );
4873     aExp = extractFloat128Exp( a );
4874     aSign = extractFloat128Sign( a );
4875     bSig1 = extractFloat128Frac1( b );
4876     bSig0 = extractFloat128Frac0( b );
4877     bExp = extractFloat128Exp( b );
4878     bSign = extractFloat128Sign( b );
4879     if ( aExp == 0x7FFF ) {
4880         if (    ( aSig0 | aSig1 )
4881              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4882             return propagateFloat128NaN( a, b STATUS_VAR );
4883         }
4884         goto invalid;
4885     }
4886     if ( bExp == 0x7FFF ) {
4887         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4888         return a;
4889     }
4890     if ( bExp == 0 ) {
4891         if ( ( bSig0 | bSig1 ) == 0 ) {
4892  invalid:
4893             float_raise( float_flag_invalid STATUS_VAR);
4894             z.low = float128_default_nan_low;
4895             z.high = float128_default_nan_high;
4896             return z;
4897         }
4898         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4899     }
4900     if ( aExp == 0 ) {
4901         if ( ( aSig0 | aSig1 ) == 0 ) return a;
4902         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4903     }
4904     expDiff = aExp - bExp;
4905     if ( expDiff < -1 ) return a;
4906     shortShift128Left(
4907         aSig0 | LIT64( 0x0001000000000000 ),
4908         aSig1,
4909         15 - ( expDiff < 0 ),
4910         &aSig0,
4911         &aSig1
4912     );
4913     shortShift128Left(
4914         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4915     q = le128( bSig0, bSig1, aSig0, aSig1 );
4916     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4917     expDiff -= 64;
4918     while ( 0 < expDiff ) {
4919         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4920         q = ( 4 < q ) ? q - 4 : 0;
4921         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4922         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4923         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4924         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4925         expDiff -= 61;
4926     }
4927     if ( -64 < expDiff ) {
4928         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4929         q = ( 4 < q ) ? q - 4 : 0;
4930         q >>= - expDiff;
4931         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4932         expDiff += 52;
4933         if ( expDiff < 0 ) {
4934             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4935         }
4936         else {
4937             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4938         }
4939         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4940         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4941     }
4942     else {
4943         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4944         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4945     }
4946     do {
4947         alternateASig0 = aSig0;
4948         alternateASig1 = aSig1;
4949         ++q;
4950         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4951     } while ( 0 <= (sbits64) aSig0 );
4952     add128(
4953         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4954     if (    ( sigMean0 < 0 )
4955          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4956         aSig0 = alternateASig0;
4957         aSig1 = alternateASig1;
4958     }
4959     zSign = ( (sbits64) aSig0 < 0 );
4960     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4961     return
4962         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
4963
4964 }
4965
4966 /*----------------------------------------------------------------------------
4967 | Returns the square root of the quadruple-precision floating-point value `a'.
4968 | The operation is performed according to the IEC/IEEE Standard for Binary
4969 | Floating-Point Arithmetic.
4970 *----------------------------------------------------------------------------*/
4971
4972 float128 float128_sqrt( float128 a STATUS_PARAM )
4973 {
4974     flag aSign;
4975     int32 aExp, zExp;
4976     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4977     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4978     float128 z;
4979
4980     aSig1 = extractFloat128Frac1( a );
4981     aSig0 = extractFloat128Frac0( a );
4982     aExp = extractFloat128Exp( a );
4983     aSign = extractFloat128Sign( a );
4984     if ( aExp == 0x7FFF ) {
4985         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
4986         if ( ! aSign ) return a;
4987         goto invalid;
4988     }
4989     if ( aSign ) {
4990         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4991  invalid:
4992         float_raise( float_flag_invalid STATUS_VAR);
4993         z.low = float128_default_nan_low;
4994         z.high = float128_default_nan_high;
4995         return z;
4996     }
4997     if ( aExp == 0 ) {
4998         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4999         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5000     }
5001     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5002     aSig0 |= LIT64( 0x0001000000000000 );
5003     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5004     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5005     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5006     doubleZSig0 = zSig0<<1;
5007     mul64To128( zSig0, zSig0, &term0, &term1 );
5008     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5009     while ( (sbits64) rem0 < 0 ) {
5010         --zSig0;
5011         doubleZSig0 -= 2;
5012         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5013     }
5014     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5015     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5016         if ( zSig1 == 0 ) zSig1 = 1;
5017         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5018         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5019         mul64To128( zSig1, zSig1, &term2, &term3 );
5020         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5021         while ( (sbits64) rem1 < 0 ) {
5022             --zSig1;
5023             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5024             term3 |= 1;
5025             term2 |= doubleZSig0;
5026             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5027         }
5028         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5029     }
5030     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5031     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5032
5033 }
5034
5035 /*----------------------------------------------------------------------------
5036 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5037 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5038 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5039 *----------------------------------------------------------------------------*/
5040
5041 int float128_eq( float128 a, float128 b STATUS_PARAM )
5042 {
5043
5044     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5045               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5046          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5047               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5048        ) {
5049         if (    float128_is_signaling_nan( a )
5050              || float128_is_signaling_nan( b ) ) {
5051             float_raise( float_flag_invalid STATUS_VAR);
5052         }
5053         return 0;
5054     }
5055     return
5056            ( a.low == b.low )
5057         && (    ( a.high == b.high )
5058              || (    ( a.low == 0 )
5059                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5060            );
5061
5062 }
5063
5064 /*----------------------------------------------------------------------------
5065 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5066 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5067 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5068 | Arithmetic.
5069 *----------------------------------------------------------------------------*/
5070
5071 int float128_le( float128 a, float128 b STATUS_PARAM )
5072 {
5073     flag aSign, bSign;
5074
5075     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5076               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5077          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5078               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5079        ) {
5080         float_raise( float_flag_invalid STATUS_VAR);
5081         return 0;
5082     }
5083     aSign = extractFloat128Sign( a );
5084     bSign = extractFloat128Sign( b );
5085     if ( aSign != bSign ) {
5086         return
5087                aSign
5088             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5089                  == 0 );
5090     }
5091     return
5092           aSign ? le128( b.high, b.low, a.high, a.low )
5093         : le128( a.high, a.low, b.high, b.low );
5094
5095 }
5096
5097 /*----------------------------------------------------------------------------
5098 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5099 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5100 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5101 *----------------------------------------------------------------------------*/
5102
5103 int float128_lt( float128 a, float128 b STATUS_PARAM )
5104 {
5105     flag aSign, bSign;
5106
5107     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5108               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5109          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5110               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5111        ) {
5112         float_raise( float_flag_invalid STATUS_VAR);
5113         return 0;
5114     }
5115     aSign = extractFloat128Sign( a );
5116     bSign = extractFloat128Sign( b );
5117     if ( aSign != bSign ) {
5118         return
5119                aSign
5120             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5121                  != 0 );
5122     }
5123     return
5124           aSign ? lt128( b.high, b.low, a.high, a.low )
5125         : lt128( a.high, a.low, b.high, b.low );
5126
5127 }
5128
5129 /*----------------------------------------------------------------------------
5130 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5131 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5132 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5133 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5134 *----------------------------------------------------------------------------*/
5135
5136 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5137 {
5138
5139     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5140               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5141          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5142               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5143        ) {
5144         float_raise( float_flag_invalid STATUS_VAR);
5145         return 0;
5146     }
5147     return
5148            ( a.low == b.low )
5149         && (    ( a.high == b.high )
5150              || (    ( a.low == 0 )
5151                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5152            );
5153
5154 }
5155
5156 /*----------------------------------------------------------------------------
5157 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5158 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5159 | cause an exception.  Otherwise, the comparison is performed according to the
5160 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5161 *----------------------------------------------------------------------------*/
5162
5163 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5164 {
5165     flag aSign, bSign;
5166
5167     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5168               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5169          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5170               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5171        ) {
5172         if (    float128_is_signaling_nan( a )
5173              || float128_is_signaling_nan( b ) ) {
5174             float_raise( float_flag_invalid STATUS_VAR);
5175         }
5176         return 0;
5177     }
5178     aSign = extractFloat128Sign( a );
5179     bSign = extractFloat128Sign( b );
5180     if ( aSign != bSign ) {
5181         return
5182                aSign
5183             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5184                  == 0 );
5185     }
5186     return
5187           aSign ? le128( b.high, b.low, a.high, a.low )
5188         : le128( a.high, a.low, b.high, b.low );
5189
5190 }
5191
5192 /*----------------------------------------------------------------------------
5193 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5194 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5195 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5196 | Standard for Binary Floating-Point Arithmetic.
5197 *----------------------------------------------------------------------------*/
5198
5199 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5200 {
5201     flag aSign, bSign;
5202
5203     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5204               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5205          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5206               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5207        ) {
5208         if (    float128_is_signaling_nan( a )
5209              || float128_is_signaling_nan( b ) ) {
5210             float_raise( float_flag_invalid STATUS_VAR);
5211         }
5212         return 0;
5213     }
5214     aSign = extractFloat128Sign( a );
5215     bSign = extractFloat128Sign( b );
5216     if ( aSign != bSign ) {
5217         return
5218                aSign
5219             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5220                  != 0 );
5221     }
5222     return
5223           aSign ? lt128( b.high, b.low, a.high, a.low )
5224         : lt128( a.high, a.low, b.high, b.low );
5225
5226 }
5227
5228 #endif
5229
5230 /* misc functions */
5231 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5232 {
5233     return int64_to_float32(a STATUS_VAR);
5234 }
5235
5236 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5237 {
5238     return int64_to_float64(a STATUS_VAR);
5239 }
5240
5241 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5242 {
5243     int64_t v;
5244     unsigned int res;
5245
5246     v = float32_to_int64(a STATUS_VAR);
5247     if (v < 0) {
5248         res = 0;
5249         float_raise( float_flag_invalid STATUS_VAR);
5250     } else if (v > 0xffffffff) {
5251         res = 0xffffffff;
5252         float_raise( float_flag_invalid STATUS_VAR);
5253     } else {
5254         res = v;
5255     }
5256     return res;
5257 }
5258
5259 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5260 {
5261     int64_t v;
5262     unsigned int res;
5263
5264     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5265     if (v < 0) {
5266         res = 0;
5267         float_raise( float_flag_invalid STATUS_VAR);
5268     } else if (v > 0xffffffff) {
5269         res = 0xffffffff;
5270         float_raise( float_flag_invalid STATUS_VAR);
5271     } else {
5272         res = v;
5273     }
5274     return res;
5275 }
5276
5277 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5278 {
5279     int64_t v;
5280     unsigned int res;
5281
5282     v = float64_to_int64(a STATUS_VAR);
5283     if (v < 0) {
5284         res = 0;
5285         float_raise( float_flag_invalid STATUS_VAR);
5286     } else if (v > 0xffffffff) {
5287         res = 0xffffffff;
5288         float_raise( float_flag_invalid STATUS_VAR);
5289     } else {
5290         res = v;
5291     }
5292     return res;
5293 }
5294
5295 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5296 {
5297     int64_t v;
5298     unsigned int res;
5299
5300     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5301     if (v < 0) {
5302         res = 0;
5303         float_raise( float_flag_invalid STATUS_VAR);
5304     } else if (v > 0xffffffff) {
5305         res = 0xffffffff;
5306         float_raise( float_flag_invalid STATUS_VAR);
5307     } else {
5308         res = v;
5309     }
5310     return res;
5311 }
5312
5313 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5314 {
5315     int64_t v;
5316
5317     v = int64_to_float64(INT64_MIN STATUS_VAR);
5318     v = float64_to_int64((a + v) STATUS_VAR);
5319
5320     return v - INT64_MIN;
5321 }
5322
5323 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5324 {
5325     int64_t v;
5326
5327     v = int64_to_float64(INT64_MIN STATUS_VAR);
5328     v = float64_to_int64_round_to_zero((a + v) STATUS_VAR);
5329
5330     return v - INT64_MIN;
5331 }
5332
5333 #define COMPARE(s, nan_exp)                                                  \
5334 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5335                                       int is_quiet STATUS_PARAM )            \
5336 {                                                                            \
5337     flag aSign, bSign;                                                       \
5338                                                                              \
5339     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5340          extractFloat ## s ## Frac( a ) ) ||                                 \
5341         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5342           extractFloat ## s ## Frac( b ) )) {                                \
5343         if (!is_quiet ||                                                     \
5344             float ## s ## _is_signaling_nan( a ) ||                          \
5345             float ## s ## _is_signaling_nan( b ) ) {                         \
5346             float_raise( float_flag_invalid STATUS_VAR);                     \
5347         }                                                                    \
5348         return float_relation_unordered;                                     \
5349     }                                                                        \
5350     aSign = extractFloat ## s ## Sign( a );                                  \
5351     bSign = extractFloat ## s ## Sign( b );                                  \
5352     if ( aSign != bSign ) {                                                  \
5353         if ( (bits ## s) ( ( a | b )<<1 ) == 0 ) {                           \
5354             /* zero case */                                                  \
5355             return float_relation_equal;                                     \
5356         } else {                                                             \
5357             return 1 - (2 * aSign);                                          \
5358         }                                                                    \
5359     } else {                                                                 \
5360         if (a == b) {                                                        \
5361             return float_relation_equal;                                     \
5362         } else {                                                             \
5363             return 1 - 2 * (aSign ^ ( a < b ));                              \
5364         }                                                                    \
5365     }                                                                        \
5366 }                                                                            \
5367                                                                              \
5368 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5369 {                                                                            \
5370     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5371 }                                                                            \
5372                                                                              \
5373 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5374 {                                                                            \
5375     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5376 }
5377
5378 COMPARE(32, 0xff)
5379 COMPARE(64, 0x7ff)