fpu/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 #include "softfloat.h"
  34
  35 /*----------------------------------------------------------------------------
  36 | Primitive arithmetic functions, including multi-word arithmetic, and
  37 | division and square root approximations.  (Can be specialized to target if
  38 | desired.)
  39 *----------------------------------------------------------------------------*/
  40 #include "softfloat-macros.h"
  41
  42 /*----------------------------------------------------------------------------
  43 | Functions and definitions to determine:  (1) whether tininess for underflow
  44 | is detected before or after rounding by default, (2) what (if anything)
  45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  47 | are propagated from function inputs to output.  These details are target-
  48 | specific.
  49 *----------------------------------------------------------------------------*/
  50 #include "softfloat-specialize.h"
  51
  52 void set_float_rounding_mode(int val STATUS_PARAM)
  53 {
  54     STATUS(float_rounding_mode) = val;
  55 }
  56
  57 void set_float_exception_flags(int val STATUS_PARAM)
  58 {
  59     STATUS(float_exception_flags) = val;
  60 }
  61
  62 #ifdef FLOATX80
  63 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  64 {
  65     STATUS(floatx80_rounding_precision) = val;
  66 }
  67 #endif
  68
  69 /*----------------------------------------------------------------------------
  70 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  71 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  72 | input.  If `zSign' is 1, the input is negated before being converted to an
  73 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  74 | is simply rounded to an integer, with the inexact exception raised if the
  75 | input cannot be represented exactly as an integer.  However, if the fixed-
  76 | point input is too large, the invalid exception is raised and the largest
  77 | positive or negative integer is returned.
  78 *----------------------------------------------------------------------------*/
  79
  80 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
  81 {
  82     int8 roundingMode;
  83     flag roundNearestEven;
  84     int8 roundIncrement, roundBits;
  85     int32 z;
  86
  87     roundingMode = STATUS(float_rounding_mode);
  88     roundNearestEven = ( roundingMode == float_round_nearest_even );
  89     roundIncrement = 0x40;
  90     if ( ! roundNearestEven ) {
  91         if ( roundingMode == float_round_to_zero ) {
  92             roundIncrement = 0;
  93         }
  94         else {
  95             roundIncrement = 0x7F;
  96             if ( zSign ) {
  97                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  98             }
  99             else {
 100                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 101             }
 102         }
 103     }
 104     roundBits = absZ & 0x7F;
 105     absZ = ( absZ + roundIncrement )>>7;
 106     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 107     z = absZ;
 108     if ( zSign ) z = - z;
 109     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 110         float_raise( float_flag_invalid STATUS_VAR);
 111         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 112     }
 113     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 114     return z;
 115
 116 }
 117
 118 /*----------------------------------------------------------------------------
 119 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 120 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 121 | and returns the properly rounded 64-bit integer corresponding to the input.
 122 | If `zSign' is 1, the input is negated before being converted to an integer.
 123 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 124 | the inexact exception raised if the input cannot be represented exactly as
 125 | an integer.  However, if the fixed-point input is too large, the invalid
 126 | exception is raised and the largest positive or negative integer is
 127 | returned.
 128 *----------------------------------------------------------------------------*/
 129
 130 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 131 {
 132     int8 roundingMode;
 133     flag roundNearestEven, increment;
 134     int64 z;
 135
 136     roundingMode = STATUS(float_rounding_mode);
 137     roundNearestEven = ( roundingMode == float_round_nearest_even );
 138     increment = ( (sbits64) absZ1 < 0 );
 139     if ( ! roundNearestEven ) {
 140         if ( roundingMode == float_round_to_zero ) {
 141             increment = 0;
 142         }
 143         else {
 144             if ( zSign ) {
 145                 increment = ( roundingMode == float_round_down ) && absZ1;
 146             }
 147             else {
 148                 increment = ( roundingMode == float_round_up ) && absZ1;
 149             }
 150         }
 151     }
 152     if ( increment ) {
 153         ++absZ0;
 154         if ( absZ0 == 0 ) goto overflow;
 155         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 156     }
 157     z = absZ0;
 158     if ( zSign ) z = - z;
 159     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 160  overflow:
 161         float_raise( float_flag_invalid STATUS_VAR);
 162         return
 163               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 164             : LIT64( 0x7FFFFFFFFFFFFFFF );
 165     }
 166     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 167     return z;
 168
 169 }
 170
 171 /*----------------------------------------------------------------------------
 172 | Returns the fraction bits of the single-precision floating-point value `a'.
 173 *----------------------------------------------------------------------------*/
 174
 175 INLINE bits32 extractFloat32Frac( float32 a )
 176 {
 177
 178     return a & 0x007FFFFF;
 179
 180 }
 181
 182 /*----------------------------------------------------------------------------
 183 | Returns the exponent bits of the single-precision floating-point value `a'.
 184 *----------------------------------------------------------------------------*/
 185
 186 INLINE int16 extractFloat32Exp( float32 a )
 187 {
 188
 189     return ( a>>23 ) & 0xFF;
 190
 191 }
 192
 193 /*----------------------------------------------------------------------------
 194 | Returns the sign bit of the single-precision floating-point value `a'.
 195 *----------------------------------------------------------------------------*/
 196
 197 INLINE flag extractFloat32Sign( float32 a )
 198 {
 199
 200     return a>>31;
 201
 202 }
 203
 204 /*----------------------------------------------------------------------------
 205 | Normalizes the subnormal single-precision floating-point value represented
 206 | by the denormalized significand `aSig'.  The normalized exponent and
 207 | significand are stored at the locations pointed to by `zExpPtr' and
 208 | `zSigPtr', respectively.
 209 *----------------------------------------------------------------------------*/
 210
 211 static void
 212  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 213 {
 214     int8 shiftCount;
 215
 216     shiftCount = countLeadingZeros32( aSig ) - 8;
 217     *zSigPtr = aSig<<shiftCount;
 218     *zExpPtr = 1 - shiftCount;
 219
 220 }
 221
 222 /*----------------------------------------------------------------------------
 223 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 224 | single-precision floating-point value, returning the result.  After being
 225 | shifted into the proper positions, the three fields are simply added
 226 | together to form the result.  This means that any integer portion of `zSig'
 227 | will be added into the exponent.  Since a properly normalized significand
 228 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 229 | than the desired result exponent whenever `zSig' is a complete, normalized
 230 | significand.
 231 *----------------------------------------------------------------------------*/
 232
 233 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 234 {
 235
 236     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 237
 238 }
 239
 240 /*----------------------------------------------------------------------------
 241 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 242 | and significand `zSig', and returns the proper single-precision floating-
 243 | point value corresponding to the abstract input.  Ordinarily, the abstract
 244 | value is simply rounded and packed into the single-precision format, with
 245 | the inexact exception raised if the abstract input cannot be represented
 246 | exactly.  However, if the abstract value is too large, the overflow and
 247 | inexact exceptions are raised and an infinity or maximal finite value is
 248 | returned.  If the abstract value is too small, the input value is rounded to
 249 | a subnormal number, and the underflow and inexact exceptions are raised if
 250 | the abstract input cannot be represented exactly as a subnormal single-
 251 | precision floating-point number.
 252 |     The input significand `zSig' has its binary point between bits 30
 253 | and 29, which is 7 bits to the left of the usual location.  This shifted
 254 | significand must be normalized or smaller.  If `zSig' is not normalized,
 255 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 256 | and it must not require rounding.  In the usual case that `zSig' is
 257 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 258 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 259 | Binary Floating-Point Arithmetic.
 260 *----------------------------------------------------------------------------*/
 261
 262 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 263 {
 264     int8 roundingMode;
 265     flag roundNearestEven;
 266     int8 roundIncrement, roundBits;
 267     flag isTiny;
 268
 269     roundingMode = STATUS(float_rounding_mode);
 270     roundNearestEven = ( roundingMode == float_round_nearest_even );
 271     roundIncrement = 0x40;
 272     if ( ! roundNearestEven ) {
 273         if ( roundingMode == float_round_to_zero ) {
 274             roundIncrement = 0;
 275         }
 276         else {
 277             roundIncrement = 0x7F;
 278             if ( zSign ) {
 279                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 280             }
 281             else {
 282                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 283             }
 284         }
 285     }
 286     roundBits = zSig & 0x7F;
 287     if ( 0xFD <= (bits16) zExp ) {
 288         if (    ( 0xFD < zExp )
 289              || (    ( zExp == 0xFD )
 290                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 291            ) {
 292             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 293             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 294         }
 295         if ( zExp < 0 ) {
 296             isTiny =
 297                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 298                 || ( zExp < -1 )
 299                 || ( zSig + roundIncrement < 0x80000000 );
 300             shift32RightJamming( zSig, - zExp, &zSig );
 301             zExp = 0;
 302             roundBits = zSig & 0x7F;
 303             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 304         }
 305     }
 306     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 307     zSig = ( zSig + roundIncrement )>>7;
 308     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 309     if ( zSig == 0 ) zExp = 0;
 310     return packFloat32( zSign, zExp, zSig );
 311
 312 }
 313
 314 /*----------------------------------------------------------------------------
 315 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 316 | and significand `zSig', and returns the proper single-precision floating-
 317 | point value corresponding to the abstract input.  This routine is just like
 318 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 319 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 320 | floating-point exponent.
 321 *----------------------------------------------------------------------------*/
 322
 323 static float32
 324  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 325 {
 326     int8 shiftCount;
 327
 328     shiftCount = countLeadingZeros32( zSig ) - 1;
 329     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 330
 331 }
 332
 333 /*----------------------------------------------------------------------------
 334 | Returns the fraction bits of the double-precision floating-point value `a'.
 335 *----------------------------------------------------------------------------*/
 336
 337 INLINE bits64 extractFloat64Frac( float64 a )
 338 {
 339
 340     return a & LIT64( 0x000FFFFFFFFFFFFF );
 341
 342 }
 343
 344 /*----------------------------------------------------------------------------
 345 | Returns the exponent bits of the double-precision floating-point value `a'.
 346 *----------------------------------------------------------------------------*/
 347
 348 INLINE int16 extractFloat64Exp( float64 a )
 349 {
 350
 351     return ( a>>52 ) & 0x7FF;
 352
 353 }
 354
 355 /*----------------------------------------------------------------------------
 356 | Returns the sign bit of the double-precision floating-point value `a'.
 357 *----------------------------------------------------------------------------*/
 358
 359 INLINE flag extractFloat64Sign( float64 a )
 360 {
 361
 362     return a>>63;
 363
 364 }
 365
 366 /*----------------------------------------------------------------------------
 367 | Normalizes the subnormal double-precision floating-point value represented
 368 | by the denormalized significand `aSig'.  The normalized exponent and
 369 | significand are stored at the locations pointed to by `zExpPtr' and
 370 | `zSigPtr', respectively.
 371 *----------------------------------------------------------------------------*/
 372
 373 static void
 374  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 375 {
 376     int8 shiftCount;
 377
 378     shiftCount = countLeadingZeros64( aSig ) - 11;
 379     *zSigPtr = aSig<<shiftCount;
 380     *zExpPtr = 1 - shiftCount;
 381
 382 }
 383
 384 /*----------------------------------------------------------------------------
 385 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 386 | double-precision floating-point value, returning the result.  After being
 387 | shifted into the proper positions, the three fields are simply added
 388 | together to form the result.  This means that any integer portion of `zSig'
 389 | will be added into the exponent.  Since a properly normalized significand
 390 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 391 | than the desired result exponent whenever `zSig' is a complete, normalized
 392 | significand.
 393 *----------------------------------------------------------------------------*/
 394
 395 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 396 {
 397
 398     return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
 399
 400 }
 401
 402 /*----------------------------------------------------------------------------
 403 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 404 | and significand `zSig', and returns the proper double-precision floating-
 405 | point value corresponding to the abstract input.  Ordinarily, the abstract
 406 | value is simply rounded and packed into the double-precision format, with
 407 | the inexact exception raised if the abstract input cannot be represented
 408 | exactly.  However, if the abstract value is too large, the overflow and
 409 | inexact exceptions are raised and an infinity or maximal finite value is
 410 | returned.  If the abstract value is too small, the input value is rounded
 411 | to a subnormal number, and the underflow and inexact exceptions are raised
 412 | if the abstract input cannot be represented exactly as a subnormal double-
 413 | precision floating-point number.
 414 |     The input significand `zSig' has its binary point between bits 62
 415 | and 61, which is 10 bits to the left of the usual location.  This shifted
 416 | significand must be normalized or smaller.  If `zSig' is not normalized,
 417 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 418 | and it must not require rounding.  In the usual case that `zSig' is
 419 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 420 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 421 | Binary Floating-Point Arithmetic.
 422 *----------------------------------------------------------------------------*/
 423
 424 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 425 {
 426     int8 roundingMode;
 427     flag roundNearestEven;
 428     int16 roundIncrement, roundBits;
 429     flag isTiny;
 430
 431     roundingMode = STATUS(float_rounding_mode);
 432     roundNearestEven = ( roundingMode == float_round_nearest_even );
 433     roundIncrement = 0x200;
 434     if ( ! roundNearestEven ) {
 435         if ( roundingMode == float_round_to_zero ) {
 436             roundIncrement = 0;
 437         }
 438         else {
 439             roundIncrement = 0x3FF;
 440             if ( zSign ) {
 441                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 442             }
 443             else {
 444                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 445             }
 446         }
 447     }
 448     roundBits = zSig & 0x3FF;
 449     if ( 0x7FD <= (bits16) zExp ) {
 450         if (    ( 0x7FD < zExp )
 451              || (    ( zExp == 0x7FD )
 452                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 453            ) {
 454             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 455             return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
 456         }
 457         if ( zExp < 0 ) {
 458             isTiny =
 459                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 460                 || ( zExp < -1 )
 461                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 462             shift64RightJamming( zSig, - zExp, &zSig );
 463             zExp = 0;
 464             roundBits = zSig & 0x3FF;
 465             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 466         }
 467     }
 468     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 469     zSig = ( zSig + roundIncrement )>>10;
 470     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 471     if ( zSig == 0 ) zExp = 0;
 472     return packFloat64( zSign, zExp, zSig );
 473
 474 }
 475
 476 /*----------------------------------------------------------------------------
 477 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 478 | and significand `zSig', and returns the proper double-precision floating-
 479 | point value corresponding to the abstract input.  This routine is just like
 480 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 481 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 482 | floating-point exponent.
 483 *----------------------------------------------------------------------------*/
 484
 485 static float64
 486  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 487 {
 488     int8 shiftCount;
 489
 490     shiftCount = countLeadingZeros64( zSig ) - 1;
 491     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 492
 493 }
 494
 495 #ifdef FLOATX80
 496
 497 /*----------------------------------------------------------------------------
 498 | Returns the fraction bits of the extended double-precision floating-point
 499 | value `a'.
 500 *----------------------------------------------------------------------------*/
 501
 502 INLINE bits64 extractFloatx80Frac( floatx80 a )
 503 {
 504
 505     return a.low;
 506
 507 }
 508
 509 /*----------------------------------------------------------------------------
 510 | Returns the exponent bits of the extended double-precision floating-point
 511 | value `a'.
 512 *----------------------------------------------------------------------------*/
 513
 514 INLINE int32 extractFloatx80Exp( floatx80 a )
 515 {
 516
 517     return a.high & 0x7FFF;
 518
 519 }
 520
 521 /*----------------------------------------------------------------------------
 522 | Returns the sign bit of the extended double-precision floating-point value
 523 | `a'.
 524 *----------------------------------------------------------------------------*/
 525
 526 INLINE flag extractFloatx80Sign( floatx80 a )
 527 {
 528
 529     return a.high>>15;
 530
 531 }
 532
 533 /*----------------------------------------------------------------------------
 534 | Normalizes the subnormal extended double-precision floating-point value
 535 | represented by the denormalized significand `aSig'.  The normalized exponent
 536 | and significand are stored at the locations pointed to by `zExpPtr' and
 537 | `zSigPtr', respectively.
 538 *----------------------------------------------------------------------------*/
 539
 540 static void
 541  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 542 {
 543     int8 shiftCount;
 544
 545     shiftCount = countLeadingZeros64( aSig );
 546     *zSigPtr = aSig<<shiftCount;
 547     *zExpPtr = 1 - shiftCount;
 548
 549 }
 550
 551 /*----------------------------------------------------------------------------
 552 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 553 | extended double-precision floating-point value, returning the result.
 554 *----------------------------------------------------------------------------*/
 555
 556 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 557 {
 558     floatx80 z;
 559
 560     z.low = zSig;
 561     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 562     return z;
 563
 564 }
 565
 566 /*----------------------------------------------------------------------------
 567 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 568 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 569 | and returns the proper extended double-precision floating-point value
 570 | corresponding to the abstract input.  Ordinarily, the abstract value is
 571 | rounded and packed into the extended double-precision format, with the
 572 | inexact exception raised if the abstract input cannot be represented
 573 | exactly.  However, if the abstract value is too large, the overflow and
 574 | inexact exceptions are raised and an infinity or maximal finite value is
 575 | returned.  If the abstract value is too small, the input value is rounded to
 576 | a subnormal number, and the underflow and inexact exceptions are raised if
 577 | the abstract input cannot be represented exactly as a subnormal extended
 578 | double-precision floating-point number.
 579 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 580 | number of bits as single or double precision, respectively.  Otherwise, the
 581 | result is rounded to the full precision of the extended double-precision
 582 | format.
 583 |     The input significand must be normalized or smaller.  If the input
 584 | significand is not normalized, `zExp' must be 0; in that case, the result
 585 | returned is a subnormal number, and it must not require rounding.  The
 586 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 587 | Floating-Point Arithmetic.
 588 *----------------------------------------------------------------------------*/
 589
 590 static floatx80
 591  roundAndPackFloatx80(
 592      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 593  STATUS_PARAM)
 594 {
 595     int8 roundingMode;
 596     flag roundNearestEven, increment, isTiny;
 597     int64 roundIncrement, roundMask, roundBits;
 598
 599     roundingMode = STATUS(float_rounding_mode);
 600     roundNearestEven = ( roundingMode == float_round_nearest_even );
 601     if ( roundingPrecision == 80 ) goto precision80;
 602     if ( roundingPrecision == 64 ) {
 603         roundIncrement = LIT64( 0x0000000000000400 );
 604         roundMask = LIT64( 0x00000000000007FF );
 605     }
 606     else if ( roundingPrecision == 32 ) {
 607         roundIncrement = LIT64( 0x0000008000000000 );
 608         roundMask = LIT64( 0x000000FFFFFFFFFF );
 609     }
 610     else {
 611         goto precision80;
 612     }
 613     zSig0 |= ( zSig1 != 0 );
 614     if ( ! roundNearestEven ) {
 615         if ( roundingMode == float_round_to_zero ) {
 616             roundIncrement = 0;
 617         }
 618         else {
 619             roundIncrement = roundMask;
 620             if ( zSign ) {
 621                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 622             }
 623             else {
 624                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 625             }
 626         }
 627     }
 628     roundBits = zSig0 & roundMask;
 629     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 630         if (    ( 0x7FFE < zExp )
 631              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 632            ) {
 633             goto overflow;
 634         }
 635         if ( zExp <= 0 ) {
 636             isTiny =
 637                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 638                 || ( zExp < 0 )
 639                 || ( zSig0 <= zSig0 + roundIncrement );
 640             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 641             zExp = 0;
 642             roundBits = zSig0 & roundMask;
 643             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 644             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 645             zSig0 += roundIncrement;
 646             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 647             roundIncrement = roundMask + 1;
 648             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 649                 roundMask |= roundIncrement;
 650             }
 651             zSig0 &= ~ roundMask;
 652             return packFloatx80( zSign, zExp, zSig0 );
 653         }
 654     }
 655     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 656     zSig0 += roundIncrement;
 657     if ( zSig0 < roundIncrement ) {
 658         ++zExp;
 659         zSig0 = LIT64( 0x8000000000000000 );
 660     }
 661     roundIncrement = roundMask + 1;
 662     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 663         roundMask |= roundIncrement;
 664     }
 665     zSig0 &= ~ roundMask;
 666     if ( zSig0 == 0 ) zExp = 0;
 667     return packFloatx80( zSign, zExp, zSig0 );
 668  precision80:
 669     increment = ( (sbits64) zSig1 < 0 );
 670     if ( ! roundNearestEven ) {
 671         if ( roundingMode == float_round_to_zero ) {
 672             increment = 0;
 673         }
 674         else {
 675             if ( zSign ) {
 676                 increment = ( roundingMode == float_round_down ) && zSig1;
 677             }
 678             else {
 679                 increment = ( roundingMode == float_round_up ) && zSig1;
 680             }
 681         }
 682     }
 683     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 684         if (    ( 0x7FFE < zExp )
 685              || (    ( zExp == 0x7FFE )
 686                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 687                   && increment
 688                 )
 689            ) {
 690             roundMask = 0;
 691  overflow:
 692             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 693             if (    ( roundingMode == float_round_to_zero )
 694                  || ( zSign && ( roundingMode == float_round_up ) )
 695                  || ( ! zSign && ( roundingMode == float_round_down ) )
 696                ) {
 697                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 698             }
 699             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 700         }
 701         if ( zExp <= 0 ) {
 702             isTiny =
 703                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 704                 || ( zExp < 0 )
 705                 || ! increment
 706                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 707             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 708             zExp = 0;
 709             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 710             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 711             if ( roundNearestEven ) {
 712                 increment = ( (sbits64) zSig1 < 0 );
 713             }
 714             else {
 715                 if ( zSign ) {
 716                     increment = ( roundingMode == float_round_down ) && zSig1;
 717                 }
 718                 else {
 719                     increment = ( roundingMode == float_round_up ) && zSig1;
 720                 }
 721             }
 722             if ( increment ) {
 723                 ++zSig0;
 724                 zSig0 &=
 725                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 726                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 727             }
 728             return packFloatx80( zSign, zExp, zSig0 );
 729         }
 730     }
 731     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 732     if ( increment ) {
 733         ++zSig0;
 734         if ( zSig0 == 0 ) {
 735             ++zExp;
 736             zSig0 = LIT64( 0x8000000000000000 );
 737         }
 738         else {
 739             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 740         }
 741     }
 742     else {
 743         if ( zSig0 == 0 ) zExp = 0;
 744     }
 745     return packFloatx80( zSign, zExp, zSig0 );
 746
 747 }
 748
 749 /*----------------------------------------------------------------------------
 750 | Takes an abstract floating-point value having sign `zSign', exponent
 751 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 752 | and returns the proper extended double-precision floating-point value
 753 | corresponding to the abstract input.  This routine is just like
 754 | `roundAndPackFloatx80' except that the input significand does not have to be
 755 | normalized.
 756 *----------------------------------------------------------------------------*/
 757
 758 static floatx80
 759  normalizeRoundAndPackFloatx80(
 760      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 761  STATUS_PARAM)
 762 {
 763     int8 shiftCount;
 764
 765     if ( zSig0 == 0 ) {
 766         zSig0 = zSig1;
 767         zSig1 = 0;
 768         zExp -= 64;
 769     }
 770     shiftCount = countLeadingZeros64( zSig0 );
 771     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 772     zExp -= shiftCount;
 773     return
 774         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 775
 776 }
 777
 778 #endif
 779
 780 #ifdef FLOAT128
 781
 782 /*----------------------------------------------------------------------------
 783 | Returns the least-significant 64 fraction bits of the quadruple-precision
 784 | floating-point value `a'.
 785 *----------------------------------------------------------------------------*/
 786
 787 INLINE bits64 extractFloat128Frac1( float128 a )
 788 {
 789
 790     return a.low;
 791
 792 }
 793
 794 /*----------------------------------------------------------------------------
 795 | Returns the most-significant 48 fraction bits of the quadruple-precision
 796 | floating-point value `a'.
 797 *----------------------------------------------------------------------------*/
 798
 799 INLINE bits64 extractFloat128Frac0( float128 a )
 800 {
 801
 802     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 803
 804 }
 805
 806 /*----------------------------------------------------------------------------
 807 | Returns the exponent bits of the quadruple-precision floating-point value
 808 | `a'.
 809 *----------------------------------------------------------------------------*/
 810
 811 INLINE int32 extractFloat128Exp( float128 a )
 812 {
 813
 814     return ( a.high>>48 ) & 0x7FFF;
 815
 816 }
 817
 818 /*----------------------------------------------------------------------------
 819 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 820 *----------------------------------------------------------------------------*/
 821
 822 INLINE flag extractFloat128Sign( float128 a )
 823 {
 824
 825     return a.high>>63;
 826
 827 }
 828
 829 /*----------------------------------------------------------------------------
 830 | Normalizes the subnormal quadruple-precision floating-point value
 831 | represented by the denormalized significand formed by the concatenation of
 832 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 833 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 834 | significand are stored at the location pointed to by `zSig0Ptr', and the
 835 | least significant 64 bits of the normalized significand are stored at the
 836 | location pointed to by `zSig1Ptr'.
 837 *----------------------------------------------------------------------------*/
 838
 839 static void
 840  normalizeFloat128Subnormal(
 841      bits64 aSig0,
 842      bits64 aSig1,
 843      int32 *zExpPtr,
 844      bits64 *zSig0Ptr,
 845      bits64 *zSig1Ptr
 846  )
 847 {
 848     int8 shiftCount;
 849
 850     if ( aSig0 == 0 ) {
 851         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 852         if ( shiftCount < 0 ) {
 853             *zSig0Ptr = aSig1>>( - shiftCount );
 854             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 855         }
 856         else {
 857             *zSig0Ptr = aSig1<<shiftCount;
 858             *zSig1Ptr = 0;
 859         }
 860         *zExpPtr = - shiftCount - 63;
 861     }
 862     else {
 863         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 864         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 865         *zExpPtr = 1 - shiftCount;
 866     }
 867
 868 }
 869
 870 /*----------------------------------------------------------------------------
 871 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 872 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 873 | floating-point value, returning the result.  After being shifted into the
 874 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 875 | added together to form the most significant 32 bits of the result.  This
 876 | means that any integer portion of `zSig0' will be added into the exponent.
 877 | Since a properly normalized significand will have an integer portion equal
 878 | to 1, the `zExp' input should be 1 less than the desired result exponent
 879 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 880 | significand.
 881 *----------------------------------------------------------------------------*/
 882
 883 INLINE float128
 884  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 885 {
 886     float128 z;
 887
 888     z.low = zSig1;
 889     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 890     return z;
 891
 892 }
 893
 894 /*----------------------------------------------------------------------------
 895 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 896 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 897 | and `zSig2', and returns the proper quadruple-precision floating-point value
 898 | corresponding to the abstract input.  Ordinarily, the abstract value is
 899 | simply rounded and packed into the quadruple-precision format, with the
 900 | inexact exception raised if the abstract input cannot be represented
 901 | exactly.  However, if the abstract value is too large, the overflow and
 902 | inexact exceptions are raised and an infinity or maximal finite value is
 903 | returned.  If the abstract value is too small, the input value is rounded to
 904 | a subnormal number, and the underflow and inexact exceptions are raised if
 905 | the abstract input cannot be represented exactly as a subnormal quadruple-
 906 | precision floating-point number.
 907 |     The input significand must be normalized or smaller.  If the input
 908 | significand is not normalized, `zExp' must be 0; in that case, the result
 909 | returned is a subnormal number, and it must not require rounding.  In the
 910 | usual case that the input significand is normalized, `zExp' must be 1 less
 911 | than the ``true'' floating-point exponent.  The handling of underflow and
 912 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 913 *----------------------------------------------------------------------------*/
 914
 915 static float128
 916  roundAndPackFloat128(
 917      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 918 {
 919     int8 roundingMode;
 920     flag roundNearestEven, increment, isTiny;
 921
 922     roundingMode = STATUS(float_rounding_mode);
 923     roundNearestEven = ( roundingMode == float_round_nearest_even );
 924     increment = ( (sbits64) zSig2 < 0 );
 925     if ( ! roundNearestEven ) {
 926         if ( roundingMode == float_round_to_zero ) {
 927             increment = 0;
 928         }
 929         else {
 930             if ( zSign ) {
 931                 increment = ( roundingMode == float_round_down ) && zSig2;
 932             }
 933             else {
 934                 increment = ( roundingMode == float_round_up ) && zSig2;
 935             }
 936         }
 937     }
 938     if ( 0x7FFD <= (bits32) zExp ) {
 939         if (    ( 0x7FFD < zExp )
 940              || (    ( zExp == 0x7FFD )
 941                   && eq128(
 942                          LIT64( 0x0001FFFFFFFFFFFF ),
 943                          LIT64( 0xFFFFFFFFFFFFFFFF ),
 944                          zSig0,
 945                          zSig1
 946                      )
 947                   && increment
 948                 )
 949            ) {
 950             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 951             if (    ( roundingMode == float_round_to_zero )
 952                  || ( zSign && ( roundingMode == float_round_up ) )
 953                  || ( ! zSign && ( roundingMode == float_round_down ) )
 954                ) {
 955                 return
 956                     packFloat128(
 957                         zSign,
 958                         0x7FFE,
 959                         LIT64( 0x0000FFFFFFFFFFFF ),
 960                         LIT64( 0xFFFFFFFFFFFFFFFF )
 961                     );
 962             }
 963             return packFloat128( zSign, 0x7FFF, 0, 0 );
 964         }
 965         if ( zExp < 0 ) {
 966             isTiny =
 967                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 968                 || ( zExp < -1 )
 969                 || ! increment
 970                 || lt128(
 971                        zSig0,
 972                        zSig1,
 973                        LIT64( 0x0001FFFFFFFFFFFF ),
 974                        LIT64( 0xFFFFFFFFFFFFFFFF )
 975                    );
 976             shift128ExtraRightJamming(
 977                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 978             zExp = 0;
 979             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
 980             if ( roundNearestEven ) {
 981                 increment = ( (sbits64) zSig2 < 0 );
 982             }
 983             else {
 984                 if ( zSign ) {
 985                     increment = ( roundingMode == float_round_down ) && zSig2;
 986                 }
 987                 else {
 988                     increment = ( roundingMode == float_round_up ) && zSig2;
 989                 }
 990             }
 991         }
 992     }
 993     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
 994     if ( increment ) {
 995         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
 996         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
 997     }
 998     else {
 999         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1000     }
1001     return packFloat128( zSign, zExp, zSig0, zSig1 );
1002
1003 }
1004
1005 /*----------------------------------------------------------------------------
1006 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1007 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1008 | returns the proper quadruple-precision floating-point value corresponding
1009 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1010 | except that the input significand has fewer bits and does not have to be
1011 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1012 | point exponent.
1013 *----------------------------------------------------------------------------*/
1014
1015 static float128
1016  normalizeRoundAndPackFloat128(
1017      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1018 {
1019     int8 shiftCount;
1020     bits64 zSig2;
1021
1022     if ( zSig0 == 0 ) {
1023         zSig0 = zSig1;
1024         zSig1 = 0;
1025         zExp -= 64;
1026     }
1027     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1028     if ( 0 <= shiftCount ) {
1029         zSig2 = 0;
1030         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1031     }
1032     else {
1033         shift128ExtraRightJamming(
1034             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1035     }
1036     zExp -= shiftCount;
1037     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1038
1039 }
1040
1041 #endif
1042
1043 /*----------------------------------------------------------------------------
1044 | Returns the result of converting the 32-bit two's complement integer `a'
1045 | to the single-precision floating-point format.  The conversion is performed
1046 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1047 *----------------------------------------------------------------------------*/
1048
1049 float32 int32_to_float32( int32 a STATUS_PARAM )
1050 {
1051     flag zSign;
1052
1053     if ( a == 0 ) return 0;
1054     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1055     zSign = ( a < 0 );
1056     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1057
1058 }
1059
1060 /*----------------------------------------------------------------------------
1061 | Returns the result of converting the 32-bit two's complement integer `a'
1062 | to the double-precision floating-point format.  The conversion is performed
1063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1064 *----------------------------------------------------------------------------*/
1065
1066 float64 int32_to_float64( int32 a STATUS_PARAM )
1067 {
1068     flag zSign;
1069     uint32 absA;
1070     int8 shiftCount;
1071     bits64 zSig;
1072
1073     if ( a == 0 ) return 0;
1074     zSign = ( a < 0 );
1075     absA = zSign ? - a : a;
1076     shiftCount = countLeadingZeros32( absA ) + 21;
1077     zSig = absA;
1078     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1079
1080 }
1081
1082 #ifdef FLOATX80
1083
1084 /*----------------------------------------------------------------------------
1085 | Returns the result of converting the 32-bit two's complement integer `a'
1086 | to the extended double-precision floating-point format.  The conversion
1087 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1088 | Arithmetic.
1089 *----------------------------------------------------------------------------*/
1090
1091 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1092 {
1093     flag zSign;
1094     uint32 absA;
1095     int8 shiftCount;
1096     bits64 zSig;
1097
1098     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1099     zSign = ( a < 0 );
1100     absA = zSign ? - a : a;
1101     shiftCount = countLeadingZeros32( absA ) + 32;
1102     zSig = absA;
1103     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1104
1105 }
1106
1107 #endif
1108
1109 #ifdef FLOAT128
1110
1111 /*----------------------------------------------------------------------------
1112 | Returns the result of converting the 32-bit two's complement integer `a' to
1113 | the quadruple-precision floating-point format.  The conversion is performed
1114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 *----------------------------------------------------------------------------*/
1116
1117 float128 int32_to_float128( int32 a STATUS_PARAM )
1118 {
1119     flag zSign;
1120     uint32 absA;
1121     int8 shiftCount;
1122     bits64 zSig0;
1123
1124     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1125     zSign = ( a < 0 );
1126     absA = zSign ? - a : a;
1127     shiftCount = countLeadingZeros32( absA ) + 17;
1128     zSig0 = absA;
1129     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1130
1131 }
1132
1133 #endif
1134
1135 /*----------------------------------------------------------------------------
1136 | Returns the result of converting the 64-bit two's complement integer `a'
1137 | to the single-precision floating-point format.  The conversion is performed
1138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1140
1141 float32 int64_to_float32( int64 a STATUS_PARAM )
1142 {
1143     flag zSign;
1144     uint64 absA;
1145     int8 shiftCount;
1146
1147     if ( a == 0 ) return 0;
1148     zSign = ( a < 0 );
1149     absA = zSign ? - a : a;
1150     shiftCount = countLeadingZeros64( absA ) - 40;
1151     if ( 0 <= shiftCount ) {
1152         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1153     }
1154     else {
1155         shiftCount += 7;
1156         if ( shiftCount < 0 ) {
1157             shift64RightJamming( absA, - shiftCount, &absA );
1158         }
1159         else {
1160             absA <<= shiftCount;
1161         }
1162         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1163     }
1164
1165 }
1166
1167 /*----------------------------------------------------------------------------
1168 | Returns the result of converting the 64-bit two's complement integer `a'
1169 | to the double-precision floating-point format.  The conversion is performed
1170 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1171 *----------------------------------------------------------------------------*/
1172
1173 float64 int64_to_float64( int64 a STATUS_PARAM )
1174 {
1175     flag zSign;
1176
1177     if ( a == 0 ) return 0;
1178     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1179         return packFloat64( 1, 0x43E, 0 );
1180     }
1181     zSign = ( a < 0 );
1182     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1183
1184 }
1185
1186 #ifdef FLOATX80
1187
1188 /*----------------------------------------------------------------------------
1189 | Returns the result of converting the 64-bit two's complement integer `a'
1190 | to the extended double-precision floating-point format.  The conversion
1191 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1192 | Arithmetic.
1193 *----------------------------------------------------------------------------*/
1194
1195 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1196 {
1197     flag zSign;
1198     uint64 absA;
1199     int8 shiftCount;
1200
1201     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1202     zSign = ( a < 0 );
1203     absA = zSign ? - a : a;
1204     shiftCount = countLeadingZeros64( absA );
1205     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1206
1207 }
1208
1209 #endif
1210
1211 #ifdef FLOAT128
1212
1213 /*----------------------------------------------------------------------------
1214 | Returns the result of converting the 64-bit two's complement integer `a' to
1215 | the quadruple-precision floating-point format.  The conversion is performed
1216 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1217 *----------------------------------------------------------------------------*/
1218
1219 float128 int64_to_float128( int64 a STATUS_PARAM )
1220 {
1221     flag zSign;
1222     uint64 absA;
1223     int8 shiftCount;
1224     int32 zExp;
1225     bits64 zSig0, zSig1;
1226
1227     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1228     zSign = ( a < 0 );
1229     absA = zSign ? - a : a;
1230     shiftCount = countLeadingZeros64( absA ) + 49;
1231     zExp = 0x406E - shiftCount;
1232     if ( 64 <= shiftCount ) {
1233         zSig1 = 0;
1234         zSig0 = absA;
1235         shiftCount -= 64;
1236     }
1237     else {
1238         zSig1 = absA;
1239         zSig0 = 0;
1240     }
1241     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1242     return packFloat128( zSign, zExp, zSig0, zSig1 );
1243
1244 }
1245
1246 #endif
1247
1248 /*----------------------------------------------------------------------------
1249 | Returns the result of converting the single-precision floating-point value
1250 | `a' to the 32-bit two's complement integer format.  The conversion is
1251 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1252 | Arithmetic---which means in particular that the conversion is rounded
1253 | according to the current rounding mode.  If `a' is a NaN, the largest
1254 | positive integer is returned.  Otherwise, if the conversion overflows, the
1255 | largest integer with the same sign as `a' is returned.
1256 *----------------------------------------------------------------------------*/
1257
1258 int32 float32_to_int32( float32 a STATUS_PARAM )
1259 {
1260     flag aSign;
1261     int16 aExp, shiftCount;
1262     bits32 aSig;
1263     bits64 aSig64;
1264
1265     aSig = extractFloat32Frac( a );
1266     aExp = extractFloat32Exp( a );
1267     aSign = extractFloat32Sign( a );
1268     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1269     if ( aExp ) aSig |= 0x00800000;
1270     shiftCount = 0xAF - aExp;
1271     aSig64 = aSig;
1272     aSig64 <<= 32;
1273     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1274     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1275
1276 }
1277
1278 /*----------------------------------------------------------------------------
1279 | Returns the result of converting the single-precision floating-point value
1280 | `a' to the 32-bit two's complement integer format.  The conversion is
1281 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1282 | Arithmetic, except that the conversion is always rounded toward zero.
1283 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1284 | the conversion overflows, the largest integer with the same sign as `a' is
1285 | returned.
1286 *----------------------------------------------------------------------------*/
1287
1288 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1289 {
1290     flag aSign;
1291     int16 aExp, shiftCount;
1292     bits32 aSig;
1293     int32 z;
1294
1295     aSig = extractFloat32Frac( a );
1296     aExp = extractFloat32Exp( a );
1297     aSign = extractFloat32Sign( a );
1298     shiftCount = aExp - 0x9E;
1299     if ( 0 <= shiftCount ) {
1300         if ( a != 0xCF000000 ) {
1301             float_raise( float_flag_invalid STATUS_VAR);
1302             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1303         }
1304         return (sbits32) 0x80000000;
1305     }
1306     else if ( aExp <= 0x7E ) {
1307         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1308         return 0;
1309     }
1310     aSig = ( aSig | 0x00800000 )<<8;
1311     z = aSig>>( - shiftCount );
1312     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1313         STATUS(float_exception_flags) |= float_flag_inexact;
1314     }
1315     if ( aSign ) z = - z;
1316     return z;
1317
1318 }
1319
1320 /*----------------------------------------------------------------------------
1321 | Returns the result of converting the single-precision floating-point value
1322 | `a' to the 64-bit two's complement integer format.  The conversion is
1323 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1324 | Arithmetic---which means in particular that the conversion is rounded
1325 | according to the current rounding mode.  If `a' is a NaN, the largest
1326 | positive integer is returned.  Otherwise, if the conversion overflows, the
1327 | largest integer with the same sign as `a' is returned.
1328 *----------------------------------------------------------------------------*/
1329
1330 int64 float32_to_int64( float32 a STATUS_PARAM )
1331 {
1332     flag aSign;
1333     int16 aExp, shiftCount;
1334     bits32 aSig;
1335     bits64 aSig64, aSigExtra;
1336
1337     aSig = extractFloat32Frac( a );
1338     aExp = extractFloat32Exp( a );
1339     aSign = extractFloat32Sign( a );
1340     shiftCount = 0xBE - aExp;
1341     if ( shiftCount < 0 ) {
1342         float_raise( float_flag_invalid STATUS_VAR);
1343         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1344             return LIT64( 0x7FFFFFFFFFFFFFFF );
1345         }
1346         return (sbits64) LIT64( 0x8000000000000000 );
1347     }
1348     if ( aExp ) aSig |= 0x00800000;
1349     aSig64 = aSig;
1350     aSig64 <<= 40;
1351     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1352     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1353
1354 }
1355
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 64-bit two's complement integer format.  The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic, except that the conversion is always rounded toward zero.  If
1361 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1362 | conversion overflows, the largest integer with the same sign as `a' is
1363 | returned.
1364 *----------------------------------------------------------------------------*/
1365
1366 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1367 {
1368     flag aSign;
1369     int16 aExp, shiftCount;
1370     bits32 aSig;
1371     bits64 aSig64;
1372     int64 z;
1373
1374     aSig = extractFloat32Frac( a );
1375     aExp = extractFloat32Exp( a );
1376     aSign = extractFloat32Sign( a );
1377     shiftCount = aExp - 0xBE;
1378     if ( 0 <= shiftCount ) {
1379         if ( a != 0xDF000000 ) {
1380             float_raise( float_flag_invalid STATUS_VAR);
1381             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1382                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1383             }
1384         }
1385         return (sbits64) LIT64( 0x8000000000000000 );
1386     }
1387     else if ( aExp <= 0x7E ) {
1388         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1389         return 0;
1390     }
1391     aSig64 = aSig | 0x00800000;
1392     aSig64 <<= 40;
1393     z = aSig64>>( - shiftCount );
1394     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1395         STATUS(float_exception_flags) |= float_flag_inexact;
1396     }
1397     if ( aSign ) z = - z;
1398     return z;
1399
1400 }
1401
1402 /*----------------------------------------------------------------------------
1403 | Returns the result of converting the single-precision floating-point value
1404 | `a' to the double-precision floating-point format.  The conversion is
1405 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1406 | Arithmetic.
1407 *----------------------------------------------------------------------------*/
1408
1409 float64 float32_to_float64( float32 a STATUS_PARAM )
1410 {
1411     flag aSign;
1412     int16 aExp;
1413     bits32 aSig;
1414
1415     aSig = extractFloat32Frac( a );
1416     aExp = extractFloat32Exp( a );
1417     aSign = extractFloat32Sign( a );
1418     if ( aExp == 0xFF ) {
1419         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1420         return packFloat64( aSign, 0x7FF, 0 );
1421     }
1422     if ( aExp == 0 ) {
1423         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1424         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1425         --aExp;
1426     }
1427     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1428
1429 }
1430
1431 #ifdef FLOATX80
1432
1433 /*----------------------------------------------------------------------------
1434 | Returns the result of converting the single-precision floating-point value
1435 | `a' to the extended double-precision floating-point format.  The conversion
1436 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1437 | Arithmetic.
1438 *----------------------------------------------------------------------------*/
1439
1440 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1441 {
1442     flag aSign;
1443     int16 aExp;
1444     bits32 aSig;
1445
1446     aSig = extractFloat32Frac( a );
1447     aExp = extractFloat32Exp( a );
1448     aSign = extractFloat32Sign( a );
1449     if ( aExp == 0xFF ) {
1450         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1451         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1452     }
1453     if ( aExp == 0 ) {
1454         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1455         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1456     }
1457     aSig |= 0x00800000;
1458     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1459
1460 }
1461
1462 #endif
1463
1464 #ifdef FLOAT128
1465
1466 /*----------------------------------------------------------------------------
1467 | Returns the result of converting the single-precision floating-point value
1468 | `a' to the double-precision floating-point format.  The conversion is
1469 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1470 | Arithmetic.
1471 *----------------------------------------------------------------------------*/
1472
1473 float128 float32_to_float128( float32 a STATUS_PARAM )
1474 {
1475     flag aSign;
1476     int16 aExp;
1477     bits32 aSig;
1478
1479     aSig = extractFloat32Frac( a );
1480     aExp = extractFloat32Exp( a );
1481     aSign = extractFloat32Sign( a );
1482     if ( aExp == 0xFF ) {
1483         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1484         return packFloat128( aSign, 0x7FFF, 0, 0 );
1485     }
1486     if ( aExp == 0 ) {
1487         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1488         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1489         --aExp;
1490     }
1491     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1492
1493 }
1494
1495 #endif
1496
1497 /*----------------------------------------------------------------------------
1498 | Rounds the single-precision floating-point value `a' to an integer, and
1499 | returns the result as a single-precision floating-point value.  The
1500 | operation is performed according to the IEC/IEEE Standard for Binary
1501 | Floating-Point Arithmetic.
1502 *----------------------------------------------------------------------------*/
1503
1504 float32 float32_round_to_int( float32 a STATUS_PARAM)
1505 {
1506     flag aSign;
1507     int16 aExp;
1508     bits32 lastBitMask, roundBitsMask;
1509     int8 roundingMode;
1510     float32 z;
1511
1512     aExp = extractFloat32Exp( a );
1513     if ( 0x96 <= aExp ) {
1514         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1515             return propagateFloat32NaN( a, a STATUS_VAR );
1516         }
1517         return a;
1518     }
1519     if ( aExp <= 0x7E ) {
1520         if ( (bits32) ( a<<1 ) == 0 ) return a;
1521         STATUS(float_exception_flags) |= float_flag_inexact;
1522         aSign = extractFloat32Sign( a );
1523         switch ( STATUS(float_rounding_mode) ) {
1524          case float_round_nearest_even:
1525             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1526                 return packFloat32( aSign, 0x7F, 0 );
1527             }
1528             break;
1529          case float_round_down:
1530             return aSign ? 0xBF800000 : 0;
1531          case float_round_up:
1532             return aSign ? 0x80000000 : 0x3F800000;
1533         }
1534         return packFloat32( aSign, 0, 0 );
1535     }
1536     lastBitMask = 1;
1537     lastBitMask <<= 0x96 - aExp;
1538     roundBitsMask = lastBitMask - 1;
1539     z = a;
1540     roundingMode = STATUS(float_rounding_mode);
1541     if ( roundingMode == float_round_nearest_even ) {
1542         z += lastBitMask>>1;
1543         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1544     }
1545     else if ( roundingMode != float_round_to_zero ) {
1546         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1547             z += roundBitsMask;
1548         }
1549     }
1550     z &= ~ roundBitsMask;
1551     if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
1552     return z;
1553
1554 }
1555
1556 /*----------------------------------------------------------------------------
1557 | Returns the result of adding the absolute values of the single-precision
1558 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1559 | before being returned.  `zSign' is ignored if the result is a NaN.
1560 | The addition is performed according to the IEC/IEEE Standard for Binary
1561 | Floating-Point Arithmetic.
1562 *----------------------------------------------------------------------------*/
1563
1564 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1565 {
1566     int16 aExp, bExp, zExp;
1567     bits32 aSig, bSig, zSig;
1568     int16 expDiff;
1569
1570     aSig = extractFloat32Frac( a );
1571     aExp = extractFloat32Exp( a );
1572     bSig = extractFloat32Frac( b );
1573     bExp = extractFloat32Exp( b );
1574     expDiff = aExp - bExp;
1575     aSig <<= 6;
1576     bSig <<= 6;
1577     if ( 0 < expDiff ) {
1578         if ( aExp == 0xFF ) {
1579             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1580             return a;
1581         }
1582         if ( bExp == 0 ) {
1583             --expDiff;
1584         }
1585         else {
1586             bSig |= 0x20000000;
1587         }
1588         shift32RightJamming( bSig, expDiff, &bSig );
1589         zExp = aExp;
1590     }
1591     else if ( expDiff < 0 ) {
1592         if ( bExp == 0xFF ) {
1593             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1594             return packFloat32( zSign, 0xFF, 0 );
1595         }
1596         if ( aExp == 0 ) {
1597             ++expDiff;
1598         }
1599         else {
1600             aSig |= 0x20000000;
1601         }
1602         shift32RightJamming( aSig, - expDiff, &aSig );
1603         zExp = bExp;
1604     }
1605     else {
1606         if ( aExp == 0xFF ) {
1607             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1608             return a;
1609         }
1610         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1611         zSig = 0x40000000 + aSig + bSig;
1612         zExp = aExp;
1613         goto roundAndPack;
1614     }
1615     aSig |= 0x20000000;
1616     zSig = ( aSig + bSig )<<1;
1617     --zExp;
1618     if ( (sbits32) zSig < 0 ) {
1619         zSig = aSig + bSig;
1620         ++zExp;
1621     }
1622  roundAndPack:
1623     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1624
1625 }
1626
1627 /*----------------------------------------------------------------------------
1628 | Returns the result of subtracting the absolute values of the single-
1629 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1630 | difference is negated before being returned.  `zSign' is ignored if the
1631 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1632 | Standard for Binary Floating-Point Arithmetic.
1633 *----------------------------------------------------------------------------*/
1634
1635 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1636 {
1637     int16 aExp, bExp, zExp;
1638     bits32 aSig, bSig, zSig;
1639     int16 expDiff;
1640
1641     aSig = extractFloat32Frac( a );
1642     aExp = extractFloat32Exp( a );
1643     bSig = extractFloat32Frac( b );
1644     bExp = extractFloat32Exp( b );
1645     expDiff = aExp - bExp;
1646     aSig <<= 7;
1647     bSig <<= 7;
1648     if ( 0 < expDiff ) goto aExpBigger;
1649     if ( expDiff < 0 ) goto bExpBigger;
1650     if ( aExp == 0xFF ) {
1651         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1652         float_raise( float_flag_invalid STATUS_VAR);
1653         return float32_default_nan;
1654     }
1655     if ( aExp == 0 ) {
1656         aExp = 1;
1657         bExp = 1;
1658     }
1659     if ( bSig < aSig ) goto aBigger;
1660     if ( aSig < bSig ) goto bBigger;
1661     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1662  bExpBigger:
1663     if ( bExp == 0xFF ) {
1664         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1665         return packFloat32( zSign ^ 1, 0xFF, 0 );
1666     }
1667     if ( aExp == 0 ) {
1668         ++expDiff;
1669     }
1670     else {
1671         aSig |= 0x40000000;
1672     }
1673     shift32RightJamming( aSig, - expDiff, &aSig );
1674     bSig |= 0x40000000;
1675  bBigger:
1676     zSig = bSig - aSig;
1677     zExp = bExp;
1678     zSign ^= 1;
1679     goto normalizeRoundAndPack;
1680  aExpBigger:
1681     if ( aExp == 0xFF ) {
1682         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1683         return a;
1684     }
1685     if ( bExp == 0 ) {
1686         --expDiff;
1687     }
1688     else {
1689         bSig |= 0x40000000;
1690     }
1691     shift32RightJamming( bSig, expDiff, &bSig );
1692     aSig |= 0x40000000;
1693  aBigger:
1694     zSig = aSig - bSig;
1695     zExp = aExp;
1696  normalizeRoundAndPack:
1697     --zExp;
1698     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1699
1700 }
1701
1702 /*----------------------------------------------------------------------------
1703 | Returns the result of adding the single-precision floating-point values `a'
1704 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1705 | Binary Floating-Point Arithmetic.
1706 *----------------------------------------------------------------------------*/
1707
1708 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1709 {
1710     flag aSign, bSign;
1711
1712     aSign = extractFloat32Sign( a );
1713     bSign = extractFloat32Sign( b );
1714     if ( aSign == bSign ) {
1715         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1716     }
1717     else {
1718         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1719     }
1720
1721 }
1722
1723 /*----------------------------------------------------------------------------
1724 | Returns the result of subtracting the single-precision floating-point values
1725 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1726 | for Binary Floating-Point Arithmetic.
1727 *----------------------------------------------------------------------------*/
1728
1729 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1730 {
1731     flag aSign, bSign;
1732
1733     aSign = extractFloat32Sign( a );
1734     bSign = extractFloat32Sign( b );
1735     if ( aSign == bSign ) {
1736         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1737     }
1738     else {
1739         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1740     }
1741
1742 }
1743
1744 /*----------------------------------------------------------------------------
1745 | Returns the result of multiplying the single-precision floating-point values
1746 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1747 | for Binary Floating-Point Arithmetic.
1748 *----------------------------------------------------------------------------*/
1749
1750 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1751 {
1752     flag aSign, bSign, zSign;
1753     int16 aExp, bExp, zExp;
1754     bits32 aSig, bSig;
1755     bits64 zSig64;
1756     bits32 zSig;
1757
1758     aSig = extractFloat32Frac( a );
1759     aExp = extractFloat32Exp( a );
1760     aSign = extractFloat32Sign( a );
1761     bSig = extractFloat32Frac( b );
1762     bExp = extractFloat32Exp( b );
1763     bSign = extractFloat32Sign( b );
1764     zSign = aSign ^ bSign;
1765     if ( aExp == 0xFF ) {
1766         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1767             return propagateFloat32NaN( a, b STATUS_VAR );
1768         }
1769         if ( ( bExp | bSig ) == 0 ) {
1770             float_raise( float_flag_invalid STATUS_VAR);
1771             return float32_default_nan;
1772         }
1773         return packFloat32( zSign, 0xFF, 0 );
1774     }
1775     if ( bExp == 0xFF ) {
1776         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1777         if ( ( aExp | aSig ) == 0 ) {
1778             float_raise( float_flag_invalid STATUS_VAR);
1779             return float32_default_nan;
1780         }
1781         return packFloat32( zSign, 0xFF, 0 );
1782     }
1783     if ( aExp == 0 ) {
1784         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1785         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1786     }
1787     if ( bExp == 0 ) {
1788         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1789         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1790     }
1791     zExp = aExp + bExp - 0x7F;
1792     aSig = ( aSig | 0x00800000 )<<7;
1793     bSig = ( bSig | 0x00800000 )<<8;
1794     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1795     zSig = zSig64;
1796     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1797         zSig <<= 1;
1798         --zExp;
1799     }
1800     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1801
1802 }
1803
1804 /*----------------------------------------------------------------------------
1805 | Returns the result of dividing the single-precision floating-point value `a'
1806 | by the corresponding value `b'.  The operation is performed according to the
1807 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1808 *----------------------------------------------------------------------------*/
1809
1810 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1811 {
1812     flag aSign, bSign, zSign;
1813     int16 aExp, bExp, zExp;
1814     bits32 aSig, bSig, zSig;
1815
1816     aSig = extractFloat32Frac( a );
1817     aExp = extractFloat32Exp( a );
1818     aSign = extractFloat32Sign( a );
1819     bSig = extractFloat32Frac( b );
1820     bExp = extractFloat32Exp( b );
1821     bSign = extractFloat32Sign( b );
1822     zSign = aSign ^ bSign;
1823     if ( aExp == 0xFF ) {
1824         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1825         if ( bExp == 0xFF ) {
1826             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827             float_raise( float_flag_invalid STATUS_VAR);
1828             return float32_default_nan;
1829         }
1830         return packFloat32( zSign, 0xFF, 0 );
1831     }
1832     if ( bExp == 0xFF ) {
1833         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834         return packFloat32( zSign, 0, 0 );
1835     }
1836     if ( bExp == 0 ) {
1837         if ( bSig == 0 ) {
1838             if ( ( aExp | aSig ) == 0 ) {
1839                 float_raise( float_flag_invalid STATUS_VAR);
1840                 return float32_default_nan;
1841             }
1842             float_raise( float_flag_divbyzero STATUS_VAR);
1843             return packFloat32( zSign, 0xFF, 0 );
1844         }
1845         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1846     }
1847     if ( aExp == 0 ) {
1848         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1849         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1850     }
1851     zExp = aExp - bExp + 0x7D;
1852     aSig = ( aSig | 0x00800000 )<<7;
1853     bSig = ( bSig | 0x00800000 )<<8;
1854     if ( bSig <= ( aSig + aSig ) ) {
1855         aSig >>= 1;
1856         ++zExp;
1857     }
1858     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1859     if ( ( zSig & 0x3F ) == 0 ) {
1860         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1861     }
1862     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1863
1864 }
1865
1866 /*----------------------------------------------------------------------------
1867 | Returns the remainder of the single-precision floating-point value `a'
1868 | with respect to the corresponding value `b'.  The operation is performed
1869 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1870 *----------------------------------------------------------------------------*/
1871
1872 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1873 {
1874     flag aSign, bSign, zSign;
1875     int16 aExp, bExp, expDiff;
1876     bits32 aSig, bSig;
1877     bits32 q;
1878     bits64 aSig64, bSig64, q64;
1879     bits32 alternateASig;
1880     sbits32 sigMean;
1881
1882     aSig = extractFloat32Frac( a );
1883     aExp = extractFloat32Exp( a );
1884     aSign = extractFloat32Sign( a );
1885     bSig = extractFloat32Frac( b );
1886     bExp = extractFloat32Exp( b );
1887     bSign = extractFloat32Sign( b );
1888     if ( aExp == 0xFF ) {
1889         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1890             return propagateFloat32NaN( a, b STATUS_VAR );
1891         }
1892         float_raise( float_flag_invalid STATUS_VAR);
1893         return float32_default_nan;
1894     }
1895     if ( bExp == 0xFF ) {
1896         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1897         return a;
1898     }
1899     if ( bExp == 0 ) {
1900         if ( bSig == 0 ) {
1901             float_raise( float_flag_invalid STATUS_VAR);
1902             return float32_default_nan;
1903         }
1904         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1905     }
1906     if ( aExp == 0 ) {
1907         if ( aSig == 0 ) return a;
1908         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1909     }
1910     expDiff = aExp - bExp;
1911     aSig |= 0x00800000;
1912     bSig |= 0x00800000;
1913     if ( expDiff < 32 ) {
1914         aSig <<= 8;
1915         bSig <<= 8;
1916         if ( expDiff < 0 ) {
1917             if ( expDiff < -1 ) return a;
1918             aSig >>= 1;
1919         }
1920         q = ( bSig <= aSig );
1921         if ( q ) aSig -= bSig;
1922         if ( 0 < expDiff ) {
1923             q = ( ( (bits64) aSig )<<32 ) / bSig;
1924             q >>= 32 - expDiff;
1925             bSig >>= 2;
1926             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1927         }
1928         else {
1929             aSig >>= 2;
1930             bSig >>= 2;
1931         }
1932     }
1933     else {
1934         if ( bSig <= aSig ) aSig -= bSig;
1935         aSig64 = ( (bits64) aSig )<<40;
1936         bSig64 = ( (bits64) bSig )<<40;
1937         expDiff -= 64;
1938         while ( 0 < expDiff ) {
1939             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1940             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1941             aSig64 = - ( ( bSig * q64 )<<38 );
1942             expDiff -= 62;
1943         }
1944         expDiff += 64;
1945         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1946         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1947         q = q64>>( 64 - expDiff );
1948         bSig <<= 6;
1949         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1950     }
1951     do {
1952         alternateASig = aSig;
1953         ++q;
1954         aSig -= bSig;
1955     } while ( 0 <= (sbits32) aSig );
1956     sigMean = aSig + alternateASig;
1957     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1958         aSig = alternateASig;
1959     }
1960     zSign = ( (sbits32) aSig < 0 );
1961     if ( zSign ) aSig = - aSig;
1962     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1963
1964 }
1965
1966 /*----------------------------------------------------------------------------
1967 | Returns the square root of the single-precision floating-point value `a'.
1968 | The operation is performed according to the IEC/IEEE Standard for Binary
1969 | Floating-Point Arithmetic.
1970 *----------------------------------------------------------------------------*/
1971
1972 float32 float32_sqrt( float32 a STATUS_PARAM )
1973 {
1974     flag aSign;
1975     int16 aExp, zExp;
1976     bits32 aSig, zSig;
1977     bits64 rem, term;
1978
1979     aSig = extractFloat32Frac( a );
1980     aExp = extractFloat32Exp( a );
1981     aSign = extractFloat32Sign( a );
1982     if ( aExp == 0xFF ) {
1983         if ( aSig ) return propagateFloat32NaN( a, 0 STATUS_VAR );
1984         if ( ! aSign ) return a;
1985         float_raise( float_flag_invalid STATUS_VAR);
1986         return float32_default_nan;
1987     }
1988     if ( aSign ) {
1989         if ( ( aExp | aSig ) == 0 ) return a;
1990         float_raise( float_flag_invalid STATUS_VAR);
1991         return float32_default_nan;
1992     }
1993     if ( aExp == 0 ) {
1994         if ( aSig == 0 ) return 0;
1995         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1996     }
1997     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1998     aSig = ( aSig | 0x00800000 )<<8;
1999     zSig = estimateSqrt32( aExp, aSig ) + 2;
2000     if ( ( zSig & 0x7F ) <= 5 ) {
2001         if ( zSig < 2 ) {
2002             zSig = 0x7FFFFFFF;
2003             goto roundAndPack;
2004         }
2005         aSig >>= aExp & 1;
2006         term = ( (bits64) zSig ) * zSig;
2007         rem = ( ( (bits64) aSig )<<32 ) - term;
2008         while ( (sbits64) rem < 0 ) {
2009             --zSig;
2010             rem += ( ( (bits64) zSig )<<1 ) | 1;
2011         }
2012         zSig |= ( rem != 0 );
2013     }
2014     shift32RightJamming( zSig, 1, &zSig );
2015  roundAndPack:
2016     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2017
2018 }
2019
2020 /*----------------------------------------------------------------------------
2021 | Returns 1 if the single-precision floating-point value `a' is equal to
2022 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2023 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2024 *----------------------------------------------------------------------------*/
2025
2026 flag float32_eq( float32 a, float32 b STATUS_PARAM )
2027 {
2028
2029     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2030          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2031        ) {
2032         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2033             float_raise( float_flag_invalid STATUS_VAR);
2034         }
2035         return 0;
2036     }
2037     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2038
2039 }
2040
2041 /*----------------------------------------------------------------------------
2042 | Returns 1 if the single-precision floating-point value `a' is less than
2043 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2044 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2045 | Arithmetic.
2046 *----------------------------------------------------------------------------*/
2047
2048 flag float32_le( float32 a, float32 b STATUS_PARAM )
2049 {
2050     flag aSign, bSign;
2051
2052     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2053          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2054        ) {
2055         float_raise( float_flag_invalid STATUS_VAR);
2056         return 0;
2057     }
2058     aSign = extractFloat32Sign( a );
2059     bSign = extractFloat32Sign( b );
2060     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2061     return ( a == b ) || ( aSign ^ ( a < b ) );
2062
2063 }
2064
2065 /*----------------------------------------------------------------------------
2066 | Returns 1 if the single-precision floating-point value `a' is less than
2067 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2068 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2069 *----------------------------------------------------------------------------*/
2070
2071 flag float32_lt( float32 a, float32 b STATUS_PARAM )
2072 {
2073     flag aSign, bSign;
2074
2075     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2076          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2077        ) {
2078         float_raise( float_flag_invalid STATUS_VAR);
2079         return 0;
2080     }
2081     aSign = extractFloat32Sign( a );
2082     bSign = extractFloat32Sign( b );
2083     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2084     return ( a != b ) && ( aSign ^ ( a < b ) );
2085
2086 }
2087
2088 /*----------------------------------------------------------------------------
2089 | Returns 1 if the single-precision floating-point value `a' is equal to
2090 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2091 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2092 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2093 *----------------------------------------------------------------------------*/
2094
2095 flag float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2096 {
2097
2098     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2099          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2100        ) {
2101         float_raise( float_flag_invalid STATUS_VAR);
2102         return 0;
2103     }
2104     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2105
2106 }
2107
2108 /*----------------------------------------------------------------------------
2109 | Returns 1 if the single-precision floating-point value `a' is less than or
2110 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2111 | cause an exception.  Otherwise, the comparison is performed according to the
2112 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2113 *----------------------------------------------------------------------------*/
2114
2115 flag float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2116 {
2117     flag aSign, bSign;
2118
2119     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2120          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2121        ) {
2122         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2123             float_raise( float_flag_invalid STATUS_VAR);
2124         }
2125         return 0;
2126     }
2127     aSign = extractFloat32Sign( a );
2128     bSign = extractFloat32Sign( b );
2129     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2130     return ( a == b ) || ( aSign ^ ( a < b ) );
2131
2132 }
2133
2134 /*----------------------------------------------------------------------------
2135 | Returns 1 if the single-precision floating-point value `a' is less than
2136 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2137 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2138 | Standard for Binary Floating-Point Arithmetic.
2139 *----------------------------------------------------------------------------*/
2140
2141 flag float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2142 {
2143     flag aSign, bSign;
2144
2145     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2146          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2147        ) {
2148         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2149             float_raise( float_flag_invalid STATUS_VAR);
2150         }
2151         return 0;
2152     }
2153     aSign = extractFloat32Sign( a );
2154     bSign = extractFloat32Sign( b );
2155     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2156     return ( a != b ) && ( aSign ^ ( a < b ) );
2157
2158 }
2159
2160 /*----------------------------------------------------------------------------
2161 | Returns the result of converting the double-precision floating-point value
2162 | `a' to the 32-bit two's complement integer format.  The conversion is
2163 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2164 | Arithmetic---which means in particular that the conversion is rounded
2165 | according to the current rounding mode.  If `a' is a NaN, the largest
2166 | positive integer is returned.  Otherwise, if the conversion overflows, the
2167 | largest integer with the same sign as `a' is returned.
2168 *----------------------------------------------------------------------------*/
2169
2170 int32 float64_to_int32( float64 a STATUS_PARAM )
2171 {
2172     flag aSign;
2173     int16 aExp, shiftCount;
2174     bits64 aSig;
2175
2176     aSig = extractFloat64Frac( a );
2177     aExp = extractFloat64Exp( a );
2178     aSign = extractFloat64Sign( a );
2179     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2180     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2181     shiftCount = 0x42C - aExp;
2182     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2183     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2184
2185 }
2186
2187 /*----------------------------------------------------------------------------
2188 | Returns the result of converting the double-precision floating-point value
2189 | `a' to the 32-bit two's complement integer format.  The conversion is
2190 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2191 | Arithmetic, except that the conversion is always rounded toward zero.
2192 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2193 | the conversion overflows, the largest integer with the same sign as `a' is
2194 | returned.
2195 *----------------------------------------------------------------------------*/
2196
2197 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2198 {
2199     flag aSign;
2200     int16 aExp, shiftCount;
2201     bits64 aSig, savedASig;
2202     int32 z;
2203
2204     aSig = extractFloat64Frac( a );
2205     aExp = extractFloat64Exp( a );
2206     aSign = extractFloat64Sign( a );
2207     if ( 0x41E < aExp ) {
2208         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2209         goto invalid;
2210     }
2211     else if ( aExp < 0x3FF ) {
2212         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2213         return 0;
2214     }
2215     aSig |= LIT64( 0x0010000000000000 );
2216     shiftCount = 0x433 - aExp;
2217     savedASig = aSig;
2218     aSig >>= shiftCount;
2219     z = aSig;
2220     if ( aSign ) z = - z;
2221     if ( ( z < 0 ) ^ aSign ) {
2222  invalid:
2223         float_raise( float_flag_invalid STATUS_VAR);
2224         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2225     }
2226     if ( ( aSig<<shiftCount ) != savedASig ) {
2227         STATUS(float_exception_flags) |= float_flag_inexact;
2228     }
2229     return z;
2230
2231 }
2232
2233 /*----------------------------------------------------------------------------
2234 | Returns the result of converting the double-precision floating-point value
2235 | `a' to the 64-bit two's complement integer format.  The conversion is
2236 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2237 | Arithmetic---which means in particular that the conversion is rounded
2238 | according to the current rounding mode.  If `a' is a NaN, the largest
2239 | positive integer is returned.  Otherwise, if the conversion overflows, the
2240 | largest integer with the same sign as `a' is returned.
2241 *----------------------------------------------------------------------------*/
2242
2243 int64 float64_to_int64( float64 a STATUS_PARAM )
2244 {
2245     flag aSign;
2246     int16 aExp, shiftCount;
2247     bits64 aSig, aSigExtra;
2248
2249     aSig = extractFloat64Frac( a );
2250     aExp = extractFloat64Exp( a );
2251     aSign = extractFloat64Sign( a );
2252     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2253     shiftCount = 0x433 - aExp;
2254     if ( shiftCount <= 0 ) {
2255         if ( 0x43E < aExp ) {
2256             float_raise( float_flag_invalid STATUS_VAR);
2257             if (    ! aSign
2258                  || (    ( aExp == 0x7FF )
2259                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2260                ) {
2261                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2262             }
2263             return (sbits64) LIT64( 0x8000000000000000 );
2264         }
2265         aSigExtra = 0;
2266         aSig <<= - shiftCount;
2267     }
2268     else {
2269         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2270     }
2271     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2272
2273 }
2274
2275 /*----------------------------------------------------------------------------
2276 | Returns the result of converting the double-precision floating-point value
2277 | `a' to the 64-bit two's complement integer format.  The conversion is
2278 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2279 | Arithmetic, except that the conversion is always rounded toward zero.
2280 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2281 | the conversion overflows, the largest integer with the same sign as `a' is
2282 | returned.
2283 *----------------------------------------------------------------------------*/
2284
2285 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2286 {
2287     flag aSign;
2288     int16 aExp, shiftCount;
2289     bits64 aSig;
2290     int64 z;
2291
2292     aSig = extractFloat64Frac( a );
2293     aExp = extractFloat64Exp( a );
2294     aSign = extractFloat64Sign( a );
2295     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2296     shiftCount = aExp - 0x433;
2297     if ( 0 <= shiftCount ) {
2298         if ( 0x43E <= aExp ) {
2299             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2300                 float_raise( float_flag_invalid STATUS_VAR);
2301                 if (    ! aSign
2302                      || (    ( aExp == 0x7FF )
2303                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2304                    ) {
2305                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2306                 }
2307             }
2308             return (sbits64) LIT64( 0x8000000000000000 );
2309         }
2310         z = aSig<<shiftCount;
2311     }
2312     else {
2313         if ( aExp < 0x3FE ) {
2314             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2315             return 0;
2316         }
2317         z = aSig>>( - shiftCount );
2318         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2319             STATUS(float_exception_flags) |= float_flag_inexact;
2320         }
2321     }
2322     if ( aSign ) z = - z;
2323     return z;
2324
2325 }
2326
2327 /*----------------------------------------------------------------------------
2328 | Returns the result of converting the double-precision floating-point value
2329 | `a' to the single-precision floating-point format.  The conversion is
2330 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2331 | Arithmetic.
2332 *----------------------------------------------------------------------------*/
2333
2334 float32 float64_to_float32( float64 a STATUS_PARAM )
2335 {
2336     flag aSign;
2337     int16 aExp;
2338     bits64 aSig;
2339     bits32 zSig;
2340
2341     aSig = extractFloat64Frac( a );
2342     aExp = extractFloat64Exp( a );
2343     aSign = extractFloat64Sign( a );
2344     if ( aExp == 0x7FF ) {
2345         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2346         return packFloat32( aSign, 0xFF, 0 );
2347     }
2348     shift64RightJamming( aSig, 22, &aSig );
2349     zSig = aSig;
2350     if ( aExp || zSig ) {
2351         zSig |= 0x40000000;
2352         aExp -= 0x381;
2353     }
2354     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2355
2356 }
2357
2358 #ifdef FLOATX80
2359
2360 /*----------------------------------------------------------------------------
2361 | Returns the result of converting the double-precision floating-point value
2362 | `a' to the extended double-precision floating-point format.  The conversion
2363 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2364 | Arithmetic.
2365 *----------------------------------------------------------------------------*/
2366
2367 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2368 {
2369     flag aSign;
2370     int16 aExp;
2371     bits64 aSig;
2372
2373     aSig = extractFloat64Frac( a );
2374     aExp = extractFloat64Exp( a );
2375     aSign = extractFloat64Sign( a );
2376     if ( aExp == 0x7FF ) {
2377         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2378         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2379     }
2380     if ( aExp == 0 ) {
2381         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2382         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2383     }
2384     return
2385         packFloatx80(
2386             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2387
2388 }
2389
2390 #endif
2391
2392 #ifdef FLOAT128
2393
2394 /*----------------------------------------------------------------------------
2395 | Returns the result of converting the double-precision floating-point value
2396 | `a' to the quadruple-precision floating-point format.  The conversion is
2397 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2398 | Arithmetic.
2399 *----------------------------------------------------------------------------*/
2400
2401 float128 float64_to_float128( float64 a STATUS_PARAM )
2402 {
2403     flag aSign;
2404     int16 aExp;
2405     bits64 aSig, zSig0, zSig1;
2406
2407     aSig = extractFloat64Frac( a );
2408     aExp = extractFloat64Exp( a );
2409     aSign = extractFloat64Sign( a );
2410     if ( aExp == 0x7FF ) {
2411         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2412         return packFloat128( aSign, 0x7FFF, 0, 0 );
2413     }
2414     if ( aExp == 0 ) {
2415         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2416         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2417         --aExp;
2418     }
2419     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2420     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2421
2422 }
2423
2424 #endif
2425
2426 /*----------------------------------------------------------------------------
2427 | Rounds the double-precision floating-point value `a' to an integer, and
2428 | returns the result as a double-precision floating-point value.  The
2429 | operation is performed according to the IEC/IEEE Standard for Binary
2430 | Floating-Point Arithmetic.
2431 *----------------------------------------------------------------------------*/
2432
2433 float64 float64_round_to_int( float64 a STATUS_PARAM )
2434 {
2435     flag aSign;
2436     int16 aExp;
2437     bits64 lastBitMask, roundBitsMask;
2438     int8 roundingMode;
2439     float64 z;
2440
2441     aExp = extractFloat64Exp( a );
2442     if ( 0x433 <= aExp ) {
2443         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2444             return propagateFloat64NaN( a, a STATUS_VAR );
2445         }
2446         return a;
2447     }
2448     if ( aExp < 0x3FF ) {
2449         if ( (bits64) ( a<<1 ) == 0 ) return a;
2450         STATUS(float_exception_flags) |= float_flag_inexact;
2451         aSign = extractFloat64Sign( a );
2452         switch ( STATUS(float_rounding_mode) ) {
2453          case float_round_nearest_even:
2454             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2455                 return packFloat64( aSign, 0x3FF, 0 );
2456             }
2457             break;
2458          case float_round_down:
2459             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2460          case float_round_up:
2461             return
2462             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2463         }
2464         return packFloat64( aSign, 0, 0 );
2465     }
2466     lastBitMask = 1;
2467     lastBitMask <<= 0x433 - aExp;
2468     roundBitsMask = lastBitMask - 1;
2469     z = a;
2470     roundingMode = STATUS(float_rounding_mode);
2471     if ( roundingMode == float_round_nearest_even ) {
2472         z += lastBitMask>>1;
2473         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2474     }
2475     else if ( roundingMode != float_round_to_zero ) {
2476         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2477             z += roundBitsMask;
2478         }
2479     }
2480     z &= ~ roundBitsMask;
2481     if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
2482     return z;
2483
2484 }
2485
2486 /*----------------------------------------------------------------------------
2487 | Returns the result of adding the absolute values of the double-precision
2488 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2489 | before being returned.  `zSign' is ignored if the result is a NaN.
2490 | The addition is performed according to the IEC/IEEE Standard for Binary
2491 | Floating-Point Arithmetic.
2492 *----------------------------------------------------------------------------*/
2493
2494 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2495 {
2496     int16 aExp, bExp, zExp;
2497     bits64 aSig, bSig, zSig;
2498     int16 expDiff;
2499
2500     aSig = extractFloat64Frac( a );
2501     aExp = extractFloat64Exp( a );
2502     bSig = extractFloat64Frac( b );
2503     bExp = extractFloat64Exp( b );
2504     expDiff = aExp - bExp;
2505     aSig <<= 9;
2506     bSig <<= 9;
2507     if ( 0 < expDiff ) {
2508         if ( aExp == 0x7FF ) {
2509             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2510             return a;
2511         }
2512         if ( bExp == 0 ) {
2513             --expDiff;
2514         }
2515         else {
2516             bSig |= LIT64( 0x2000000000000000 );
2517         }
2518         shift64RightJamming( bSig, expDiff, &bSig );
2519         zExp = aExp;
2520     }
2521     else if ( expDiff < 0 ) {
2522         if ( bExp == 0x7FF ) {
2523             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2524             return packFloat64( zSign, 0x7FF, 0 );
2525         }
2526         if ( aExp == 0 ) {
2527             ++expDiff;
2528         }
2529         else {
2530             aSig |= LIT64( 0x2000000000000000 );
2531         }
2532         shift64RightJamming( aSig, - expDiff, &aSig );
2533         zExp = bExp;
2534     }
2535     else {
2536         if ( aExp == 0x7FF ) {
2537             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2538             return a;
2539         }
2540         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2541         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2542         zExp = aExp;
2543         goto roundAndPack;
2544     }
2545     aSig |= LIT64( 0x2000000000000000 );
2546     zSig = ( aSig + bSig )<<1;
2547     --zExp;
2548     if ( (sbits64) zSig < 0 ) {
2549         zSig = aSig + bSig;
2550         ++zExp;
2551     }
2552  roundAndPack:
2553     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2554
2555 }
2556
2557 /*----------------------------------------------------------------------------
2558 | Returns the result of subtracting the absolute values of the double-
2559 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2560 | difference is negated before being returned.  `zSign' is ignored if the
2561 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2562 | Standard for Binary Floating-Point Arithmetic.
2563 *----------------------------------------------------------------------------*/
2564
2565 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2566 {
2567     int16 aExp, bExp, zExp;
2568     bits64 aSig, bSig, zSig;
2569     int16 expDiff;
2570
2571     aSig = extractFloat64Frac( a );
2572     aExp = extractFloat64Exp( a );
2573     bSig = extractFloat64Frac( b );
2574     bExp = extractFloat64Exp( b );
2575     expDiff = aExp - bExp;
2576     aSig <<= 10;
2577     bSig <<= 10;
2578     if ( 0 < expDiff ) goto aExpBigger;
2579     if ( expDiff < 0 ) goto bExpBigger;
2580     if ( aExp == 0x7FF ) {
2581         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2582         float_raise( float_flag_invalid STATUS_VAR);
2583         return float64_default_nan;
2584     }
2585     if ( aExp == 0 ) {
2586         aExp = 1;
2587         bExp = 1;
2588     }
2589     if ( bSig < aSig ) goto aBigger;
2590     if ( aSig < bSig ) goto bBigger;
2591     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2592  bExpBigger:
2593     if ( bExp == 0x7FF ) {
2594         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2595         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2596     }
2597     if ( aExp == 0 ) {
2598         ++expDiff;
2599     }
2600     else {
2601         aSig |= LIT64( 0x4000000000000000 );
2602     }
2603     shift64RightJamming( aSig, - expDiff, &aSig );
2604     bSig |= LIT64( 0x4000000000000000 );
2605  bBigger:
2606     zSig = bSig - aSig;
2607     zExp = bExp;
2608     zSign ^= 1;
2609     goto normalizeRoundAndPack;
2610  aExpBigger:
2611     if ( aExp == 0x7FF ) {
2612         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2613         return a;
2614     }
2615     if ( bExp == 0 ) {
2616         --expDiff;
2617     }
2618     else {
2619         bSig |= LIT64( 0x4000000000000000 );
2620     }
2621     shift64RightJamming( bSig, expDiff, &bSig );
2622     aSig |= LIT64( 0x4000000000000000 );
2623  aBigger:
2624     zSig = aSig - bSig;
2625     zExp = aExp;
2626  normalizeRoundAndPack:
2627     --zExp;
2628     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2629
2630 }
2631
2632 /*----------------------------------------------------------------------------
2633 | Returns the result of adding the double-precision floating-point values `a'
2634 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2635 | Binary Floating-Point Arithmetic.
2636 *----------------------------------------------------------------------------*/
2637
2638 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2639 {
2640     flag aSign, bSign;
2641
2642     aSign = extractFloat64Sign( a );
2643     bSign = extractFloat64Sign( b );
2644     if ( aSign == bSign ) {
2645         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2646     }
2647     else {
2648         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2649     }
2650
2651 }
2652
2653 /*----------------------------------------------------------------------------
2654 | Returns the result of subtracting the double-precision floating-point values
2655 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2656 | for Binary Floating-Point Arithmetic.
2657 *----------------------------------------------------------------------------*/
2658
2659 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2660 {
2661     flag aSign, bSign;
2662
2663     aSign = extractFloat64Sign( a );
2664     bSign = extractFloat64Sign( b );
2665     if ( aSign == bSign ) {
2666         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2667     }
2668     else {
2669         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2670     }
2671
2672 }
2673
2674 /*----------------------------------------------------------------------------
2675 | Returns the result of multiplying the double-precision floating-point values
2676 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2677 | for Binary Floating-Point Arithmetic.
2678 *----------------------------------------------------------------------------*/
2679
2680 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2681 {
2682     flag aSign, bSign, zSign;
2683     int16 aExp, bExp, zExp;
2684     bits64 aSig, bSig, zSig0, zSig1;
2685
2686     aSig = extractFloat64Frac( a );
2687     aExp = extractFloat64Exp( a );
2688     aSign = extractFloat64Sign( a );
2689     bSig = extractFloat64Frac( b );
2690     bExp = extractFloat64Exp( b );
2691     bSign = extractFloat64Sign( b );
2692     zSign = aSign ^ bSign;
2693     if ( aExp == 0x7FF ) {
2694         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2695             return propagateFloat64NaN( a, b STATUS_VAR );
2696         }
2697         if ( ( bExp | bSig ) == 0 ) {
2698             float_raise( float_flag_invalid STATUS_VAR);
2699             return float64_default_nan;
2700         }
2701         return packFloat64( zSign, 0x7FF, 0 );
2702     }
2703     if ( bExp == 0x7FF ) {
2704         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2705         if ( ( aExp | aSig ) == 0 ) {
2706             float_raise( float_flag_invalid STATUS_VAR);
2707             return float64_default_nan;
2708         }
2709         return packFloat64( zSign, 0x7FF, 0 );
2710     }
2711     if ( aExp == 0 ) {
2712         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2713         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2714     }
2715     if ( bExp == 0 ) {
2716         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2717         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2718     }
2719     zExp = aExp + bExp - 0x3FF;
2720     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2721     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2722     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2723     zSig0 |= ( zSig1 != 0 );
2724     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2725         zSig0 <<= 1;
2726         --zExp;
2727     }
2728     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2729
2730 }
2731
2732 /*----------------------------------------------------------------------------
2733 | Returns the result of dividing the double-precision floating-point value `a'
2734 | by the corresponding value `b'.  The operation is performed according to
2735 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2736 *----------------------------------------------------------------------------*/
2737
2738 float64 float64_div( float64 a, float64 b STATUS_PARAM )
2739 {
2740     flag aSign, bSign, zSign;
2741     int16 aExp, bExp, zExp;
2742     bits64 aSig, bSig, zSig;
2743     bits64 rem0, rem1;
2744     bits64 term0, term1;
2745
2746     aSig = extractFloat64Frac( a );
2747     aExp = extractFloat64Exp( a );
2748     aSign = extractFloat64Sign( a );
2749     bSig = extractFloat64Frac( b );
2750     bExp = extractFloat64Exp( b );
2751     bSign = extractFloat64Sign( b );
2752     zSign = aSign ^ bSign;
2753     if ( aExp == 0x7FF ) {
2754         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2755         if ( bExp == 0x7FF ) {
2756             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2757             float_raise( float_flag_invalid STATUS_VAR);
2758             return float64_default_nan;
2759         }
2760         return packFloat64( zSign, 0x7FF, 0 );
2761     }
2762     if ( bExp == 0x7FF ) {
2763         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2764         return packFloat64( zSign, 0, 0 );
2765     }
2766     if ( bExp == 0 ) {
2767         if ( bSig == 0 ) {
2768             if ( ( aExp | aSig ) == 0 ) {
2769                 float_raise( float_flag_invalid STATUS_VAR);
2770                 return float64_default_nan;
2771             }
2772             float_raise( float_flag_divbyzero STATUS_VAR);
2773             return packFloat64( zSign, 0x7FF, 0 );
2774         }
2775         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2776     }
2777     if ( aExp == 0 ) {
2778         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2779         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2780     }
2781     zExp = aExp - bExp + 0x3FD;
2782     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2783     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2784     if ( bSig <= ( aSig + aSig ) ) {
2785         aSig >>= 1;
2786         ++zExp;
2787     }
2788     zSig = estimateDiv128To64( aSig, 0, bSig );
2789     if ( ( zSig & 0x1FF ) <= 2 ) {
2790         mul64To128( bSig, zSig, &term0, &term1 );
2791         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2792         while ( (sbits64) rem0 < 0 ) {
2793             --zSig;
2794             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2795         }
2796         zSig |= ( rem1 != 0 );
2797     }
2798     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2799
2800 }
2801
2802 /*----------------------------------------------------------------------------
2803 | Returns the remainder of the double-precision floating-point value `a'
2804 | with respect to the corresponding value `b'.  The operation is performed
2805 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2806 *----------------------------------------------------------------------------*/
2807
2808 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2809 {
2810     flag aSign, bSign, zSign;
2811     int16 aExp, bExp, expDiff;
2812     bits64 aSig, bSig;
2813     bits64 q, alternateASig;
2814     sbits64 sigMean;
2815
2816     aSig = extractFloat64Frac( a );
2817     aExp = extractFloat64Exp( a );
2818     aSign = extractFloat64Sign( a );
2819     bSig = extractFloat64Frac( b );
2820     bExp = extractFloat64Exp( b );
2821     bSign = extractFloat64Sign( b );
2822     if ( aExp == 0x7FF ) {
2823         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2824             return propagateFloat64NaN( a, b STATUS_VAR );
2825         }
2826         float_raise( float_flag_invalid STATUS_VAR);
2827         return float64_default_nan;
2828     }
2829     if ( bExp == 0x7FF ) {
2830         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2831         return a;
2832     }
2833     if ( bExp == 0 ) {
2834         if ( bSig == 0 ) {
2835             float_raise( float_flag_invalid STATUS_VAR);
2836             return float64_default_nan;
2837         }
2838         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2839     }
2840     if ( aExp == 0 ) {
2841         if ( aSig == 0 ) return a;
2842         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2843     }
2844     expDiff = aExp - bExp;
2845     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2846     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2847     if ( expDiff < 0 ) {
2848         if ( expDiff < -1 ) return a;
2849         aSig >>= 1;
2850     }
2851     q = ( bSig <= aSig );
2852     if ( q ) aSig -= bSig;
2853     expDiff -= 64;
2854     while ( 0 < expDiff ) {
2855         q = estimateDiv128To64( aSig, 0, bSig );
2856         q = ( 2 < q ) ? q - 2 : 0;
2857         aSig = - ( ( bSig>>2 ) * q );
2858         expDiff -= 62;
2859     }
2860     expDiff += 64;
2861     if ( 0 < expDiff ) {
2862         q = estimateDiv128To64( aSig, 0, bSig );
2863         q = ( 2 < q ) ? q - 2 : 0;
2864         q >>= 64 - expDiff;
2865         bSig >>= 2;
2866         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2867     }
2868     else {
2869         aSig >>= 2;
2870         bSig >>= 2;
2871     }
2872     do {
2873         alternateASig = aSig;
2874         ++q;
2875         aSig -= bSig;
2876     } while ( 0 <= (sbits64) aSig );
2877     sigMean = aSig + alternateASig;
2878     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2879         aSig = alternateASig;
2880     }
2881     zSign = ( (sbits64) aSig < 0 );
2882     if ( zSign ) aSig = - aSig;
2883     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2884
2885 }
2886
2887 /*----------------------------------------------------------------------------
2888 | Returns the square root of the double-precision floating-point value `a'.
2889 | The operation is performed according to the IEC/IEEE Standard for Binary
2890 | Floating-Point Arithmetic.
2891 *----------------------------------------------------------------------------*/
2892
2893 float64 float64_sqrt( float64 a STATUS_PARAM )
2894 {
2895     flag aSign;
2896     int16 aExp, zExp;
2897     bits64 aSig, zSig, doubleZSig;
2898     bits64 rem0, rem1, term0, term1;
2899
2900     aSig = extractFloat64Frac( a );
2901     aExp = extractFloat64Exp( a );
2902     aSign = extractFloat64Sign( a );
2903     if ( aExp == 0x7FF ) {
2904         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2905         if ( ! aSign ) return a;
2906         float_raise( float_flag_invalid STATUS_VAR);
2907         return float64_default_nan;
2908     }
2909     if ( aSign ) {
2910         if ( ( aExp | aSig ) == 0 ) return a;
2911         float_raise( float_flag_invalid STATUS_VAR);
2912         return float64_default_nan;
2913     }
2914     if ( aExp == 0 ) {
2915         if ( aSig == 0 ) return 0;
2916         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2917     }
2918     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2919     aSig |= LIT64( 0x0010000000000000 );
2920     zSig = estimateSqrt32( aExp, aSig>>21 );
2921     aSig <<= 9 - ( aExp & 1 );
2922     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2923     if ( ( zSig & 0x1FF ) <= 5 ) {
2924         doubleZSig = zSig<<1;
2925         mul64To128( zSig, zSig, &term0, &term1 );
2926         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2927         while ( (sbits64) rem0 < 0 ) {
2928             --zSig;
2929             doubleZSig -= 2;
2930             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2931         }
2932         zSig |= ( ( rem0 | rem1 ) != 0 );
2933     }
2934     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2935
2936 }
2937
2938 /*----------------------------------------------------------------------------
2939 | Returns 1 if the double-precision floating-point value `a' is equal to the
2940 | corresponding value `b', and 0 otherwise.  The comparison is performed
2941 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2942 *----------------------------------------------------------------------------*/
2943
2944 flag float64_eq( float64 a, float64 b STATUS_PARAM )
2945 {
2946
2947     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2948          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2949        ) {
2950         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2951             float_raise( float_flag_invalid STATUS_VAR);
2952         }
2953         return 0;
2954     }
2955     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2956
2957 }
2958
2959 /*----------------------------------------------------------------------------
2960 | Returns 1 if the double-precision floating-point value `a' is less than or
2961 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
2962 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2963 | Arithmetic.
2964 *----------------------------------------------------------------------------*/
2965
2966 flag float64_le( float64 a, float64 b STATUS_PARAM )
2967 {
2968     flag aSign, bSign;
2969
2970     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2971          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2972        ) {
2973         float_raise( float_flag_invalid STATUS_VAR);
2974         return 0;
2975     }
2976     aSign = extractFloat64Sign( a );
2977     bSign = extractFloat64Sign( b );
2978     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2979     return ( a == b ) || ( aSign ^ ( a < b ) );
2980
2981 }
2982
2983 /*----------------------------------------------------------------------------
2984 | Returns 1 if the double-precision floating-point value `a' is less than
2985 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2986 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2987 *----------------------------------------------------------------------------*/
2988
2989 flag float64_lt( float64 a, float64 b STATUS_PARAM )
2990 {
2991     flag aSign, bSign;
2992
2993     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2994          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2995        ) {
2996         float_raise( float_flag_invalid STATUS_VAR);
2997         return 0;
2998     }
2999     aSign = extractFloat64Sign( a );
3000     bSign = extractFloat64Sign( b );
3001     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3002     return ( a != b ) && ( aSign ^ ( a < b ) );
3003
3004 }
3005
3006 /*----------------------------------------------------------------------------
3007 | Returns 1 if the double-precision floating-point value `a' is equal to the
3008 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3009 | if either operand is a NaN.  Otherwise, the comparison is performed
3010 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3011 *----------------------------------------------------------------------------*/
3012
3013 flag float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3014 {
3015
3016     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3017          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3018        ) {
3019         float_raise( float_flag_invalid STATUS_VAR);
3020         return 0;
3021     }
3022     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3023
3024 }
3025
3026 /*----------------------------------------------------------------------------
3027 | Returns 1 if the double-precision floating-point value `a' is less than or
3028 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3029 | cause an exception.  Otherwise, the comparison is performed according to the
3030 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3031 *----------------------------------------------------------------------------*/
3032
3033 flag float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3034 {
3035     flag aSign, bSign;
3036
3037     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3038          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3039        ) {
3040         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3041             float_raise( float_flag_invalid STATUS_VAR);
3042         }
3043         return 0;
3044     }
3045     aSign = extractFloat64Sign( a );
3046     bSign = extractFloat64Sign( b );
3047     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3048     return ( a == b ) || ( aSign ^ ( a < b ) );
3049
3050 }
3051
3052 /*----------------------------------------------------------------------------
3053 | Returns 1 if the double-precision floating-point value `a' is less than
3054 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3055 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3056 | Standard for Binary Floating-Point Arithmetic.
3057 *----------------------------------------------------------------------------*/
3058
3059 flag float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3060 {
3061     flag aSign, bSign;
3062
3063     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3064          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3065        ) {
3066         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3067             float_raise( float_flag_invalid STATUS_VAR);
3068         }
3069         return 0;
3070     }
3071     aSign = extractFloat64Sign( a );
3072     bSign = extractFloat64Sign( b );
3073     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3074     return ( a != b ) && ( aSign ^ ( a < b ) );
3075
3076 }
3077
3078 #ifdef FLOATX80
3079
3080 /*----------------------------------------------------------------------------
3081 | Returns the result of converting the extended double-precision floating-
3082 | point value `a' to the 32-bit two's complement integer format.  The
3083 | conversion is performed according to the IEC/IEEE Standard for Binary
3084 | Floating-Point Arithmetic---which means in particular that the conversion
3085 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3086 | largest positive integer is returned.  Otherwise, if the conversion
3087 | overflows, the largest integer with the same sign as `a' is returned.
3088 *----------------------------------------------------------------------------*/
3089
3090 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3091 {
3092     flag aSign;
3093     int32 aExp, shiftCount;
3094     bits64 aSig;
3095
3096     aSig = extractFloatx80Frac( a );
3097     aExp = extractFloatx80Exp( a );
3098     aSign = extractFloatx80Sign( a );
3099     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3100     shiftCount = 0x4037 - aExp;
3101     if ( shiftCount <= 0 ) shiftCount = 1;
3102     shift64RightJamming( aSig, shiftCount, &aSig );
3103     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3104
3105 }
3106
3107 /*----------------------------------------------------------------------------
3108 | Returns the result of converting the extended double-precision floating-
3109 | point value `a' to the 32-bit two's complement integer format.  The
3110 | conversion is performed according to the IEC/IEEE Standard for Binary
3111 | Floating-Point Arithmetic, except that the conversion is always rounded
3112 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3113 | Otherwise, if the conversion overflows, the largest integer with the same
3114 | sign as `a' is returned.
3115 *----------------------------------------------------------------------------*/
3116
3117 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3118 {
3119     flag aSign;
3120     int32 aExp, shiftCount;
3121     bits64 aSig, savedASig;
3122     int32 z;
3123
3124     aSig = extractFloatx80Frac( a );
3125     aExp = extractFloatx80Exp( a );
3126     aSign = extractFloatx80Sign( a );
3127     if ( 0x401E < aExp ) {
3128         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3129         goto invalid;
3130     }
3131     else if ( aExp < 0x3FFF ) {
3132         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3133         return 0;
3134     }
3135     shiftCount = 0x403E - aExp;
3136     savedASig = aSig;
3137     aSig >>= shiftCount;
3138     z = aSig;
3139     if ( aSign ) z = - z;
3140     if ( ( z < 0 ) ^ aSign ) {
3141  invalid:
3142         float_raise( float_flag_invalid STATUS_VAR);
3143         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3144     }
3145     if ( ( aSig<<shiftCount ) != savedASig ) {
3146         STATUS(float_exception_flags) |= float_flag_inexact;
3147     }
3148     return z;
3149
3150 }
3151
3152 /*----------------------------------------------------------------------------
3153 | Returns the result of converting the extended double-precision floating-
3154 | point value `a' to the 64-bit two's complement integer format.  The
3155 | conversion is performed according to the IEC/IEEE Standard for Binary
3156 | Floating-Point Arithmetic---which means in particular that the conversion
3157 | is rounded according to the current rounding mode.  If `a' is a NaN,
3158 | the largest positive integer is returned.  Otherwise, if the conversion
3159 | overflows, the largest integer with the same sign as `a' is returned.
3160 *----------------------------------------------------------------------------*/
3161
3162 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3163 {
3164     flag aSign;
3165     int32 aExp, shiftCount;
3166     bits64 aSig, aSigExtra;
3167
3168     aSig = extractFloatx80Frac( a );
3169     aExp = extractFloatx80Exp( a );
3170     aSign = extractFloatx80Sign( a );
3171     shiftCount = 0x403E - aExp;
3172     if ( shiftCount <= 0 ) {
3173         if ( shiftCount ) {
3174             float_raise( float_flag_invalid STATUS_VAR);
3175             if (    ! aSign
3176                  || (    ( aExp == 0x7FFF )
3177                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3178                ) {
3179                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3180             }
3181             return (sbits64) LIT64( 0x8000000000000000 );
3182         }
3183         aSigExtra = 0;
3184     }
3185     else {
3186         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3187     }
3188     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3189
3190 }
3191
3192 /*----------------------------------------------------------------------------
3193 | Returns the result of converting the extended double-precision floating-
3194 | point value `a' to the 64-bit two's complement integer format.  The
3195 | conversion is performed according to the IEC/IEEE Standard for Binary
3196 | Floating-Point Arithmetic, except that the conversion is always rounded
3197 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3198 | Otherwise, if the conversion overflows, the largest integer with the same
3199 | sign as `a' is returned.
3200 *----------------------------------------------------------------------------*/
3201
3202 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3203 {
3204     flag aSign;
3205     int32 aExp, shiftCount;
3206     bits64 aSig;
3207     int64 z;
3208
3209     aSig = extractFloatx80Frac( a );
3210     aExp = extractFloatx80Exp( a );
3211     aSign = extractFloatx80Sign( a );
3212     shiftCount = aExp - 0x403E;
3213     if ( 0 <= shiftCount ) {
3214         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3215         if ( ( a.high != 0xC03E ) || aSig ) {
3216             float_raise( float_flag_invalid STATUS_VAR);
3217             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3218                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3219             }
3220         }
3221         return (sbits64) LIT64( 0x8000000000000000 );
3222     }
3223     else if ( aExp < 0x3FFF ) {
3224         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3225         return 0;
3226     }
3227     z = aSig>>( - shiftCount );
3228     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3229         STATUS(float_exception_flags) |= float_flag_inexact;
3230     }
3231     if ( aSign ) z = - z;
3232     return z;
3233
3234 }
3235
3236 /*----------------------------------------------------------------------------
3237 | Returns the result of converting the extended double-precision floating-
3238 | point value `a' to the single-precision floating-point format.  The
3239 | conversion is performed according to the IEC/IEEE Standard for Binary
3240 | Floating-Point Arithmetic.
3241 *----------------------------------------------------------------------------*/
3242
3243 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3244 {
3245     flag aSign;
3246     int32 aExp;
3247     bits64 aSig;
3248
3249     aSig = extractFloatx80Frac( a );
3250     aExp = extractFloatx80Exp( a );
3251     aSign = extractFloatx80Sign( a );
3252     if ( aExp == 0x7FFF ) {
3253         if ( (bits64) ( aSig<<1 ) ) {
3254             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3255         }
3256         return packFloat32( aSign, 0xFF, 0 );
3257     }
3258     shift64RightJamming( aSig, 33, &aSig );
3259     if ( aExp || aSig ) aExp -= 0x3F81;
3260     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3261
3262 }
3263
3264 /*----------------------------------------------------------------------------
3265 | Returns the result of converting the extended double-precision floating-
3266 | point value `a' to the double-precision floating-point format.  The
3267 | conversion is performed according to the IEC/IEEE Standard for Binary
3268 | Floating-Point Arithmetic.
3269 *----------------------------------------------------------------------------*/
3270
3271 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3272 {
3273     flag aSign;
3274     int32 aExp;
3275     bits64 aSig, zSig;
3276
3277     aSig = extractFloatx80Frac( a );
3278     aExp = extractFloatx80Exp( a );
3279     aSign = extractFloatx80Sign( a );
3280     if ( aExp == 0x7FFF ) {
3281         if ( (bits64) ( aSig<<1 ) ) {
3282             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3283         }
3284         return packFloat64( aSign, 0x7FF, 0 );
3285     }
3286     shift64RightJamming( aSig, 1, &zSig );
3287     if ( aExp || aSig ) aExp -= 0x3C01;
3288     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3289
3290 }
3291
3292 #ifdef FLOAT128
3293
3294 /*----------------------------------------------------------------------------
3295 | Returns the result of converting the extended double-precision floating-
3296 | point value `a' to the quadruple-precision floating-point format.  The
3297 | conversion is performed according to the IEC/IEEE Standard for Binary
3298 | Floating-Point Arithmetic.
3299 *----------------------------------------------------------------------------*/
3300
3301 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3302 {
3303     flag aSign;
3304     int16 aExp;
3305     bits64 aSig, zSig0, zSig1;
3306
3307     aSig = extractFloatx80Frac( a );
3308     aExp = extractFloatx80Exp( a );
3309     aSign = extractFloatx80Sign( a );
3310     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3311         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3312     }
3313     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3314     return packFloat128( aSign, aExp, zSig0, zSig1 );
3315
3316 }
3317
3318 #endif
3319
3320 /*----------------------------------------------------------------------------
3321 | Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
3323 | value.  The operation is performed according to the IEC/IEEE Standard for
3324 | Binary Floating-Point Arithmetic.
3325 *----------------------------------------------------------------------------*/
3326
3327 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3328 {
3329     flag aSign;
3330     int32 aExp;
3331     bits64 lastBitMask, roundBitsMask;
3332     int8 roundingMode;
3333     floatx80 z;
3334
3335     aExp = extractFloatx80Exp( a );
3336     if ( 0x403E <= aExp ) {
3337         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3338             return propagateFloatx80NaN( a, a STATUS_VAR );
3339         }
3340         return a;
3341     }
3342     if ( aExp < 0x3FFF ) {
3343         if (    ( aExp == 0 )
3344              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3345             return a;
3346         }
3347         STATUS(float_exception_flags) |= float_flag_inexact;
3348         aSign = extractFloatx80Sign( a );
3349         switch ( STATUS(float_rounding_mode) ) {
3350          case float_round_nearest_even:
3351             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3352                ) {
3353                 return
3354                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3355             }
3356             break;
3357          case float_round_down:
3358             return
3359                   aSign ?
3360                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3361                 : packFloatx80( 0, 0, 0 );
3362          case float_round_up:
3363             return
3364                   aSign ? packFloatx80( 1, 0, 0 )
3365                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3366         }
3367         return packFloatx80( aSign, 0, 0 );
3368     }
3369     lastBitMask = 1;
3370     lastBitMask <<= 0x403E - aExp;
3371     roundBitsMask = lastBitMask - 1;
3372     z = a;
3373     roundingMode = STATUS(float_rounding_mode);
3374     if ( roundingMode == float_round_nearest_even ) {
3375         z.low += lastBitMask>>1;
3376         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3377     }
3378     else if ( roundingMode != float_round_to_zero ) {
3379         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3380             z.low += roundBitsMask;
3381         }
3382     }
3383     z.low &= ~ roundBitsMask;
3384     if ( z.low == 0 ) {
3385         ++z.high;
3386         z.low = LIT64( 0x8000000000000000 );
3387     }
3388     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3389     return z;
3390
3391 }
3392
3393 /*----------------------------------------------------------------------------
3394 | Returns the result of adding the absolute values of the extended double-
3395 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3396 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3397 | The addition is performed according to the IEC/IEEE Standard for Binary
3398 | Floating-Point Arithmetic.
3399 *----------------------------------------------------------------------------*/
3400
3401 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3402 {
3403     int32 aExp, bExp, zExp;
3404     bits64 aSig, bSig, zSig0, zSig1;
3405     int32 expDiff;
3406
3407     aSig = extractFloatx80Frac( a );
3408     aExp = extractFloatx80Exp( a );
3409     bSig = extractFloatx80Frac( b );
3410     bExp = extractFloatx80Exp( b );
3411     expDiff = aExp - bExp;
3412     if ( 0 < expDiff ) {
3413         if ( aExp == 0x7FFF ) {
3414             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3415             return a;
3416         }
3417         if ( bExp == 0 ) --expDiff;
3418         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3419         zExp = aExp;
3420     }
3421     else if ( expDiff < 0 ) {
3422         if ( bExp == 0x7FFF ) {
3423             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3424             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3425         }
3426         if ( aExp == 0 ) ++expDiff;
3427         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3428         zExp = bExp;
3429     }
3430     else {
3431         if ( aExp == 0x7FFF ) {
3432             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3433                 return propagateFloatx80NaN( a, b STATUS_VAR );
3434             }
3435             return a;
3436         }
3437         zSig1 = 0;
3438         zSig0 = aSig + bSig;
3439         if ( aExp == 0 ) {
3440             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3441             goto roundAndPack;
3442         }
3443         zExp = aExp;
3444         goto shiftRight1;
3445     }
3446     zSig0 = aSig + bSig;
3447     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3448  shiftRight1:
3449     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3450     zSig0 |= LIT64( 0x8000000000000000 );
3451     ++zExp;
3452  roundAndPack:
3453     return
3454         roundAndPackFloatx80(
3455             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3456
3457 }
3458
3459 /*----------------------------------------------------------------------------
3460 | Returns the result of subtracting the absolute values of the extended
3461 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3462 | difference is negated before being returned.  `zSign' is ignored if the
3463 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3464 | Standard for Binary Floating-Point Arithmetic.
3465 *----------------------------------------------------------------------------*/
3466
3467 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3468 {
3469     int32 aExp, bExp, zExp;
3470     bits64 aSig, bSig, zSig0, zSig1;
3471     int32 expDiff;
3472     floatx80 z;
3473
3474     aSig = extractFloatx80Frac( a );
3475     aExp = extractFloatx80Exp( a );
3476     bSig = extractFloatx80Frac( b );
3477     bExp = extractFloatx80Exp( b );
3478     expDiff = aExp - bExp;
3479     if ( 0 < expDiff ) goto aExpBigger;
3480     if ( expDiff < 0 ) goto bExpBigger;
3481     if ( aExp == 0x7FFF ) {
3482         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3483             return propagateFloatx80NaN( a, b STATUS_VAR );
3484         }
3485         float_raise( float_flag_invalid STATUS_VAR);
3486         z.low = floatx80_default_nan_low;
3487         z.high = floatx80_default_nan_high;
3488         return z;
3489     }
3490     if ( aExp == 0 ) {
3491         aExp = 1;
3492         bExp = 1;
3493     }
3494     zSig1 = 0;
3495     if ( bSig < aSig ) goto aBigger;
3496     if ( aSig < bSig ) goto bBigger;
3497     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3498  bExpBigger:
3499     if ( bExp == 0x7FFF ) {
3500         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3501         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3502     }
3503     if ( aExp == 0 ) ++expDiff;
3504     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3505  bBigger:
3506     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3507     zExp = bExp;
3508     zSign ^= 1;
3509     goto normalizeRoundAndPack;
3510  aExpBigger:
3511     if ( aExp == 0x7FFF ) {
3512         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3513         return a;
3514     }
3515     if ( bExp == 0 ) --expDiff;
3516     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3517  aBigger:
3518     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3519     zExp = aExp;
3520  normalizeRoundAndPack:
3521     return
3522         normalizeRoundAndPackFloatx80(
3523             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3524
3525 }
3526
3527 /*----------------------------------------------------------------------------
3528 | Returns the result of adding the extended double-precision floating-point
3529 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3530 | Standard for Binary Floating-Point Arithmetic.
3531 *----------------------------------------------------------------------------*/
3532
3533 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3534 {
3535     flag aSign, bSign;
3536
3537     aSign = extractFloatx80Sign( a );
3538     bSign = extractFloatx80Sign( b );
3539     if ( aSign == bSign ) {
3540         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3541     }
3542     else {
3543         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3544     }
3545
3546 }
3547
3548 /*----------------------------------------------------------------------------
3549 | Returns the result of subtracting the extended double-precision floating-
3550 | point values `a' and `b'.  The operation is performed according to the
3551 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3552 *----------------------------------------------------------------------------*/
3553
3554 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3555 {
3556     flag aSign, bSign;
3557
3558     aSign = extractFloatx80Sign( a );
3559     bSign = extractFloatx80Sign( b );
3560     if ( aSign == bSign ) {
3561         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3562     }
3563     else {
3564         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3565     }
3566
3567 }
3568
3569 /*----------------------------------------------------------------------------
3570 | Returns the result of multiplying the extended double-precision floating-
3571 | point values `a' and `b'.  The operation is performed according to the
3572 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3573 *----------------------------------------------------------------------------*/
3574
3575 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3576 {
3577     flag aSign, bSign, zSign;
3578     int32 aExp, bExp, zExp;
3579     bits64 aSig, bSig, zSig0, zSig1;
3580     floatx80 z;
3581
3582     aSig = extractFloatx80Frac( a );
3583     aExp = extractFloatx80Exp( a );
3584     aSign = extractFloatx80Sign( a );
3585     bSig = extractFloatx80Frac( b );
3586     bExp = extractFloatx80Exp( b );
3587     bSign = extractFloatx80Sign( b );
3588     zSign = aSign ^ bSign;
3589     if ( aExp == 0x7FFF ) {
3590         if (    (bits64) ( aSig<<1 )
3591              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3592             return propagateFloatx80NaN( a, b STATUS_VAR );
3593         }
3594         if ( ( bExp | bSig ) == 0 ) goto invalid;
3595         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3596     }
3597     if ( bExp == 0x7FFF ) {
3598         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3599         if ( ( aExp | aSig ) == 0 ) {
3600  invalid:
3601             float_raise( float_flag_invalid STATUS_VAR);
3602             z.low = floatx80_default_nan_low;
3603             z.high = floatx80_default_nan_high;
3604             return z;
3605         }
3606         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3607     }
3608     if ( aExp == 0 ) {
3609         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3610         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3611     }
3612     if ( bExp == 0 ) {
3613         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3614         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3615     }
3616     zExp = aExp + bExp - 0x3FFE;
3617     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3618     if ( 0 < (sbits64) zSig0 ) {
3619         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3620         --zExp;
3621     }
3622     return
3623         roundAndPackFloatx80(
3624             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3625
3626 }
3627
3628 /*----------------------------------------------------------------------------
3629 | Returns the result of dividing the extended double-precision floating-point
3630 | value `a' by the corresponding value `b'.  The operation is performed
3631 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3632 *----------------------------------------------------------------------------*/
3633
3634 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3635 {
3636     flag aSign, bSign, zSign;
3637     int32 aExp, bExp, zExp;
3638     bits64 aSig, bSig, zSig0, zSig1;
3639     bits64 rem0, rem1, rem2, term0, term1, term2;
3640     floatx80 z;
3641
3642     aSig = extractFloatx80Frac( a );
3643     aExp = extractFloatx80Exp( a );
3644     aSign = extractFloatx80Sign( a );
3645     bSig = extractFloatx80Frac( b );
3646     bExp = extractFloatx80Exp( b );
3647     bSign = extractFloatx80Sign( b );
3648     zSign = aSign ^ bSign;
3649     if ( aExp == 0x7FFF ) {
3650         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3651         if ( bExp == 0x7FFF ) {
3652             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3653             goto invalid;
3654         }
3655         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3656     }
3657     if ( bExp == 0x7FFF ) {
3658         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3659         return packFloatx80( zSign, 0, 0 );
3660     }
3661     if ( bExp == 0 ) {
3662         if ( bSig == 0 ) {
3663             if ( ( aExp | aSig ) == 0 ) {
3664  invalid:
3665                 float_raise( float_flag_invalid STATUS_VAR);
3666                 z.low = floatx80_default_nan_low;
3667                 z.high = floatx80_default_nan_high;
3668                 return z;
3669             }
3670             float_raise( float_flag_divbyzero STATUS_VAR);
3671             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3672         }
3673         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3674     }
3675     if ( aExp == 0 ) {
3676         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3677         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3678     }
3679     zExp = aExp - bExp + 0x3FFE;
3680     rem1 = 0;
3681     if ( bSig <= aSig ) {
3682         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3683         ++zExp;
3684     }
3685     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3686     mul64To128( bSig, zSig0, &term0, &term1 );
3687     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3688     while ( (sbits64) rem0 < 0 ) {
3689         --zSig0;
3690         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3691     }
3692     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3693     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3694         mul64To128( bSig, zSig1, &term1, &term2 );
3695         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3696         while ( (sbits64) rem1 < 0 ) {
3697             --zSig1;
3698             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3699         }
3700         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3701     }
3702     return
3703         roundAndPackFloatx80(
3704             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3705
3706 }
3707
3708 /*----------------------------------------------------------------------------
3709 | Returns the remainder of the extended double-precision floating-point value
3710 | `a' with respect to the corresponding value `b'.  The operation is performed
3711 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3712 *----------------------------------------------------------------------------*/
3713
3714 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3715 {
3716     flag aSign, bSign, zSign;
3717     int32 aExp, bExp, expDiff;
3718     bits64 aSig0, aSig1, bSig;
3719     bits64 q, term0, term1, alternateASig0, alternateASig1;
3720     floatx80 z;
3721
3722     aSig0 = extractFloatx80Frac( a );
3723     aExp = extractFloatx80Exp( a );
3724     aSign = extractFloatx80Sign( a );
3725     bSig = extractFloatx80Frac( b );
3726     bExp = extractFloatx80Exp( b );
3727     bSign = extractFloatx80Sign( b );
3728     if ( aExp == 0x7FFF ) {
3729         if (    (bits64) ( aSig0<<1 )
3730              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3731             return propagateFloatx80NaN( a, b STATUS_VAR );
3732         }
3733         goto invalid;
3734     }
3735     if ( bExp == 0x7FFF ) {
3736         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3737         return a;
3738     }
3739     if ( bExp == 0 ) {
3740         if ( bSig == 0 ) {
3741  invalid:
3742             float_raise( float_flag_invalid STATUS_VAR);
3743             z.low = floatx80_default_nan_low;
3744             z.high = floatx80_default_nan_high;
3745             return z;
3746         }
3747         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3748     }
3749     if ( aExp == 0 ) {
3750         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3751         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3752     }
3753     bSig |= LIT64( 0x8000000000000000 );
3754     zSign = aSign;
3755     expDiff = aExp - bExp;
3756     aSig1 = 0;
3757     if ( expDiff < 0 ) {
3758         if ( expDiff < -1 ) return a;
3759         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3760         expDiff = 0;
3761     }
3762     q = ( bSig <= aSig0 );
3763     if ( q ) aSig0 -= bSig;
3764     expDiff -= 64;
3765     while ( 0 < expDiff ) {
3766         q = estimateDiv128To64( aSig0, aSig1, bSig );
3767         q = ( 2 < q ) ? q - 2 : 0;
3768         mul64To128( bSig, q, &term0, &term1 );
3769         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3770         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3771         expDiff -= 62;
3772     }
3773     expDiff += 64;
3774     if ( 0 < expDiff ) {
3775         q = estimateDiv128To64( aSig0, aSig1, bSig );
3776         q = ( 2 < q ) ? q - 2 : 0;
3777         q >>= 64 - expDiff;
3778         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3779         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3780         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3781         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3782             ++q;
3783             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3784         }
3785     }
3786     else {
3787         term1 = 0;
3788         term0 = bSig;
3789     }
3790     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3791     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3792          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3793               && ( q & 1 ) )
3794        ) {
3795         aSig0 = alternateASig0;
3796         aSig1 = alternateASig1;
3797         zSign = ! zSign;
3798     }
3799     return
3800         normalizeRoundAndPackFloatx80(
3801             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3802
3803 }
3804
3805 /*----------------------------------------------------------------------------
3806 | Returns the square root of the extended double-precision floating-point
3807 | value `a'.  The operation is performed according to the IEC/IEEE Standard
3808 | for Binary Floating-Point Arithmetic.
3809 *----------------------------------------------------------------------------*/
3810
3811 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3812 {
3813     flag aSign;
3814     int32 aExp, zExp;
3815     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3816     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3817     floatx80 z;
3818
3819     aSig0 = extractFloatx80Frac( a );
3820     aExp = extractFloatx80Exp( a );
3821     aSign = extractFloatx80Sign( a );
3822     if ( aExp == 0x7FFF ) {
3823         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3824         if ( ! aSign ) return a;
3825         goto invalid;
3826     }
3827     if ( aSign ) {
3828         if ( ( aExp | aSig0 ) == 0 ) return a;
3829  invalid:
3830         float_raise( float_flag_invalid STATUS_VAR);
3831         z.low = floatx80_default_nan_low;
3832         z.high = floatx80_default_nan_high;
3833         return z;
3834     }
3835     if ( aExp == 0 ) {
3836         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3837         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3838     }
3839     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3840     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3841     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3842     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3843     doubleZSig0 = zSig0<<1;
3844     mul64To128( zSig0, zSig0, &term0, &term1 );
3845     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3846     while ( (sbits64) rem0 < 0 ) {
3847         --zSig0;
3848         doubleZSig0 -= 2;
3849         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3850     }
3851     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3852     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3853         if ( zSig1 == 0 ) zSig1 = 1;
3854         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3855         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3856         mul64To128( zSig1, zSig1, &term2, &term3 );
3857         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3858         while ( (sbits64) rem1 < 0 ) {
3859             --zSig1;
3860             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3861             term3 |= 1;
3862             term2 |= doubleZSig0;
3863             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3864         }
3865         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3866     }
3867     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3868     zSig0 |= doubleZSig0;
3869     return
3870         roundAndPackFloatx80(
3871             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3872
3873 }
3874
3875 /*----------------------------------------------------------------------------
3876 | Returns 1 if the extended double-precision floating-point value `a' is
3877 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3878 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3879 | Arithmetic.
3880 *----------------------------------------------------------------------------*/
3881
3882 flag floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3883 {
3884
3885     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3886               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3887          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3888               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3889        ) {
3890         if (    floatx80_is_signaling_nan( a )
3891              || floatx80_is_signaling_nan( b ) ) {
3892             float_raise( float_flag_invalid STATUS_VAR);
3893         }
3894         return 0;
3895     }
3896     return
3897            ( a.low == b.low )
3898         && (    ( a.high == b.high )
3899              || (    ( a.low == 0 )
3900                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3901            );
3902
3903 }
3904
3905 /*----------------------------------------------------------------------------
3906 | Returns 1 if the extended double-precision floating-point value `a' is
3907 | less than or equal to the corresponding value `b', and 0 otherwise.  The
3908 | comparison is performed according to the IEC/IEEE Standard for Binary
3909 | Floating-Point Arithmetic.
3910 *----------------------------------------------------------------------------*/
3911
3912 flag floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3913 {
3914     flag aSign, bSign;
3915
3916     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3917               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3918          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3919               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3920        ) {
3921         float_raise( float_flag_invalid STATUS_VAR);
3922         return 0;
3923     }
3924     aSign = extractFloatx80Sign( a );
3925     bSign = extractFloatx80Sign( b );
3926     if ( aSign != bSign ) {
3927         return
3928                aSign
3929             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3930                  == 0 );
3931     }
3932     return
3933           aSign ? le128( b.high, b.low, a.high, a.low )
3934         : le128( a.high, a.low, b.high, b.low );
3935
3936 }
3937
3938 /*----------------------------------------------------------------------------
3939 | Returns 1 if the extended double-precision floating-point value `a' is
3940 | less than the corresponding value `b', and 0 otherwise.  The comparison
3941 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3942 | Arithmetic.
3943 *----------------------------------------------------------------------------*/
3944
3945 flag floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
3946 {
3947     flag aSign, bSign;
3948
3949     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3950               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3951          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3952               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3953        ) {
3954         float_raise( float_flag_invalid STATUS_VAR);
3955         return 0;
3956     }
3957     aSign = extractFloatx80Sign( a );
3958     bSign = extractFloatx80Sign( b );
3959     if ( aSign != bSign ) {
3960         return
3961                aSign
3962             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3963                  != 0 );
3964     }
3965     return
3966           aSign ? lt128( b.high, b.low, a.high, a.low )
3967         : lt128( a.high, a.low, b.high, b.low );
3968
3969 }
3970
3971 /*----------------------------------------------------------------------------
3972 | Returns 1 if the extended double-precision floating-point value `a' is equal
3973 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
3974 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3975 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3976 *----------------------------------------------------------------------------*/
3977
3978 flag floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
3979 {
3980
3981     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3982               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3983          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3984               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3985        ) {
3986         float_raise( float_flag_invalid STATUS_VAR);
3987         return 0;
3988     }
3989     return
3990            ( a.low == b.low )
3991         && (    ( a.high == b.high )
3992              || (    ( a.low == 0 )
3993                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3994            );
3995
3996 }
3997
3998 /*----------------------------------------------------------------------------
3999 | Returns 1 if the extended double-precision floating-point value `a' is less
4000 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4001 | do not cause an exception.  Otherwise, the comparison is performed according
4002 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4003 *----------------------------------------------------------------------------*/
4004
4005 flag floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4006 {
4007     flag aSign, bSign;
4008
4009     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4010               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4011          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4012               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4013        ) {
4014         if (    floatx80_is_signaling_nan( a )
4015              || floatx80_is_signaling_nan( b ) ) {
4016             float_raise( float_flag_invalid STATUS_VAR);
4017         }
4018         return 0;
4019     }
4020     aSign = extractFloatx80Sign( a );
4021     bSign = extractFloatx80Sign( b );
4022     if ( aSign != bSign ) {
4023         return
4024                aSign
4025             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4026                  == 0 );
4027     }
4028     return
4029           aSign ? le128( b.high, b.low, a.high, a.low )
4030         : le128( a.high, a.low, b.high, b.low );
4031
4032 }
4033
4034 /*----------------------------------------------------------------------------
4035 | Returns 1 if the extended double-precision floating-point value `a' is less
4036 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4037 | an exception.  Otherwise, the comparison is performed according to the
4038 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4039 *----------------------------------------------------------------------------*/
4040
4041 flag floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4042 {
4043     flag aSign, bSign;
4044
4045     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4046               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4047          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4048               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4049        ) {
4050         if (    floatx80_is_signaling_nan( a )
4051              || floatx80_is_signaling_nan( b ) ) {
4052             float_raise( float_flag_invalid STATUS_VAR);
4053         }
4054         return 0;
4055     }
4056     aSign = extractFloatx80Sign( a );
4057     bSign = extractFloatx80Sign( b );
4058     if ( aSign != bSign ) {
4059         return
4060                aSign
4061             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4062                  != 0 );
4063     }
4064     return
4065           aSign ? lt128( b.high, b.low, a.high, a.low )
4066         : lt128( a.high, a.low, b.high, b.low );
4067
4068 }
4069
4070 #endif
4071
4072 #ifdef FLOAT128
4073
4074 /*----------------------------------------------------------------------------
4075 | Returns the result of converting the quadruple-precision floating-point
4076 | value `a' to the 32-bit two's complement integer format.  The conversion
4077 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4078 | Arithmetic---which means in particular that the conversion is rounded
4079 | according to the current rounding mode.  If `a' is a NaN, the largest
4080 | positive integer is returned.  Otherwise, if the conversion overflows, the
4081 | largest integer with the same sign as `a' is returned.
4082 *----------------------------------------------------------------------------*/
4083
4084 int32 float128_to_int32( float128 a STATUS_PARAM )
4085 {
4086     flag aSign;
4087     int32 aExp, shiftCount;
4088     bits64 aSig0, aSig1;
4089
4090     aSig1 = extractFloat128Frac1( a );
4091     aSig0 = extractFloat128Frac0( a );
4092     aExp = extractFloat128Exp( a );
4093     aSign = extractFloat128Sign( a );
4094     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4095     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4096     aSig0 |= ( aSig1 != 0 );
4097     shiftCount = 0x4028 - aExp;
4098     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4099     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4100
4101 }
4102
4103 /*----------------------------------------------------------------------------
4104 | Returns the result of converting the quadruple-precision floating-point
4105 | value `a' to the 32-bit two's complement integer format.  The conversion
4106 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4107 | Arithmetic, except that the conversion is always rounded toward zero.  If
4108 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4109 | conversion overflows, the largest integer with the same sign as `a' is
4110 | returned.
4111 *----------------------------------------------------------------------------*/
4112
4113 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4114 {
4115     flag aSign;
4116     int32 aExp, shiftCount;
4117     bits64 aSig0, aSig1, savedASig;
4118     int32 z;
4119
4120     aSig1 = extractFloat128Frac1( a );
4121     aSig0 = extractFloat128Frac0( a );
4122     aExp = extractFloat128Exp( a );
4123     aSign = extractFloat128Sign( a );
4124     aSig0 |= ( aSig1 != 0 );
4125     if ( 0x401E < aExp ) {
4126         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4127         goto invalid;
4128     }
4129     else if ( aExp < 0x3FFF ) {
4130         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4131         return 0;
4132     }
4133     aSig0 |= LIT64( 0x0001000000000000 );
4134     shiftCount = 0x402F - aExp;
4135     savedASig = aSig0;
4136     aSig0 >>= shiftCount;
4137     z = aSig0;
4138     if ( aSign ) z = - z;
4139     if ( ( z < 0 ) ^ aSign ) {
4140  invalid:
4141         float_raise( float_flag_invalid STATUS_VAR);
4142         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4143     }
4144     if ( ( aSig0<<shiftCount ) != savedASig ) {
4145         STATUS(float_exception_flags) |= float_flag_inexact;
4146     }
4147     return z;
4148
4149 }
4150
4151 /*----------------------------------------------------------------------------
4152 | Returns the result of converting the quadruple-precision floating-point
4153 | value `a' to the 64-bit two's complement integer format.  The conversion
4154 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4155 | Arithmetic---which means in particular that the conversion is rounded
4156 | according to the current rounding mode.  If `a' is a NaN, the largest
4157 | positive integer is returned.  Otherwise, if the conversion overflows, the
4158 | largest integer with the same sign as `a' is returned.
4159 *----------------------------------------------------------------------------*/
4160
4161 int64 float128_to_int64( float128 a STATUS_PARAM )
4162 {
4163     flag aSign;
4164     int32 aExp, shiftCount;
4165     bits64 aSig0, aSig1;
4166
4167     aSig1 = extractFloat128Frac1( a );
4168     aSig0 = extractFloat128Frac0( a );
4169     aExp = extractFloat128Exp( a );
4170     aSign = extractFloat128Sign( a );
4171     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4172     shiftCount = 0x402F - aExp;
4173     if ( shiftCount <= 0 ) {
4174         if ( 0x403E < aExp ) {
4175             float_raise( float_flag_invalid STATUS_VAR);
4176             if (    ! aSign
4177                  || (    ( aExp == 0x7FFF )
4178                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4179                     )
4180                ) {
4181                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4182             }
4183             return (sbits64) LIT64( 0x8000000000000000 );
4184         }
4185         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4186     }
4187     else {
4188         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4189     }
4190     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4191
4192 }
4193
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.  The inexact flag is set whenever nonzero bits are discarded; the
| invalid exception is raised for NaNs and out-of-range values.
*----------------------------------------------------------------------------*/

int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
{
    flag aSign;
    int32 aExp, shiftCount;
    bits64 aSig0, aSig1;
    int64 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Supply the implicit leading bit when the exponent field is nonzero. */
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
    /* Positive: the integer part extends into the low word `aSig1'.
       Zero or negative: the integer part lies entirely within `aSig0'. */
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            /* Drop the implicit bit again so that `aSig0 | aSig1' below
               tests only the fraction. */
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
            /* The one case accepted here: sign set, exponent 0x403E, high
               fraction zero, and a low fraction below 2^49.  It returns the
               most negative integer, inexactly if `aSig1' is nonzero. */
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
            }
            else {
                float_raise( float_flag_invalid STATUS_VAR);
                /* Positive values and NaNs give the largest positive
                   integer; everything else falls through to the most
                   negative integer. */
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
                }
            }
            return (sbits64) LIT64( 0x8000000000000000 );
        }
        /* Combine the high word with the top `shiftCount' bits of the low
           word. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        /* The remaining low-word bits are the discarded fraction. */
        if ( (bits64) ( aSig1<<shiftCount ) ) {
            STATUS(float_exception_flags) |= float_flag_inexact;
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude below one truncates to zero; inexact unless `a' is
               itself zero. */
            if ( aExp | aSig0 | aSig1 ) {
                STATUS(float_exception_flags) |= float_flag_inexact;
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        /* Inexact if the low word is nonzero or if the right shift above
           dropped nonzero bits of `aSig0'. */
        if (    aSig1
             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            STATUS(float_exception_flags) |= float_flag_inexact;
        }
    }
    if ( aSign ) z = - z;
    return z;

}
4254
4255 /*----------------------------------------------------------------------------
4256 | Returns the result of converting the quadruple-precision floating-point
4257 | value `a' to the single-precision floating-point format.  The conversion
4258 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4259 | Arithmetic.
4260 *----------------------------------------------------------------------------*/
4261
4262 float32 float128_to_float32( float128 a STATUS_PARAM )
4263 {
4264     flag aSign;
4265     int32 aExp;
4266     bits64 aSig0, aSig1;
4267     bits32 zSig;
4268
4269     aSig1 = extractFloat128Frac1( a );
4270     aSig0 = extractFloat128Frac0( a );
4271     aExp = extractFloat128Exp( a );
4272     aSign = extractFloat128Sign( a );
4273     if ( aExp == 0x7FFF ) {
4274         if ( aSig0 | aSig1 ) {
4275             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4276         }
4277         return packFloat32( aSign, 0xFF, 0 );
4278     }
4279     aSig0 |= ( aSig1 != 0 );
4280     shift64RightJamming( aSig0, 18, &aSig0 );
4281     zSig = aSig0;
4282     if ( aExp || zSig ) {
4283         zSig |= 0x40000000;
4284         aExp -= 0x3F81;
4285     }
4286     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4287
4288 }
4289
4290 /*----------------------------------------------------------------------------
4291 | Returns the result of converting the quadruple-precision floating-point
4292 | value `a' to the double-precision floating-point format.  The conversion
4293 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4294 | Arithmetic.
4295 *----------------------------------------------------------------------------*/
4296
4297 float64 float128_to_float64( float128 a STATUS_PARAM )
4298 {
4299     flag aSign;
4300     int32 aExp;
4301     bits64 aSig0, aSig1;
4302
4303     aSig1 = extractFloat128Frac1( a );
4304     aSig0 = extractFloat128Frac0( a );
4305     aExp = extractFloat128Exp( a );
4306     aSign = extractFloat128Sign( a );
4307     if ( aExp == 0x7FFF ) {
4308         if ( aSig0 | aSig1 ) {
4309             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4310         }
4311         return packFloat64( aSign, 0x7FF, 0 );
4312     }
4313     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4314     aSig0 |= ( aSig1 != 0 );
4315     if ( aExp || aSig0 ) {
4316         aSig0 |= LIT64( 0x4000000000000000 );
4317         aExp -= 0x3C01;
4318     }
4319     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4320
4321 }
4322
4323 #ifdef FLOATX80
4324
4325 /*----------------------------------------------------------------------------
4326 | Returns the result of converting the quadruple-precision floating-point
4327 | value `a' to the extended double-precision floating-point format.  The
4328 | conversion is performed according to the IEC/IEEE Standard for Binary
4329 | Floating-Point Arithmetic.
4330 *----------------------------------------------------------------------------*/
4331
4332 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4333 {
4334     flag aSign;
4335     int32 aExp;
4336     bits64 aSig0, aSig1;
4337
4338     aSig1 = extractFloat128Frac1( a );
4339     aSig0 = extractFloat128Frac0( a );
4340     aExp = extractFloat128Exp( a );
4341     aSign = extractFloat128Sign( a );
4342     if ( aExp == 0x7FFF ) {
4343         if ( aSig0 | aSig1 ) {
4344             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4345         }
4346         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4347     }
4348     if ( aExp == 0 ) {
4349         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4350         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4351     }
4352     else {
4353         aSig0 |= LIT64( 0x0001000000000000 );
4354     }
4355     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4356     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4357
4358 }
4359
4360 #endif
4361
/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.  Rounding follows the current rounding mode, and
| the inexact flag is set whenever the result differs from `a'.
*----------------------------------------------------------------------------*/

float128 float128_round_to_int( float128 a STATUS_PARAM )
{
    flag aSign;
    int32 aExp;
    bits64 lastBitMask, roundBitsMask;
    int8 roundingMode;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* Every fraction bit that can be discarded lies in the low word. */
        if ( 0x406F <= aExp ) {
            /* No fractional bits remain: `a' is already an integer, an
               infinity, or a NaN. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN( a, a STATUS_VAR );
            }
            return a;
        }
        /* `lastBitMask' marks the units bit within the low word.  The shift
           is split in two because the total can reach 64; in that case the
           mask becomes 0 and the units bit is bit 0 of the high word. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        roundingMode = STATUS(float_rounding_mode);
        if ( roundingMode == float_round_nearest_even ) {
            if ( lastBitMask ) {
                /* Add one half, then clear the units bit on an exact tie so
                   that the result is even. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* The whole low word is fraction: round up when its top bit
                   is set, and force the high word even on an exact tie. */
                if ( (sbits64) z.low < 0 ) {
                    ++z.high;
                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
        }
        else if ( roundingMode != float_round_to_zero ) {
            /* Directed rounding: increase the magnitude when the sign and
               the rounding direction call for it. */
            if (   extractFloat128Sign( z )
                 ^ ( roundingMode == float_round_up ) ) {
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
            }
        }
        /* Discard the fractional bits. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude below one.  A zero is returned unchanged. */
            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            STATUS(float_exception_flags) |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            switch ( STATUS(float_rounding_mode) ) {
             case float_round_nearest_even:
                /* Exponent 0x3FFE with a nonzero fraction returns the value
                   packed with exponent 0x3FFF; other inputs fall through to
                   the signed zero below. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            /* Rounding toward zero, and nearest-even inputs not handled
               above, give a zero with the sign of `a'. */
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* The units bit lies in the high word, so the result's low word is
           entirely zero. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        roundingMode = STATUS(float_rounding_mode);
        if ( roundingMode == float_round_nearest_even ) {
            /* Add one half; on an exact tie (no fraction bits left in
               either word) clear the units bit to make the result even. */
            z.high += lastBitMask>>1;
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
        }
        else if ( roundingMode != float_round_to_zero ) {
            if (   extractFloat128Sign( z )
                 ^ ( roundingMode == float_round_up ) ) {
                /* Fold a nonzero low word into a sticky bit so that it also
                   forces the increment. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
        }
        /* Discard the fractional bits. */
        z.high &= ~ roundBitsMask;
    }
    /* Any change to the encoding means the rounding was inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return z;

}
4464
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
{
    int32 aExp, bExp, zExp;
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32 expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
            return a;
        }
        /* With a zero exponent field `b' gets no implicit bit and is
           shifted one place less. */
        if ( bExp == 0 ) {
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        /* Align `b' with `a'; zSig2 receives the extra word produced by the
           shift. */
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent; mirror image of the case above. */
        if ( bExp == 0x7FFF ) {
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment is needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN( a, b STATUS_VAR );
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        /* Both exponent fields are zero: the raw sum is packed directly
           with a zero exponent and no rounding. */
        if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
        zSig2 = 0;
        /* The two implicit leading bits add up to the bit one position
           above them, so the sum always takes the one-bit right shift. */
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /* Unequal exponents: add the implicit bit of the larger operand.  That
       bit position in aSig0 is clear in both cases above, so OR-ing it in
       is the same as adding it. */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* No carry out of the leading bit: round with the decremented exponent
       (presumably the convention roundAndPackFloat128 expects; it is defined
       elsewhere). */
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    /* The sum carried one position up: shift it back by one bit. */
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

}
4542
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
{
    int32 aExp, bExp, zExp;
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32 expDiff;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Shift both fractions up by 14 bits.  The implicit leading bit is then
       0x4000000000000000, and the final exponent is reduced by 14 to match. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        /* Both operands are infinities: raise invalid and return the
           default NaN. */
        float_raise( float_flag_invalid STATUS_VAR);
        z.low = float128_default_nan_low;
        z.high = float128_default_nan_high;
        return z;
    }
    /* Operands with a zero exponent field are subtracted as if their
       exponent were 1. */
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    /* Compare the significands to decide which operand is larger. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Equal magnitudes: the result is zero, negative only when rounding
       down. */
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        /* Finite minus infinity: infinity with the opposite sign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    /* With a zero exponent field `a' gets no implicit bit and is shifted
       one place less. */
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    /* `b' is larger: compute b - a and flip the result sign. */
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        return a;
    }
    /* With a zero exponent field `b' gets no implicit bit and is shifted
       one place less. */
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    /* `a' is larger: compute a - b and keep the result sign. */
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* The `- 14' presumably compensates for the 14-bit left shift applied
       to the fractions on entry; confirm against
       normalizeRoundAndPackFloat128, which is defined elsewhere. */
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );

}
4626
4627 /*----------------------------------------------------------------------------
4628 | Returns the result of adding the quadruple-precision floating-point values
4629 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4630 | for Binary Floating-Point Arithmetic.
4631 *----------------------------------------------------------------------------*/
4632
4633 float128 float128_add( float128 a, float128 b STATUS_PARAM )
4634 {
4635     flag aSign, bSign;
4636
4637     aSign = extractFloat128Sign( a );
4638     bSign = extractFloat128Sign( b );
4639     if ( aSign == bSign ) {
4640         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4641     }
4642     else {
4643         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4644     }
4645
4646 }
4647
4648 /*----------------------------------------------------------------------------
4649 | Returns the result of subtracting the quadruple-precision floating-point
4650 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4651 | Standard for Binary Floating-Point Arithmetic.
4652 *----------------------------------------------------------------------------*/
4653
4654 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4655 {
4656     flag aSign, bSign;
4657
4658     aSign = extractFloat128Sign( a );
4659     bSign = extractFloat128Sign( b );
4660     if ( aSign == bSign ) {
4661         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4662     }
4663     else {
4664         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4665     }
4666
4667 }
4668
4669 /*----------------------------------------------------------------------------
4670 | Returns the result of multiplying the quadruple-precision floating-point
4671 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4672 | Standard for Binary Floating-Point Arithmetic.
4673 *----------------------------------------------------------------------------*/
4674
4675 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4676 {
4677     flag aSign, bSign, zSign;
4678     int32 aExp, bExp, zExp;
4679     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4680     float128 z;
4681
4682     aSig1 = extractFloat128Frac1( a );
4683     aSig0 = extractFloat128Frac0( a );
4684     aExp = extractFloat128Exp( a );
4685     aSign = extractFloat128Sign( a );
4686     bSig1 = extractFloat128Frac1( b );
4687     bSig0 = extractFloat128Frac0( b );
4688     bExp = extractFloat128Exp( b );
4689     bSign = extractFloat128Sign( b );
4690     zSign = aSign ^ bSign;
4691     if ( aExp == 0x7FFF ) {
4692         if (    ( aSig0 | aSig1 )
4693              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4694             return propagateFloat128NaN( a, b STATUS_VAR );
4695         }
4696         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4697         return packFloat128( zSign, 0x7FFF, 0, 0 );
4698     }
4699     if ( bExp == 0x7FFF ) {
4700         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4701         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4702  invalid:
4703             float_raise( float_flag_invalid STATUS_VAR);
4704             z.low = float128_default_nan_low;
4705             z.high = float128_default_nan_high;
4706             return z;
4707         }
4708         return packFloat128( zSign, 0x7FFF, 0, 0 );
4709     }
4710     if ( aExp == 0 ) {
4711         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4712         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4713     }
4714     if ( bExp == 0 ) {
4715         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4716         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4717     }
4718     zExp = aExp + bExp - 0x4000;
4719     aSig0 |= LIT64( 0x0001000000000000 );
4720     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4721     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4722     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4723     zSig2 |= ( zSig3 != 0 );
4724     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4725         shift128ExtraRightJamming(
4726             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4727         ++zExp;
4728     }
4729     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4730
4731 }
4732
4733 /*----------------------------------------------------------------------------
4734 | Returns the result of dividing the quadruple-precision floating-point value
4735 | `a' by the corresponding value `b'.  The operation is performed according to
4736 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4737 *----------------------------------------------------------------------------*/
4738
4739 float128 float128_div( float128 a, float128 b STATUS_PARAM )
4740 {
4741     flag aSign, bSign, zSign;
4742     int32 aExp, bExp, zExp;
4743     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4744     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4745     float128 z;
4746
4747     aSig1 = extractFloat128Frac1( a );
4748     aSig0 = extractFloat128Frac0( a );
4749     aExp = extractFloat128Exp( a );
4750     aSign = extractFloat128Sign( a );
4751     bSig1 = extractFloat128Frac1( b );
4752     bSig0 = extractFloat128Frac0( b );
4753     bExp = extractFloat128Exp( b );
4754     bSign = extractFloat128Sign( b );
4755     zSign = aSign ^ bSign;
4756     if ( aExp == 0x7FFF ) {
4757         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4758         if ( bExp == 0x7FFF ) {
4759             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4760             goto invalid;
4761         }
4762         return packFloat128( zSign, 0x7FFF, 0, 0 );
4763     }
4764     if ( bExp == 0x7FFF ) {
4765         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4766         return packFloat128( zSign, 0, 0, 0 );
4767     }
4768     if ( bExp == 0 ) {
4769         if ( ( bSig0 | bSig1 ) == 0 ) {
4770             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4771  invalid:
4772                 float_raise( float_flag_invalid STATUS_VAR);
4773                 z.low = float128_default_nan_low;
4774                 z.high = float128_default_nan_high;
4775                 return z;
4776             }
4777             float_raise( float_flag_divbyzero STATUS_VAR);
4778             return packFloat128( zSign, 0x7FFF, 0, 0 );
4779         }
4780         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4781     }
4782     if ( aExp == 0 ) {
4783         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4784         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4785     }
4786     zExp = aExp - bExp + 0x3FFD;
4787     shortShift128Left(
4788         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4789     shortShift128Left(
4790         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4791     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4792         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4793         ++zExp;
4794     }
4795     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4796     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4797     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4798     while ( (sbits64) rem0 < 0 ) {
4799         --zSig0;
4800         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4801     }
4802     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4803     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4804         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4805         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4806         while ( (sbits64) rem1 < 0 ) {
4807             --zSig1;
4808             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4809         }
4810         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4811     }
4812     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4813     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4814
4815 }
4816
4817 /*----------------------------------------------------------------------------
4818 | Returns the remainder of the quadruple-precision floating-point value `a'
4819 | with respect to the corresponding value `b'.  The operation is performed
4820 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4821 *----------------------------------------------------------------------------*/
4822
4823 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4824 {
4825     flag aSign, bSign, zSign;
4826     int32 aExp, bExp, expDiff;
4827     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4828     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4829     sbits64 sigMean0;
4830     float128 z;
4831
4832     aSig1 = extractFloat128Frac1( a );
4833     aSig0 = extractFloat128Frac0( a );
4834     aExp = extractFloat128Exp( a );
4835     aSign = extractFloat128Sign( a );
4836     bSig1 = extractFloat128Frac1( b );
4837     bSig0 = extractFloat128Frac0( b );
4838     bExp = extractFloat128Exp( b );
4839     bSign = extractFloat128Sign( b );
4840     if ( aExp == 0x7FFF ) {
4841         if (    ( aSig0 | aSig1 )
4842              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4843             return propagateFloat128NaN( a, b STATUS_VAR );
4844         }
4845         goto invalid;
4846     }
4847     if ( bExp == 0x7FFF ) {
4848         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4849         return a;
4850     }
4851     if ( bExp == 0 ) {
4852         if ( ( bSig0 | bSig1 ) == 0 ) {
4853  invalid:
4854             float_raise( float_flag_invalid STATUS_VAR);
4855             z.low = float128_default_nan_low;
4856             z.high = float128_default_nan_high;
4857             return z;
4858         }
4859         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4860     }
4861     if ( aExp == 0 ) {
4862         if ( ( aSig0 | aSig1 ) == 0 ) return a;
4863         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4864     }
4865     expDiff = aExp - bExp;
4866     if ( expDiff < -1 ) return a;
4867     shortShift128Left(
4868         aSig0 | LIT64( 0x0001000000000000 ),
4869         aSig1,
4870         15 - ( expDiff < 0 ),
4871         &aSig0,
4872         &aSig1
4873     );
4874     shortShift128Left(
4875         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4876     q = le128( bSig0, bSig1, aSig0, aSig1 );
4877     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4878     expDiff -= 64;
4879     while ( 0 < expDiff ) {
4880         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4881         q = ( 4 < q ) ? q - 4 : 0;
4882         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4883         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4884         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4885         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4886         expDiff -= 61;
4887     }
4888     if ( -64 < expDiff ) {
4889         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4890         q = ( 4 < q ) ? q - 4 : 0;
4891         q >>= - expDiff;
4892         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4893         expDiff += 52;
4894         if ( expDiff < 0 ) {
4895             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4896         }
4897         else {
4898             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4899         }
4900         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4901         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4902     }
4903     else {
4904         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4905         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4906     }
4907     do {
4908         alternateASig0 = aSig0;
4909         alternateASig1 = aSig1;
4910         ++q;
4911         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4912     } while ( 0 <= (sbits64) aSig0 );
4913     add128(
4914         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4915     if (    ( sigMean0 < 0 )
4916          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4917         aSig0 = alternateASig0;
4918         aSig1 = alternateASig1;
4919     }
4920     zSign = ( (sbits64) aSig0 < 0 );
4921     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4922     return
4923         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
4924
4925 }
4926
4927 /*----------------------------------------------------------------------------
4928 | Returns the square root of the quadruple-precision floating-point value `a'.
4929 | The operation is performed according to the IEC/IEEE Standard for Binary
4930 | Floating-Point Arithmetic.
4931 *----------------------------------------------------------------------------*/
4932
4933 float128 float128_sqrt( float128 a STATUS_PARAM )
4934 {
4935     flag aSign;
4936     int32 aExp, zExp;
4937     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4938     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4939     float128 z;
4940
4941     aSig1 = extractFloat128Frac1( a );
4942     aSig0 = extractFloat128Frac0( a );
4943     aExp = extractFloat128Exp( a );
4944     aSign = extractFloat128Sign( a );
4945     if ( aExp == 0x7FFF ) {
4946         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
4947         if ( ! aSign ) return a;
4948         goto invalid;
4949     }
4950     if ( aSign ) {
4951         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4952  invalid:
4953         float_raise( float_flag_invalid STATUS_VAR);
4954         z.low = float128_default_nan_low;
4955         z.high = float128_default_nan_high;
4956         return z;
4957     }
4958     if ( aExp == 0 ) {
4959         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4960         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4961     }
4962     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4963     aSig0 |= LIT64( 0x0001000000000000 );
4964     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4965     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4966     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4967     doubleZSig0 = zSig0<<1;
4968     mul64To128( zSig0, zSig0, &term0, &term1 );
4969     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4970     while ( (sbits64) rem0 < 0 ) {
4971         --zSig0;
4972         doubleZSig0 -= 2;
4973         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4974     }
4975     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4976     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4977         if ( zSig1 == 0 ) zSig1 = 1;
4978         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4979         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4980         mul64To128( zSig1, zSig1, &term2, &term3 );
4981         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4982         while ( (sbits64) rem1 < 0 ) {
4983             --zSig1;
4984             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4985             term3 |= 1;
4986             term2 |= doubleZSig0;
4987             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4988         }
4989         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4990     }
4991     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
4992     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4993
4994 }
4995
4996 /*----------------------------------------------------------------------------
4997 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
4998 | the corresponding value `b', and 0 otherwise.  The comparison is performed
4999 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5000 *----------------------------------------------------------------------------*/
5001
5002 flag float128_eq( float128 a, float128 b STATUS_PARAM )
5003 {
5004
5005     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5006               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5007          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5008               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5009        ) {
5010         if (    float128_is_signaling_nan( a )
5011              || float128_is_signaling_nan( b ) ) {
5012             float_raise( float_flag_invalid STATUS_VAR);
5013         }
5014         return 0;
5015     }
5016     return
5017            ( a.low == b.low )
5018         && (    ( a.high == b.high )
5019              || (    ( a.low == 0 )
5020                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5021            );
5022
5023 }
5024
5025 /*----------------------------------------------------------------------------
5026 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5027 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5028 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5029 | Arithmetic.
5030 *----------------------------------------------------------------------------*/
5031
5032 flag float128_le( float128 a, float128 b STATUS_PARAM )
5033 {
5034     flag aSign, bSign;
5035
5036     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5037               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5038          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5039               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5040        ) {
5041         float_raise( float_flag_invalid STATUS_VAR);
5042         return 0;
5043     }
5044     aSign = extractFloat128Sign( a );
5045     bSign = extractFloat128Sign( b );
5046     if ( aSign != bSign ) {
5047         return
5048                aSign
5049             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5050                  == 0 );
5051     }
5052     return
5053           aSign ? le128( b.high, b.low, a.high, a.low )
5054         : le128( a.high, a.low, b.high, b.low );
5055
5056 }
5057
5058 /*----------------------------------------------------------------------------
5059 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5060 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5061 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5062 *----------------------------------------------------------------------------*/
5063
5064 flag float128_lt( float128 a, float128 b STATUS_PARAM )
5065 {
5066     flag aSign, bSign;
5067
5068     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5069               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5070          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5071               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5072        ) {
5073         float_raise( float_flag_invalid STATUS_VAR);
5074         return 0;
5075     }
5076     aSign = extractFloat128Sign( a );
5077     bSign = extractFloat128Sign( b );
5078     if ( aSign != bSign ) {
5079         return
5080                aSign
5081             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5082                  != 0 );
5083     }
5084     return
5085           aSign ? lt128( b.high, b.low, a.high, a.low )
5086         : lt128( a.high, a.low, b.high, b.low );
5087
5088 }
5089
5090 /*----------------------------------------------------------------------------
5091 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5092 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5093 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5094 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5095 *----------------------------------------------------------------------------*/
5096
5097 flag float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5098 {
5099
5100     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5101               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5102          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5103               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5104        ) {
5105         float_raise( float_flag_invalid STATUS_VAR);
5106         return 0;
5107     }
5108     return
5109            ( a.low == b.low )
5110         && (    ( a.high == b.high )
5111              || (    ( a.low == 0 )
5112                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5113            );
5114
5115 }
5116
5117 /*----------------------------------------------------------------------------
5118 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5119 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5120 | cause an exception.  Otherwise, the comparison is performed according to the
5121 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5122 *----------------------------------------------------------------------------*/
5123
5124 flag float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5125 {
5126     flag aSign, bSign;
5127
5128     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5129               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5130          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5131               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5132        ) {
5133         if (    float128_is_signaling_nan( a )
5134              || float128_is_signaling_nan( b ) ) {
5135             float_raise( float_flag_invalid STATUS_VAR);
5136         }
5137         return 0;
5138     }
5139     aSign = extractFloat128Sign( a );
5140     bSign = extractFloat128Sign( b );
5141     if ( aSign != bSign ) {
5142         return
5143                aSign
5144             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5145                  == 0 );
5146     }
5147     return
5148           aSign ? le128( b.high, b.low, a.high, a.low )
5149         : le128( a.high, a.low, b.high, b.low );
5150
5151 }
5152
5153 /*----------------------------------------------------------------------------
5154 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5155 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5156 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5157 | Standard for Binary Floating-Point Arithmetic.
5158 *----------------------------------------------------------------------------*/
5159
5160 flag float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5161 {
5162     flag aSign, bSign;
5163
5164     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5165               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5166          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5167               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5168        ) {
5169         if (    float128_is_signaling_nan( a )
5170              || float128_is_signaling_nan( b ) ) {
5171             float_raise( float_flag_invalid STATUS_VAR);
5172         }
5173         return 0;
5174     }
5175     aSign = extractFloat128Sign( a );
5176     bSign = extractFloat128Sign( b );
5177     if ( aSign != bSign ) {
5178         return
5179                aSign
5180             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5181                  != 0 );
5182     }
5183     return
5184           aSign ? lt128( b.high, b.low, a.high, a.low )
5185         : lt128( a.high, a.low, b.high, b.low );
5186
5187 }
5188
5189 #endif
5190
5191 /* misc functions */
5192 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5193 {
5194     return int64_to_float32(a STATUS_VAR);
5195 }
5196
5197 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5198 {
5199     return int64_to_float64(a STATUS_VAR);
5200 }
5201
5202 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5203 {
5204     int64_t v;
5205     unsigned int res;
5206
5207     v = float32_to_int64(a STATUS_VAR);
5208     if (v < 0) {
5209         res = 0;
5210         float_raise( float_flag_invalid STATUS_VAR);
5211     } else if (v > 0xffffffff) {
5212         res = 0xffffffff;
5213         float_raise( float_flag_invalid STATUS_VAR);
5214     } else {
5215         res = v;
5216     }
5217     return res;
5218 }
5219
5220 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5221 {
5222     int64_t v;
5223     unsigned int res;
5224
5225     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5226     if (v < 0) {
5227         res = 0;
5228         float_raise( float_flag_invalid STATUS_VAR);
5229     } else if (v > 0xffffffff) {
5230         res = 0xffffffff;
5231         float_raise( float_flag_invalid STATUS_VAR);
5232     } else {
5233         res = v;
5234     }
5235     return res;
5236 }
5237
5238 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5239 {
5240     int64_t v;
5241     unsigned int res;
5242
5243     v = float64_to_int64(a STATUS_VAR);
5244     if (v < 0) {
5245         res = 0;
5246         float_raise( float_flag_invalid STATUS_VAR);
5247     } else if (v > 0xffffffff) {
5248         res = 0xffffffff;
5249         float_raise( float_flag_invalid STATUS_VAR);
5250     } else {
5251         res = v;
5252     }
5253     return res;
5254 }
5255
5256 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5257 {
5258     int64_t v;
5259     unsigned int res;
5260
5261     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5262     if (v < 0) {
5263         res = 0;
5264         float_raise( float_flag_invalid STATUS_VAR);
5265     } else if (v > 0xffffffff) {
5266         res = 0xffffffff;
5267         float_raise( float_flag_invalid STATUS_VAR);
5268     } else {
5269         res = v;
5270     }
5271     return res;
5272 }
5273
5274 #define COMPARE(s, nan_exp)                                                  \
5275 INLINE char float ## s ## _compare_internal( float ## s a, float ## s b,     \
5276                                       int is_quiet STATUS_PARAM )            \
5277 {                                                                            \
5278     flag aSign, bSign;                                                       \
5279                                                                              \
5280     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5281          extractFloat ## s ## Frac( a ) ) ||                                 \
5282         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5283           extractFloat ## s ## Frac( b ) )) {                                \
5284         if (!is_quiet ||                                                     \
5285             float ## s ## _is_signaling_nan( a ) ||                          \
5286             float ## s ## _is_signaling_nan( b ) ) {                         \
5287             float_raise( float_flag_invalid STATUS_VAR);                     \
5288         }                                                                    \
5289         return float_relation_unordered;                                     \
5290     }                                                                        \
5291     aSign = extractFloat ## s ## Sign( a );                                  \
5292     bSign = extractFloat ## s ## Sign( b );                                  \
5293     if ( aSign != bSign ) {                                                  \
5294         if ( (bits ## s) ( ( a | b )<<1 ) == 0 ) {                           \
5295             /* zero case */                                                  \
5296             return float_relation_equal;                                     \
5297         } else {                                                             \
5298             return 1 - (2 * aSign);                                          \
5299         }                                                                    \
5300     } else {                                                                 \
5301         if (a == b) {                                                        \
5302             return float_relation_equal;                                     \
5303         } else {                                                             \
5304             return 1 - 2 * (aSign ^ ( a < b ));                              \
5305         }                                                                    \
5306     }                                                                        \
5307 }                                                                            \
5308                                                                              \
5309 char float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )       \
5310 {                                                                            \
5311     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5312 }                                                                            \
5313                                                                              \
5314 char float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5315 {                                                                            \
5316     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5317 }
5318
5319 COMPARE(32, 0xff)
5320 COMPARE(64, 0x7ff)