fpu/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 #include "softfloat.h"
  34
  35 /*----------------------------------------------------------------------------
  36 | Primitive arithmetic functions, including multi-word arithmetic, and
  37 | division and square root approximations.  (Can be specialized to target if
  38 | desired.)
  39 *----------------------------------------------------------------------------*/
  40 #include "softfloat-macros.h"
  41
  42 /*----------------------------------------------------------------------------
  43 | Functions and definitions to determine:  (1) whether tininess for underflow
  44 | is detected before or after rounding by default, (2) what (if anything)
  45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  47 | are propagated from function inputs to output.  These details are target-
  48 | specific.
  49 *----------------------------------------------------------------------------*/
  50 #include "softfloat-specialize.h"
  51
  52 void set_float_rounding_mode(int val STATUS_PARAM)
  53 {
  54     STATUS(float_rounding_mode) = val;
  55 }
  56
  57 void set_float_exception_flags(int val STATUS_PARAM)
  58 {
  59     STATUS(float_exception_flags) = val;
  60 }
  61
  62 #ifdef FLOATX80
  63 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  64 {
  65     STATUS(floatx80_rounding_precision) = val;
  66 }
  67 #endif
  68
  69 /*----------------------------------------------------------------------------
  70 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  71 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  72 | input.  If `zSign' is 1, the input is negated before being converted to an
  73 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  74 | is simply rounded to an integer, with the inexact exception raised if the
  75 | input cannot be represented exactly as an integer.  However, if the fixed-
  76 | point input is too large, the invalid exception is raised and the largest
  77 | positive or negative integer is returned.
  78 *----------------------------------------------------------------------------*/
  79
  80 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
  81 {
  82     int8 roundingMode;
  83     flag roundNearestEven;
  84     int8 roundIncrement, roundBits;
  85     int32 z;
  86
  87     roundingMode = STATUS(float_rounding_mode);
  88     roundNearestEven = ( roundingMode == float_round_nearest_even );
  89     roundIncrement = 0x40;
  90     if ( ! roundNearestEven ) {
  91         if ( roundingMode == float_round_to_zero ) {
  92             roundIncrement = 0;
  93         }
  94         else {
  95             roundIncrement = 0x7F;
  96             if ( zSign ) {
  97                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  98             }
  99             else {
 100                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 101             }
 102         }
 103     }
 104     roundBits = absZ & 0x7F;
 105     absZ = ( absZ + roundIncrement )>>7;
 106     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 107     z = absZ;
 108     if ( zSign ) z = - z;
 109     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 110         float_raise( float_flag_invalid STATUS_VAR);
 111         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 112     }
 113     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 114     return z;
 115
 116 }
 117
 118 /*----------------------------------------------------------------------------
 119 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 120 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 121 | and returns the properly rounded 64-bit integer corresponding to the input.
 122 | If `zSign' is 1, the input is negated before being converted to an integer.
 123 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 124 | the inexact exception raised if the input cannot be represented exactly as
 125 | an integer.  However, if the fixed-point input is too large, the invalid
 126 | exception is raised and the largest positive or negative integer is
 127 | returned.
 128 *----------------------------------------------------------------------------*/
 129
 130 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 131 {
 132     int8 roundingMode;
 133     flag roundNearestEven, increment;
 134     int64 z;
 135
 136     roundingMode = STATUS(float_rounding_mode);
 137     roundNearestEven = ( roundingMode == float_round_nearest_even );
 138     increment = ( (sbits64) absZ1 < 0 );
 139     if ( ! roundNearestEven ) {
 140         if ( roundingMode == float_round_to_zero ) {
 141             increment = 0;
 142         }
 143         else {
 144             if ( zSign ) {
 145                 increment = ( roundingMode == float_round_down ) && absZ1;
 146             }
 147             else {
 148                 increment = ( roundingMode == float_round_up ) && absZ1;
 149             }
 150         }
 151     }
 152     if ( increment ) {
 153         ++absZ0;
 154         if ( absZ0 == 0 ) goto overflow;
 155         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 156     }
 157     z = absZ0;
 158     if ( zSign ) z = - z;
 159     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 160  overflow:
 161         float_raise( float_flag_invalid STATUS_VAR);
 162         return
 163               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 164             : LIT64( 0x7FFFFFFFFFFFFFFF );
 165     }
 166     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 167     return z;
 168
 169 }
 170
 171 /*----------------------------------------------------------------------------
 172 | Returns the fraction bits of the single-precision floating-point value `a'.
 173 *----------------------------------------------------------------------------*/
 174
 175 INLINE bits32 extractFloat32Frac( float32 a )
 176 {
 177
 178     return float32_val(a) & 0x007FFFFF;
 179
 180 }
 181
 182 /*----------------------------------------------------------------------------
 183 | Returns the exponent bits of the single-precision floating-point value `a'.
 184 *----------------------------------------------------------------------------*/
 185
 186 INLINE int16 extractFloat32Exp( float32 a )
 187 {
 188
 189     return ( float32_val(a)>>23 ) & 0xFF;
 190
 191 }
 192
 193 /*----------------------------------------------------------------------------
 194 | Returns the sign bit of the single-precision floating-point value `a'.
 195 *----------------------------------------------------------------------------*/
 196
 197 INLINE flag extractFloat32Sign( float32 a )
 198 {
 199
 200     return float32_val(a)>>31;
 201
 202 }
 203
 204 /*----------------------------------------------------------------------------
 205 | Normalizes the subnormal single-precision floating-point value represented
 206 | by the denormalized significand `aSig'.  The normalized exponent and
 207 | significand are stored at the locations pointed to by `zExpPtr' and
 208 | `zSigPtr', respectively.
 209 *----------------------------------------------------------------------------*/
 210
 211 static void
 212  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 213 {
 214     int8 shiftCount;
 215
 216     shiftCount = countLeadingZeros32( aSig ) - 8;
 217     *zSigPtr = aSig<<shiftCount;
 218     *zExpPtr = 1 - shiftCount;
 219
 220 }
 221
 222 /*----------------------------------------------------------------------------
 223 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 224 | single-precision floating-point value, returning the result.  After being
 225 | shifted into the proper positions, the three fields are simply added
 226 | together to form the result.  This means that any integer portion of `zSig'
 227 | will be added into the exponent.  Since a properly normalized significand
 228 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 229 | than the desired result exponent whenever `zSig' is a complete, normalized
 230 | significand.
 231 *----------------------------------------------------------------------------*/
 232
 233 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 234 {
 235
 236     return make_float32(
 237           ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
 238
 239 }
 240
 241 /*----------------------------------------------------------------------------
 242 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 243 | and significand `zSig', and returns the proper single-precision floating-
 244 | point value corresponding to the abstract input.  Ordinarily, the abstract
 245 | value is simply rounded and packed into the single-precision format, with
 246 | the inexact exception raised if the abstract input cannot be represented
 247 | exactly.  However, if the abstract value is too large, the overflow and
 248 | inexact exceptions are raised and an infinity or maximal finite value is
 249 | returned.  If the abstract value is too small, the input value is rounded to
 250 | a subnormal number, and the underflow and inexact exceptions are raised if
 251 | the abstract input cannot be represented exactly as a subnormal single-
 252 | precision floating-point number.
 253 |     The input significand `zSig' has its binary point between bits 30
 254 | and 29, which is 7 bits to the left of the usual location.  This shifted
 255 | significand must be normalized or smaller.  If `zSig' is not normalized,
 256 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 257 | and it must not require rounding.  In the usual case that `zSig' is
 258 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 259 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 260 | Binary Floating-Point Arithmetic.
 261 *----------------------------------------------------------------------------*/
 262
 263 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 264 {
 265     int8 roundingMode;
 266     flag roundNearestEven;
 267     int8 roundIncrement, roundBits;
 268     flag isTiny;
 269
 270     roundingMode = STATUS(float_rounding_mode);
 271     roundNearestEven = ( roundingMode == float_round_nearest_even );
 272     roundIncrement = 0x40;
 273     if ( ! roundNearestEven ) {
 274         if ( roundingMode == float_round_to_zero ) {
 275             roundIncrement = 0;
 276         }
 277         else {
 278             roundIncrement = 0x7F;
 279             if ( zSign ) {
 280                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 281             }
 282             else {
 283                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 284             }
 285         }
 286     }
 287     roundBits = zSig & 0x7F;
 288     if ( 0xFD <= (bits16) zExp ) {
 289         if (    ( 0xFD < zExp )
 290              || (    ( zExp == 0xFD )
 291                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 292            ) {
 293             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 294             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 295         }
 296         if ( zExp < 0 ) {
 297             isTiny =
 298                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 299                 || ( zExp < -1 )
 300                 || ( zSig + roundIncrement < 0x80000000 );
 301             shift32RightJamming( zSig, - zExp, &zSig );
 302             zExp = 0;
 303             roundBits = zSig & 0x7F;
 304             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 305         }
 306     }
 307     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 308     zSig = ( zSig + roundIncrement )>>7;
 309     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 310     if ( zSig == 0 ) zExp = 0;
 311     return packFloat32( zSign, zExp, zSig );
 312
 313 }
 314
 315 /*----------------------------------------------------------------------------
 316 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 317 | and significand `zSig', and returns the proper single-precision floating-
 318 | point value corresponding to the abstract input.  This routine is just like
 319 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 320 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 321 | floating-point exponent.
 322 *----------------------------------------------------------------------------*/
 323
 324 static float32
 325  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 326 {
 327     int8 shiftCount;
 328
 329     shiftCount = countLeadingZeros32( zSig ) - 1;
 330     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 331
 332 }
 333
 334 /*----------------------------------------------------------------------------
 335 | Returns the fraction bits of the double-precision floating-point value `a'.
 336 *----------------------------------------------------------------------------*/
 337
 338 INLINE bits64 extractFloat64Frac( float64 a )
 339 {
 340
 341     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 342
 343 }
 344
 345 /*----------------------------------------------------------------------------
 346 | Returns the exponent bits of the double-precision floating-point value `a'.
 347 *----------------------------------------------------------------------------*/
 348
 349 INLINE int16 extractFloat64Exp( float64 a )
 350 {
 351
 352     return ( float64_val(a)>>52 ) & 0x7FF;
 353
 354 }
 355
 356 /*----------------------------------------------------------------------------
 357 | Returns the sign bit of the double-precision floating-point value `a'.
 358 *----------------------------------------------------------------------------*/
 359
 360 INLINE flag extractFloat64Sign( float64 a )
 361 {
 362
 363     return float64_val(a)>>63;
 364
 365 }
 366
 367 /*----------------------------------------------------------------------------
 368 | Normalizes the subnormal double-precision floating-point value represented
 369 | by the denormalized significand `aSig'.  The normalized exponent and
 370 | significand are stored at the locations pointed to by `zExpPtr' and
 371 | `zSigPtr', respectively.
 372 *----------------------------------------------------------------------------*/
 373
 374 static void
 375  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 376 {
 377     int8 shiftCount;
 378
 379     shiftCount = countLeadingZeros64( aSig ) - 11;
 380     *zSigPtr = aSig<<shiftCount;
 381     *zExpPtr = 1 - shiftCount;
 382
 383 }
 384
 385 /*----------------------------------------------------------------------------
 386 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 387 | double-precision floating-point value, returning the result.  After being
 388 | shifted into the proper positions, the three fields are simply added
 389 | together to form the result.  This means that any integer portion of `zSig'
 390 | will be added into the exponent.  Since a properly normalized significand
 391 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 392 | than the desired result exponent whenever `zSig' is a complete, normalized
 393 | significand.
 394 *----------------------------------------------------------------------------*/
 395
 396 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 397 {
 398
 399     return make_float64(
 400         ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
 401
 402 }
 403
 404 /*----------------------------------------------------------------------------
 405 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 406 | and significand `zSig', and returns the proper double-precision floating-
 407 | point value corresponding to the abstract input.  Ordinarily, the abstract
 408 | value is simply rounded and packed into the double-precision format, with
 409 | the inexact exception raised if the abstract input cannot be represented
 410 | exactly.  However, if the abstract value is too large, the overflow and
 411 | inexact exceptions are raised and an infinity or maximal finite value is
 412 | returned.  If the abstract value is too small, the input value is rounded
 413 | to a subnormal number, and the underflow and inexact exceptions are raised
 414 | if the abstract input cannot be represented exactly as a subnormal double-
 415 | precision floating-point number.
 416 |     The input significand `zSig' has its binary point between bits 62
 417 | and 61, which is 10 bits to the left of the usual location.  This shifted
 418 | significand must be normalized or smaller.  If `zSig' is not normalized,
 419 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 420 | and it must not require rounding.  In the usual case that `zSig' is
 421 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 422 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 423 | Binary Floating-Point Arithmetic.
 424 *----------------------------------------------------------------------------*/
 425
 426 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 427 {
 428     int8 roundingMode;
 429     flag roundNearestEven;
 430     int16 roundIncrement, roundBits;
 431     flag isTiny;
 432
 433     roundingMode = STATUS(float_rounding_mode);
 434     roundNearestEven = ( roundingMode == float_round_nearest_even );
 435     roundIncrement = 0x200;
 436     if ( ! roundNearestEven ) {
 437         if ( roundingMode == float_round_to_zero ) {
 438             roundIncrement = 0;
 439         }
 440         else {
 441             roundIncrement = 0x3FF;
 442             if ( zSign ) {
 443                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 444             }
 445             else {
 446                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 447             }
 448         }
 449     }
 450     roundBits = zSig & 0x3FF;
 451     if ( 0x7FD <= (bits16) zExp ) {
 452         if (    ( 0x7FD < zExp )
 453              || (    ( zExp == 0x7FD )
 454                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 455            ) {
 456             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 457             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 458         }
 459         if ( zExp < 0 ) {
 460             isTiny =
 461                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 462                 || ( zExp < -1 )
 463                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 464             shift64RightJamming( zSig, - zExp, &zSig );
 465             zExp = 0;
 466             roundBits = zSig & 0x3FF;
 467             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 468         }
 469     }
 470     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 471     zSig = ( zSig + roundIncrement )>>10;
 472     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 473     if ( zSig == 0 ) zExp = 0;
 474     return packFloat64( zSign, zExp, zSig );
 475
 476 }
 477
 478 /*----------------------------------------------------------------------------
 479 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 480 | and significand `zSig', and returns the proper double-precision floating-
 481 | point value corresponding to the abstract input.  This routine is just like
 482 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 483 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 484 | floating-point exponent.
 485 *----------------------------------------------------------------------------*/
 486
 487 static float64
 488  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 489 {
 490     int8 shiftCount;
 491
 492     shiftCount = countLeadingZeros64( zSig ) - 1;
 493     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 494
 495 }
 496
 497 #ifdef FLOATX80
 498
 499 /*----------------------------------------------------------------------------
 500 | Returns the fraction bits of the extended double-precision floating-point
 501 | value `a'.
 502 *----------------------------------------------------------------------------*/
 503
 504 INLINE bits64 extractFloatx80Frac( floatx80 a )
 505 {
 506
 507     return a.low;
 508
 509 }
 510
 511 /*----------------------------------------------------------------------------
 512 | Returns the exponent bits of the extended double-precision floating-point
 513 | value `a'.
 514 *----------------------------------------------------------------------------*/
 515
 516 INLINE int32 extractFloatx80Exp( floatx80 a )
 517 {
 518
 519     return a.high & 0x7FFF;
 520
 521 }
 522
 523 /*----------------------------------------------------------------------------
 524 | Returns the sign bit of the extended double-precision floating-point value
 525 | `a'.
 526 *----------------------------------------------------------------------------*/
 527
 528 INLINE flag extractFloatx80Sign( floatx80 a )
 529 {
 530
 531     return a.high>>15;
 532
 533 }
 534
 535 /*----------------------------------------------------------------------------
 536 | Normalizes the subnormal extended double-precision floating-point value
 537 | represented by the denormalized significand `aSig'.  The normalized exponent
 538 | and significand are stored at the locations pointed to by `zExpPtr' and
 539 | `zSigPtr', respectively.
 540 *----------------------------------------------------------------------------*/
 541
 542 static void
 543  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 544 {
 545     int8 shiftCount;
 546
 547     shiftCount = countLeadingZeros64( aSig );
 548     *zSigPtr = aSig<<shiftCount;
 549     *zExpPtr = 1 - shiftCount;
 550
 551 }
 552
 553 /*----------------------------------------------------------------------------
 554 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 555 | extended double-precision floating-point value, returning the result.
 556 *----------------------------------------------------------------------------*/
 557
 558 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 559 {
 560     floatx80 z;
 561
 562     z.low = zSig;
 563     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 564     return z;
 565
 566 }
 567
 568 /*----------------------------------------------------------------------------
 569 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 570 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 571 | and returns the proper extended double-precision floating-point value
 572 | corresponding to the abstract input.  Ordinarily, the abstract value is
 573 | rounded and packed into the extended double-precision format, with the
 574 | inexact exception raised if the abstract input cannot be represented
 575 | exactly.  However, if the abstract value is too large, the overflow and
 576 | inexact exceptions are raised and an infinity or maximal finite value is
 577 | returned.  If the abstract value is too small, the input value is rounded to
 578 | a subnormal number, and the underflow and inexact exceptions are raised if
 579 | the abstract input cannot be represented exactly as a subnormal extended
 580 | double-precision floating-point number.
 581 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 582 | number of bits as single or double precision, respectively.  Otherwise, the
 583 | result is rounded to the full precision of the extended double-precision
 584 | format.
 585 |     The input significand must be normalized or smaller.  If the input
 586 | significand is not normalized, `zExp' must be 0; in that case, the result
 587 | returned is a subnormal number, and it must not require rounding.  The
 588 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 589 | Floating-Point Arithmetic.
 590 *----------------------------------------------------------------------------*/
 591
 592 static floatx80
 593  roundAndPackFloatx80(
 594      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 595  STATUS_PARAM)
 596 {
 597     int8 roundingMode;
 598     flag roundNearestEven, increment, isTiny;
 599     int64 roundIncrement, roundMask, roundBits;
 600
 601     roundingMode = STATUS(float_rounding_mode);
 602     roundNearestEven = ( roundingMode == float_round_nearest_even );
 603     if ( roundingPrecision == 80 ) goto precision80;
 604     if ( roundingPrecision == 64 ) {
 605         roundIncrement = LIT64( 0x0000000000000400 );
 606         roundMask = LIT64( 0x00000000000007FF );
 607     }
 608     else if ( roundingPrecision == 32 ) {
 609         roundIncrement = LIT64( 0x0000008000000000 );
 610         roundMask = LIT64( 0x000000FFFFFFFFFF );
 611     }
 612     else {
 613         goto precision80;
 614     }
 615     zSig0 |= ( zSig1 != 0 );
 616     if ( ! roundNearestEven ) {
 617         if ( roundingMode == float_round_to_zero ) {
 618             roundIncrement = 0;
 619         }
 620         else {
 621             roundIncrement = roundMask;
 622             if ( zSign ) {
 623                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 624             }
 625             else {
 626                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 627             }
 628         }
 629     }
 630     roundBits = zSig0 & roundMask;
 631     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 632         if (    ( 0x7FFE < zExp )
 633              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 634            ) {
 635             goto overflow;
 636         }
 637         if ( zExp <= 0 ) {
 638             isTiny =
 639                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 640                 || ( zExp < 0 )
 641                 || ( zSig0 <= zSig0 + roundIncrement );
 642             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 643             zExp = 0;
 644             roundBits = zSig0 & roundMask;
 645             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 646             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 647             zSig0 += roundIncrement;
 648             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 649             roundIncrement = roundMask + 1;
 650             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 651                 roundMask |= roundIncrement;
 652             }
 653             zSig0 &= ~ roundMask;
 654             return packFloatx80( zSign, zExp, zSig0 );
 655         }
 656     }
 657     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 658     zSig0 += roundIncrement;
 659     if ( zSig0 < roundIncrement ) {
 660         ++zExp;
 661         zSig0 = LIT64( 0x8000000000000000 );
 662     }
 663     roundIncrement = roundMask + 1;
 664     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 665         roundMask |= roundIncrement;
 666     }
 667     zSig0 &= ~ roundMask;
 668     if ( zSig0 == 0 ) zExp = 0;
 669     return packFloatx80( zSign, zExp, zSig0 );
 670  precision80:
 671     increment = ( (sbits64) zSig1 < 0 );
 672     if ( ! roundNearestEven ) {
 673         if ( roundingMode == float_round_to_zero ) {
 674             increment = 0;
 675         }
 676         else {
 677             if ( zSign ) {
 678                 increment = ( roundingMode == float_round_down ) && zSig1;
 679             }
 680             else {
 681                 increment = ( roundingMode == float_round_up ) && zSig1;
 682             }
 683         }
 684     }
 685     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 686         if (    ( 0x7FFE < zExp )
 687              || (    ( zExp == 0x7FFE )
 688                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 689                   && increment
 690                 )
 691            ) {
 692             roundMask = 0;
 693  overflow:
 694             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 695             if (    ( roundingMode == float_round_to_zero )
 696                  || ( zSign && ( roundingMode == float_round_up ) )
 697                  || ( ! zSign && ( roundingMode == float_round_down ) )
 698                ) {
 699                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 700             }
 701             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 702         }
 703         if ( zExp <= 0 ) {
 704             isTiny =
 705                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 706                 || ( zExp < 0 )
 707                 || ! increment
 708                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 709             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 710             zExp = 0;
 711             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 712             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 713             if ( roundNearestEven ) {
 714                 increment = ( (sbits64) zSig1 < 0 );
 715             }
 716             else {
 717                 if ( zSign ) {
 718                     increment = ( roundingMode == float_round_down ) && zSig1;
 719                 }
 720                 else {
 721                     increment = ( roundingMode == float_round_up ) && zSig1;
 722                 }
 723             }
 724             if ( increment ) {
 725                 ++zSig0;
 726                 zSig0 &=
 727                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 728                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 729             }
 730             return packFloatx80( zSign, zExp, zSig0 );
 731         }
 732     }
 733     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 734     if ( increment ) {
 735         ++zSig0;
 736         if ( zSig0 == 0 ) {
 737             ++zExp;
 738             zSig0 = LIT64( 0x8000000000000000 );
 739         }
 740         else {
 741             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 742         }
 743     }
 744     else {
 745         if ( zSig0 == 0 ) zExp = 0;
 746     }
 747     return packFloatx80( zSign, zExp, zSig0 );
 748
 749 }
 750
 751 /*----------------------------------------------------------------------------
 752 | Takes an abstract floating-point value having sign `zSign', exponent
 753 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 754 | and returns the proper extended double-precision floating-point value
 755 | corresponding to the abstract input.  This routine is just like
 756 | `roundAndPackFloatx80' except that the input significand does not have to be
 757 | normalized.
 758 *----------------------------------------------------------------------------*/
 759
 760 static floatx80
 761  normalizeRoundAndPackFloatx80(
 762      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 763  STATUS_PARAM)
 764 {
 765     int8 shiftCount;
 766
 767     if ( zSig0 == 0 ) {
 768         zSig0 = zSig1;
 769         zSig1 = 0;
 770         zExp -= 64;
 771     }
 772     shiftCount = countLeadingZeros64( zSig0 );
 773     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 774     zExp -= shiftCount;
 775     return
 776         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 777
 778 }
 779
 780 #endif
 781
 782 #ifdef FLOAT128
 783
 784 /*----------------------------------------------------------------------------
 785 | Returns the least-significant 64 fraction bits of the quadruple-precision
 786 | floating-point value `a'.
 787 *----------------------------------------------------------------------------*/
 788
 789 INLINE bits64 extractFloat128Frac1( float128 a )
 790 {
 791
 792     return a.low;
 793
 794 }
 795
 796 /*----------------------------------------------------------------------------
 797 | Returns the most-significant 48 fraction bits of the quadruple-precision
 798 | floating-point value `a'.
 799 *----------------------------------------------------------------------------*/
 800
 801 INLINE bits64 extractFloat128Frac0( float128 a )
 802 {
 803
 804     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 805
 806 }
 807
 808 /*----------------------------------------------------------------------------
 809 | Returns the exponent bits of the quadruple-precision floating-point value
 810 | `a'.
 811 *----------------------------------------------------------------------------*/
 812
 813 INLINE int32 extractFloat128Exp( float128 a )
 814 {
 815
 816     return ( a.high>>48 ) & 0x7FFF;
 817
 818 }
 819
 820 /*----------------------------------------------------------------------------
 821 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 822 *----------------------------------------------------------------------------*/
 823
 824 INLINE flag extractFloat128Sign( float128 a )
 825 {
 826
 827     return a.high>>63;
 828
 829 }
 830
 831 /*----------------------------------------------------------------------------
 832 | Normalizes the subnormal quadruple-precision floating-point value
 833 | represented by the denormalized significand formed by the concatenation of
 834 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 835 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 836 | significand are stored at the location pointed to by `zSig0Ptr', and the
 837 | least significant 64 bits of the normalized significand are stored at the
 838 | location pointed to by `zSig1Ptr'.
 839 *----------------------------------------------------------------------------*/
 840
 841 static void
 842  normalizeFloat128Subnormal(
 843      bits64 aSig0,
 844      bits64 aSig1,
 845      int32 *zExpPtr,
 846      bits64 *zSig0Ptr,
 847      bits64 *zSig1Ptr
 848  )
 849 {
 850     int8 shiftCount;
 851
 852     if ( aSig0 == 0 ) {
 853         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 854         if ( shiftCount < 0 ) {
 855             *zSig0Ptr = aSig1>>( - shiftCount );
 856             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 857         }
 858         else {
 859             *zSig0Ptr = aSig1<<shiftCount;
 860             *zSig1Ptr = 0;
 861         }
 862         *zExpPtr = - shiftCount - 63;
 863     }
 864     else {
 865         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 866         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 867         *zExpPtr = 1 - shiftCount;
 868     }
 869
 870 }
 871
 872 /*----------------------------------------------------------------------------
 873 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 874 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 875 | floating-point value, returning the result.  After being shifted into the
 876 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 877 | added together to form the most significant 32 bits of the result.  This
 878 | means that any integer portion of `zSig0' will be added into the exponent.
 879 | Since a properly normalized significand will have an integer portion equal
 880 | to 1, the `zExp' input should be 1 less than the desired result exponent
 881 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 882 | significand.
 883 *----------------------------------------------------------------------------*/
 884
 885 INLINE float128
 886  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 887 {
 888     float128 z;
 889
 890     z.low = zSig1;
 891     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 892     return z;
 893
 894 }
 895
 896 /*----------------------------------------------------------------------------
 897 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 898 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 899 | and `zSig2', and returns the proper quadruple-precision floating-point value
 900 | corresponding to the abstract input.  Ordinarily, the abstract value is
 901 | simply rounded and packed into the quadruple-precision format, with the
 902 | inexact exception raised if the abstract input cannot be represented
 903 | exactly.  However, if the abstract value is too large, the overflow and
 904 | inexact exceptions are raised and an infinity or maximal finite value is
 905 | returned.  If the abstract value is too small, the input value is rounded to
 906 | a subnormal number, and the underflow and inexact exceptions are raised if
 907 | the abstract input cannot be represented exactly as a subnormal quadruple-
 908 | precision floating-point number.
 909 |     The input significand must be normalized or smaller.  If the input
 910 | significand is not normalized, `zExp' must be 0; in that case, the result
 911 | returned is a subnormal number, and it must not require rounding.  In the
 912 | usual case that the input significand is normalized, `zExp' must be 1 less
 913 | than the ``true'' floating-point exponent.  The handling of underflow and
 914 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 915 *----------------------------------------------------------------------------*/
 916
 917 static float128
 918  roundAndPackFloat128(
 919      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 920 {
 921     int8 roundingMode;
 922     flag roundNearestEven, increment, isTiny;
 923
 924     roundingMode = STATUS(float_rounding_mode);
 925     roundNearestEven = ( roundingMode == float_round_nearest_even );
 926     increment = ( (sbits64) zSig2 < 0 );
 927     if ( ! roundNearestEven ) {
 928         if ( roundingMode == float_round_to_zero ) {
 929             increment = 0;
 930         }
 931         else {
 932             if ( zSign ) {
 933                 increment = ( roundingMode == float_round_down ) && zSig2;
 934             }
 935             else {
 936                 increment = ( roundingMode == float_round_up ) && zSig2;
 937             }
 938         }
 939     }
 940     if ( 0x7FFD <= (bits32) zExp ) {
 941         if (    ( 0x7FFD < zExp )
 942              || (    ( zExp == 0x7FFD )
 943                   && eq128(
 944                          LIT64( 0x0001FFFFFFFFFFFF ),
 945                          LIT64( 0xFFFFFFFFFFFFFFFF ),
 946                          zSig0,
 947                          zSig1
 948                      )
 949                   && increment
 950                 )
 951            ) {
 952             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 953             if (    ( roundingMode == float_round_to_zero )
 954                  || ( zSign && ( roundingMode == float_round_up ) )
 955                  || ( ! zSign && ( roundingMode == float_round_down ) )
 956                ) {
 957                 return
 958                     packFloat128(
 959                         zSign,
 960                         0x7FFE,
 961                         LIT64( 0x0000FFFFFFFFFFFF ),
 962                         LIT64( 0xFFFFFFFFFFFFFFFF )
 963                     );
 964             }
 965             return packFloat128( zSign, 0x7FFF, 0, 0 );
 966         }
 967         if ( zExp < 0 ) {
 968             isTiny =
 969                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 970                 || ( zExp < -1 )
 971                 || ! increment
 972                 || lt128(
 973                        zSig0,
 974                        zSig1,
 975                        LIT64( 0x0001FFFFFFFFFFFF ),
 976                        LIT64( 0xFFFFFFFFFFFFFFFF )
 977                    );
 978             shift128ExtraRightJamming(
 979                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 980             zExp = 0;
 981             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
 982             if ( roundNearestEven ) {
 983                 increment = ( (sbits64) zSig2 < 0 );
 984             }
 985             else {
 986                 if ( zSign ) {
 987                     increment = ( roundingMode == float_round_down ) && zSig2;
 988                 }
 989                 else {
 990                     increment = ( roundingMode == float_round_up ) && zSig2;
 991                 }
 992             }
 993         }
 994     }
 995     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
 996     if ( increment ) {
 997         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
 998         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
 999     }
1000     else {
1001         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1002     }
1003     return packFloat128( zSign, zExp, zSig0, zSig1 );
1004
1005 }
1006
1007 /*----------------------------------------------------------------------------
1008 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1009 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1010 | returns the proper quadruple-precision floating-point value corresponding
1011 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1012 | except that the input significand has fewer bits and does not have to be
1013 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1014 | point exponent.
1015 *----------------------------------------------------------------------------*/
1016
1017 static float128
1018  normalizeRoundAndPackFloat128(
1019      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1020 {
1021     int8 shiftCount;
1022     bits64 zSig2;
1023
1024     if ( zSig0 == 0 ) {
1025         zSig0 = zSig1;
1026         zSig1 = 0;
1027         zExp -= 64;
1028     }
1029     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1030     if ( 0 <= shiftCount ) {
1031         zSig2 = 0;
1032         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1033     }
1034     else {
1035         shift128ExtraRightJamming(
1036             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1037     }
1038     zExp -= shiftCount;
1039     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1040
1041 }
1042
1043 #endif
1044
1045 /*----------------------------------------------------------------------------
1046 | Returns the result of converting the 32-bit two's complement integer `a'
1047 | to the single-precision floating-point format.  The conversion is performed
1048 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1049 *----------------------------------------------------------------------------*/
1050
1051 float32 int32_to_float32( int32 a STATUS_PARAM )
1052 {
1053     flag zSign;
1054
1055     if ( a == 0 ) return float32_zero;
1056     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1057     zSign = ( a < 0 );
1058     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1059
1060 }
1061
1062 /*----------------------------------------------------------------------------
1063 | Returns the result of converting the 32-bit two's complement integer `a'
1064 | to the double-precision floating-point format.  The conversion is performed
1065 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1066 *----------------------------------------------------------------------------*/
1067
1068 float64 int32_to_float64( int32 a STATUS_PARAM )
1069 {
1070     flag zSign;
1071     uint32 absA;
1072     int8 shiftCount;
1073     bits64 zSig;
1074
1075     if ( a == 0 ) return float64_zero;
1076     zSign = ( a < 0 );
1077     absA = zSign ? - a : a;
1078     shiftCount = countLeadingZeros32( absA ) + 21;
1079     zSig = absA;
1080     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1081
1082 }
1083
1084 #ifdef FLOATX80
1085
1086 /*----------------------------------------------------------------------------
1087 | Returns the result of converting the 32-bit two's complement integer `a'
1088 | to the extended double-precision floating-point format.  The conversion
1089 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1090 | Arithmetic.
1091 *----------------------------------------------------------------------------*/
1092
1093 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1094 {
1095     flag zSign;
1096     uint32 absA;
1097     int8 shiftCount;
1098     bits64 zSig;
1099
1100     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1101     zSign = ( a < 0 );
1102     absA = zSign ? - a : a;
1103     shiftCount = countLeadingZeros32( absA ) + 32;
1104     zSig = absA;
1105     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1106
1107 }
1108
1109 #endif
1110
1111 #ifdef FLOAT128
1112
1113 /*----------------------------------------------------------------------------
1114 | Returns the result of converting the 32-bit two's complement integer `a' to
1115 | the quadruple-precision floating-point format.  The conversion is performed
1116 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1117 *----------------------------------------------------------------------------*/
1118
1119 float128 int32_to_float128( int32 a STATUS_PARAM )
1120 {
1121     flag zSign;
1122     uint32 absA;
1123     int8 shiftCount;
1124     bits64 zSig0;
1125
1126     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1127     zSign = ( a < 0 );
1128     absA = zSign ? - a : a;
1129     shiftCount = countLeadingZeros32( absA ) + 17;
1130     zSig0 = absA;
1131     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1132
1133 }
1134
1135 #endif
1136
1137 /*----------------------------------------------------------------------------
1138 | Returns the result of converting the 64-bit two's complement integer `a'
1139 | to the single-precision floating-point format.  The conversion is performed
1140 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1141 *----------------------------------------------------------------------------*/
1142
1143 float32 int64_to_float32( int64 a STATUS_PARAM )
1144 {
1145     flag zSign;
1146     uint64 absA;
1147     int8 shiftCount;
1148
1149     if ( a == 0 ) return float32_zero;
1150     zSign = ( a < 0 );
1151     absA = zSign ? - a : a;
1152     shiftCount = countLeadingZeros64( absA ) - 40;
1153     if ( 0 <= shiftCount ) {
1154         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1155     }
1156     else {
1157         shiftCount += 7;
1158         if ( shiftCount < 0 ) {
1159             shift64RightJamming( absA, - shiftCount, &absA );
1160         }
1161         else {
1162             absA <<= shiftCount;
1163         }
1164         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1165     }
1166
1167 }
1168
1169 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1170 {
1171     int8 shiftCount;
1172
1173     if ( a == 0 ) return float32_zero;
1174     shiftCount = countLeadingZeros64( a ) - 40;
1175     if ( 0 <= shiftCount ) {
1176         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1177     }
1178     else {
1179         shiftCount += 7;
1180         if ( shiftCount < 0 ) {
1181             shift64RightJamming( a, - shiftCount, &a );
1182         }
1183         else {
1184             a <<= shiftCount;
1185         }
1186         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1187     }
1188 }
1189
1190 /*----------------------------------------------------------------------------
1191 | Returns the result of converting the 64-bit two's complement integer `a'
1192 | to the double-precision floating-point format.  The conversion is performed
1193 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1194 *----------------------------------------------------------------------------*/
1195
1196 float64 int64_to_float64( int64 a STATUS_PARAM )
1197 {
1198     flag zSign;
1199
1200     if ( a == 0 ) return float64_zero;
1201     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1202         return packFloat64( 1, 0x43E, 0 );
1203     }
1204     zSign = ( a < 0 );
1205     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1206
1207 }
1208
1209 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1210 {
1211     if ( a == 0 ) return float64_zero;
1212     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1213
1214 }
1215
1216 #ifdef FLOATX80
1217
1218 /*----------------------------------------------------------------------------
1219 | Returns the result of converting the 64-bit two's complement integer `a'
1220 | to the extended double-precision floating-point format.  The conversion
1221 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1222 | Arithmetic.
1223 *----------------------------------------------------------------------------*/
1224
1225 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1226 {
1227     flag zSign;
1228     uint64 absA;
1229     int8 shiftCount;
1230
1231     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1232     zSign = ( a < 0 );
1233     absA = zSign ? - a : a;
1234     shiftCount = countLeadingZeros64( absA );
1235     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1236
1237 }
1238
1239 #endif
1240
1241 #ifdef FLOAT128
1242
1243 /*----------------------------------------------------------------------------
1244 | Returns the result of converting the 64-bit two's complement integer `a' to
1245 | the quadruple-precision floating-point format.  The conversion is performed
1246 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1247 *----------------------------------------------------------------------------*/
1248
1249 float128 int64_to_float128( int64 a STATUS_PARAM )
1250 {
1251     flag zSign;
1252     uint64 absA;
1253     int8 shiftCount;
1254     int32 zExp;
1255     bits64 zSig0, zSig1;
1256
1257     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1258     zSign = ( a < 0 );
1259     absA = zSign ? - a : a;
1260     shiftCount = countLeadingZeros64( absA ) + 49;
1261     zExp = 0x406E - shiftCount;
1262     if ( 64 <= shiftCount ) {
1263         zSig1 = 0;
1264         zSig0 = absA;
1265         shiftCount -= 64;
1266     }
1267     else {
1268         zSig1 = absA;
1269         zSig0 = 0;
1270     }
1271     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1272     return packFloat128( zSign, zExp, zSig0, zSig1 );
1273
1274 }
1275
1276 #endif
1277
1278 /*----------------------------------------------------------------------------
1279 | Returns the result of converting the single-precision floating-point value
1280 | `a' to the 32-bit two's complement integer format.  The conversion is
1281 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1282 | Arithmetic---which means in particular that the conversion is rounded
1283 | according to the current rounding mode.  If `a' is a NaN, the largest
1284 | positive integer is returned.  Otherwise, if the conversion overflows, the
1285 | largest integer with the same sign as `a' is returned.
1286 *----------------------------------------------------------------------------*/
1287
1288 int32 float32_to_int32( float32 a STATUS_PARAM )
1289 {
1290     flag aSign;
1291     int16 aExp, shiftCount;
1292     bits32 aSig;
1293     bits64 aSig64;
1294
1295     aSig = extractFloat32Frac( a );
1296     aExp = extractFloat32Exp( a );
1297     aSign = extractFloat32Sign( a );
1298     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1299     if ( aExp ) aSig |= 0x00800000;
1300     shiftCount = 0xAF - aExp;
1301     aSig64 = aSig;
1302     aSig64 <<= 32;
1303     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1304     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1305
1306 }
1307
1308 /*----------------------------------------------------------------------------
1309 | Returns the result of converting the single-precision floating-point value
1310 | `a' to the 32-bit two's complement integer format.  The conversion is
1311 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1312 | Arithmetic, except that the conversion is always rounded toward zero.
1313 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1314 | the conversion overflows, the largest integer with the same sign as `a' is
1315 | returned.
1316 *----------------------------------------------------------------------------*/
1317
1318 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1319 {
1320     flag aSign;
1321     int16 aExp, shiftCount;
1322     bits32 aSig;
1323     int32 z;
1324
1325     aSig = extractFloat32Frac( a );
1326     aExp = extractFloat32Exp( a );
1327     aSign = extractFloat32Sign( a );
1328     shiftCount = aExp - 0x9E;
1329     if ( 0 <= shiftCount ) {
1330         if ( float32_val(a) != 0xCF000000 ) {
1331             float_raise( float_flag_invalid STATUS_VAR);
1332             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1333         }
1334         return (sbits32) 0x80000000;
1335     }
1336     else if ( aExp <= 0x7E ) {
1337         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1338         return 0;
1339     }
1340     aSig = ( aSig | 0x00800000 )<<8;
1341     z = aSig>>( - shiftCount );
1342     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1343         STATUS(float_exception_flags) |= float_flag_inexact;
1344     }
1345     if ( aSign ) z = - z;
1346     return z;
1347
1348 }
1349
1350 /*----------------------------------------------------------------------------
1351 | Returns the result of converting the single-precision floating-point value
1352 | `a' to the 64-bit two's complement integer format.  The conversion is
1353 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1354 | Arithmetic---which means in particular that the conversion is rounded
1355 | according to the current rounding mode.  If `a' is a NaN, the largest
1356 | positive integer is returned.  Otherwise, if the conversion overflows, the
1357 | largest integer with the same sign as `a' is returned.
1358 *----------------------------------------------------------------------------*/
1359
1360 int64 float32_to_int64( float32 a STATUS_PARAM )
1361 {
1362     flag aSign;
1363     int16 aExp, shiftCount;
1364     bits32 aSig;
1365     bits64 aSig64, aSigExtra;
1366
1367     aSig = extractFloat32Frac( a );
1368     aExp = extractFloat32Exp( a );
1369     aSign = extractFloat32Sign( a );
1370     shiftCount = 0xBE - aExp;
1371     if ( shiftCount < 0 ) {
1372         float_raise( float_flag_invalid STATUS_VAR);
1373         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1374             return LIT64( 0x7FFFFFFFFFFFFFFF );
1375         }
1376         return (sbits64) LIT64( 0x8000000000000000 );
1377     }
1378     if ( aExp ) aSig |= 0x00800000;
1379     aSig64 = aSig;
1380     aSig64 <<= 40;
1381     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1382     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1383
1384 }
1385
1386 /*----------------------------------------------------------------------------
1387 | Returns the result of converting the single-precision floating-point value
1388 | `a' to the 64-bit two's complement integer format.  The conversion is
1389 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1390 | Arithmetic, except that the conversion is always rounded toward zero.  If
1391 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1392 | conversion overflows, the largest integer with the same sign as `a' is
1393 | returned.
1394 *----------------------------------------------------------------------------*/
1395
1396 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1397 {
1398     flag aSign;
1399     int16 aExp, shiftCount;
1400     bits32 aSig;
1401     bits64 aSig64;
1402     int64 z;
1403
1404     aSig = extractFloat32Frac( a );
1405     aExp = extractFloat32Exp( a );
1406     aSign = extractFloat32Sign( a );
1407     shiftCount = aExp - 0xBE;
1408     if ( 0 <= shiftCount ) {
1409         if ( float32_val(a) != 0xDF000000 ) {
1410             float_raise( float_flag_invalid STATUS_VAR);
1411             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1412                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1413             }
1414         }
1415         return (sbits64) LIT64( 0x8000000000000000 );
1416     }
1417     else if ( aExp <= 0x7E ) {
1418         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1419         return 0;
1420     }
1421     aSig64 = aSig | 0x00800000;
1422     aSig64 <<= 40;
1423     z = aSig64>>( - shiftCount );
1424     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1425         STATUS(float_exception_flags) |= float_flag_inexact;
1426     }
1427     if ( aSign ) z = - z;
1428     return z;
1429
1430 }
1431
1432 /*----------------------------------------------------------------------------
1433 | Returns the result of converting the single-precision floating-point value
1434 | `a' to the double-precision floating-point format.  The conversion is
1435 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1436 | Arithmetic.
1437 *----------------------------------------------------------------------------*/
1438
1439 float64 float32_to_float64( float32 a STATUS_PARAM )
1440 {
1441     flag aSign;
1442     int16 aExp;
1443     bits32 aSig;
1444
1445     aSig = extractFloat32Frac( a );
1446     aExp = extractFloat32Exp( a );
1447     aSign = extractFloat32Sign( a );
1448     if ( aExp == 0xFF ) {
1449         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1450         return packFloat64( aSign, 0x7FF, 0 );
1451     }
1452     if ( aExp == 0 ) {
1453         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1454         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1455         --aExp;
1456     }
1457     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1458
1459 }
1460
1461 #ifdef FLOATX80
1462
1463 /*----------------------------------------------------------------------------
1464 | Returns the result of converting the single-precision floating-point value
1465 | `a' to the extended double-precision floating-point format.  The conversion
1466 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1467 | Arithmetic.
1468 *----------------------------------------------------------------------------*/
1469
1470 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1471 {
1472     flag aSign;
1473     int16 aExp;
1474     bits32 aSig;
1475
1476     aSig = extractFloat32Frac( a );
1477     aExp = extractFloat32Exp( a );
1478     aSign = extractFloat32Sign( a );
1479     if ( aExp == 0xFF ) {
1480         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1481         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1482     }
1483     if ( aExp == 0 ) {
1484         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1485         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1486     }
1487     aSig |= 0x00800000;
1488     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1489
1490 }
1491
1492 #endif
1493
1494 #ifdef FLOAT128
1495
1496 /*----------------------------------------------------------------------------
1497 | Returns the result of converting the single-precision floating-point value
1498 | `a' to the double-precision floating-point format.  The conversion is
1499 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1500 | Arithmetic.
1501 *----------------------------------------------------------------------------*/
1502
1503 float128 float32_to_float128( float32 a STATUS_PARAM )
1504 {
1505     flag aSign;
1506     int16 aExp;
1507     bits32 aSig;
1508
1509     aSig = extractFloat32Frac( a );
1510     aExp = extractFloat32Exp( a );
1511     aSign = extractFloat32Sign( a );
1512     if ( aExp == 0xFF ) {
1513         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1514         return packFloat128( aSign, 0x7FFF, 0, 0 );
1515     }
1516     if ( aExp == 0 ) {
1517         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1518         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1519         --aExp;
1520     }
1521     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1522
1523 }
1524
1525 #endif
1526
1527 /*----------------------------------------------------------------------------
1528 | Rounds the single-precision floating-point value `a' to an integer, and
1529 | returns the result as a single-precision floating-point value.  The
1530 | operation is performed according to the IEC/IEEE Standard for Binary
1531 | Floating-Point Arithmetic.
1532 *----------------------------------------------------------------------------*/
1533
1534 float32 float32_round_to_int( float32 a STATUS_PARAM)
1535 {
1536     flag aSign;
1537     int16 aExp;
1538     bits32 lastBitMask, roundBitsMask;
1539     int8 roundingMode;
1540     bits32 z;
1541
1542     aExp = extractFloat32Exp( a );
1543     if ( 0x96 <= aExp ) {
1544         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1545             return propagateFloat32NaN( a, a STATUS_VAR );
1546         }
1547         return a;
1548     }
1549     if ( aExp <= 0x7E ) {
1550         if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1551         STATUS(float_exception_flags) |= float_flag_inexact;
1552         aSign = extractFloat32Sign( a );
1553         switch ( STATUS(float_rounding_mode) ) {
1554          case float_round_nearest_even:
1555             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1556                 return packFloat32( aSign, 0x7F, 0 );
1557             }
1558             break;
1559          case float_round_down:
1560             return make_float32(aSign ? 0xBF800000 : 0);
1561          case float_round_up:
1562             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1563         }
1564         return packFloat32( aSign, 0, 0 );
1565     }
1566     lastBitMask = 1;
1567     lastBitMask <<= 0x96 - aExp;
1568     roundBitsMask = lastBitMask - 1;
1569     z = float32_val(a);
1570     roundingMode = STATUS(float_rounding_mode);
1571     if ( roundingMode == float_round_nearest_even ) {
1572         z += lastBitMask>>1;
1573         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1574     }
1575     else if ( roundingMode != float_round_to_zero ) {
1576         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1577             z += roundBitsMask;
1578         }
1579     }
1580     z &= ~ roundBitsMask;
1581     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1582     return make_float32(z);
1583
1584 }
1585
1586 /*----------------------------------------------------------------------------
1587 | Returns the result of adding the absolute values of the single-precision
1588 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1589 | before being returned.  `zSign' is ignored if the result is a NaN.
1590 | The addition is performed according to the IEC/IEEE Standard for Binary
1591 | Floating-Point Arithmetic.
1592 *----------------------------------------------------------------------------*/
1593
1594 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1595 {
1596     int16 aExp, bExp, zExp;
1597     bits32 aSig, bSig, zSig;
1598     int16 expDiff;
1599
1600     aSig = extractFloat32Frac( a );
1601     aExp = extractFloat32Exp( a );
1602     bSig = extractFloat32Frac( b );
1603     bExp = extractFloat32Exp( b );
1604     expDiff = aExp - bExp;
1605     aSig <<= 6;
1606     bSig <<= 6;
1607     if ( 0 < expDiff ) {
1608         if ( aExp == 0xFF ) {
1609             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1610             return a;
1611         }
1612         if ( bExp == 0 ) {
1613             --expDiff;
1614         }
1615         else {
1616             bSig |= 0x20000000;
1617         }
1618         shift32RightJamming( bSig, expDiff, &bSig );
1619         zExp = aExp;
1620     }
1621     else if ( expDiff < 0 ) {
1622         if ( bExp == 0xFF ) {
1623             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1624             return packFloat32( zSign, 0xFF, 0 );
1625         }
1626         if ( aExp == 0 ) {
1627             ++expDiff;
1628         }
1629         else {
1630             aSig |= 0x20000000;
1631         }
1632         shift32RightJamming( aSig, - expDiff, &aSig );
1633         zExp = bExp;
1634     }
1635     else {
1636         if ( aExp == 0xFF ) {
1637             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1638             return a;
1639         }
1640         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1641         zSig = 0x40000000 + aSig + bSig;
1642         zExp = aExp;
1643         goto roundAndPack;
1644     }
1645     aSig |= 0x20000000;
1646     zSig = ( aSig + bSig )<<1;
1647     --zExp;
1648     if ( (sbits32) zSig < 0 ) {
1649         zSig = aSig + bSig;
1650         ++zExp;
1651     }
1652  roundAndPack:
1653     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1654
1655 }
1656
1657 /*----------------------------------------------------------------------------
1658 | Returns the result of subtracting the absolute values of the single-
1659 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1660 | difference is negated before being returned.  `zSign' is ignored if the
1661 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1662 | Standard for Binary Floating-Point Arithmetic.
1663 *----------------------------------------------------------------------------*/
1664
1665 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1666 {
1667     int16 aExp, bExp, zExp;
1668     bits32 aSig, bSig, zSig;
1669     int16 expDiff;
1670
1671     aSig = extractFloat32Frac( a );
1672     aExp = extractFloat32Exp( a );
1673     bSig = extractFloat32Frac( b );
1674     bExp = extractFloat32Exp( b );
1675     expDiff = aExp - bExp;
1676     aSig <<= 7;
1677     bSig <<= 7;
1678     if ( 0 < expDiff ) goto aExpBigger;
1679     if ( expDiff < 0 ) goto bExpBigger;
1680     if ( aExp == 0xFF ) {
1681         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1682         float_raise( float_flag_invalid STATUS_VAR);
1683         return float32_default_nan;
1684     }
1685     if ( aExp == 0 ) {
1686         aExp = 1;
1687         bExp = 1;
1688     }
1689     if ( bSig < aSig ) goto aBigger;
1690     if ( aSig < bSig ) goto bBigger;
1691     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1692  bExpBigger:
1693     if ( bExp == 0xFF ) {
1694         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1695         return packFloat32( zSign ^ 1, 0xFF, 0 );
1696     }
1697     if ( aExp == 0 ) {
1698         ++expDiff;
1699     }
1700     else {
1701         aSig |= 0x40000000;
1702     }
1703     shift32RightJamming( aSig, - expDiff, &aSig );
1704     bSig |= 0x40000000;
1705  bBigger:
1706     zSig = bSig - aSig;
1707     zExp = bExp;
1708     zSign ^= 1;
1709     goto normalizeRoundAndPack;
1710  aExpBigger:
1711     if ( aExp == 0xFF ) {
1712         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1713         return a;
1714     }
1715     if ( bExp == 0 ) {
1716         --expDiff;
1717     }
1718     else {
1719         bSig |= 0x40000000;
1720     }
1721     shift32RightJamming( bSig, expDiff, &bSig );
1722     aSig |= 0x40000000;
1723  aBigger:
1724     zSig = aSig - bSig;
1725     zExp = aExp;
1726  normalizeRoundAndPack:
1727     --zExp;
1728     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1729
1730 }
1731
1732 /*----------------------------------------------------------------------------
1733 | Returns the result of adding the single-precision floating-point values `a'
1734 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1735 | Binary Floating-Point Arithmetic.
1736 *----------------------------------------------------------------------------*/
1737
1738 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1739 {
1740     flag aSign, bSign;
1741
1742     aSign = extractFloat32Sign( a );
1743     bSign = extractFloat32Sign( b );
1744     if ( aSign == bSign ) {
1745         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1746     }
1747     else {
1748         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1749     }
1750
1751 }
1752
1753 /*----------------------------------------------------------------------------
1754 | Returns the result of subtracting the single-precision floating-point values
1755 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1756 | for Binary Floating-Point Arithmetic.
1757 *----------------------------------------------------------------------------*/
1758
1759 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1760 {
1761     flag aSign, bSign;
1762
1763     aSign = extractFloat32Sign( a );
1764     bSign = extractFloat32Sign( b );
1765     if ( aSign == bSign ) {
1766         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1767     }
1768     else {
1769         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1770     }
1771
1772 }
1773
1774 /*----------------------------------------------------------------------------
1775 | Returns the result of multiplying the single-precision floating-point values
1776 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1777 | for Binary Floating-Point Arithmetic.
1778 *----------------------------------------------------------------------------*/
1779
1780 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1781 {
1782     flag aSign, bSign, zSign;
1783     int16 aExp, bExp, zExp;
1784     bits32 aSig, bSig;
1785     bits64 zSig64;
1786     bits32 zSig;
1787
1788     aSig = extractFloat32Frac( a );
1789     aExp = extractFloat32Exp( a );
1790     aSign = extractFloat32Sign( a );
1791     bSig = extractFloat32Frac( b );
1792     bExp = extractFloat32Exp( b );
1793     bSign = extractFloat32Sign( b );
1794     zSign = aSign ^ bSign;
1795     if ( aExp == 0xFF ) {
1796         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1797             return propagateFloat32NaN( a, b STATUS_VAR );
1798         }
1799         if ( ( bExp | bSig ) == 0 ) {
1800             float_raise( float_flag_invalid STATUS_VAR);
1801             return float32_default_nan;
1802         }
1803         return packFloat32( zSign, 0xFF, 0 );
1804     }
1805     if ( bExp == 0xFF ) {
1806         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1807         if ( ( aExp | aSig ) == 0 ) {
1808             float_raise( float_flag_invalid STATUS_VAR);
1809             return float32_default_nan;
1810         }
1811         return packFloat32( zSign, 0xFF, 0 );
1812     }
1813     if ( aExp == 0 ) {
1814         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1815         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1816     }
1817     if ( bExp == 0 ) {
1818         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1819         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1820     }
1821     zExp = aExp + bExp - 0x7F;
1822     aSig = ( aSig | 0x00800000 )<<7;
1823     bSig = ( bSig | 0x00800000 )<<8;
1824     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1825     zSig = zSig64;
1826     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1827         zSig <<= 1;
1828         --zExp;
1829     }
1830     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1831
1832 }
1833
1834 /*----------------------------------------------------------------------------
1835 | Returns the result of dividing the single-precision floating-point value `a'
1836 | by the corresponding value `b'.  The operation is performed according to the
1837 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1838 *----------------------------------------------------------------------------*/
1839
1840 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1841 {
1842     flag aSign, bSign, zSign;
1843     int16 aExp, bExp, zExp;
1844     bits32 aSig, bSig, zSig;
1845
1846     aSig = extractFloat32Frac( a );
1847     aExp = extractFloat32Exp( a );
1848     aSign = extractFloat32Sign( a );
1849     bSig = extractFloat32Frac( b );
1850     bExp = extractFloat32Exp( b );
1851     bSign = extractFloat32Sign( b );
1852     zSign = aSign ^ bSign;
1853     if ( aExp == 0xFF ) {
1854         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1855         if ( bExp == 0xFF ) {
1856             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1857             float_raise( float_flag_invalid STATUS_VAR);
1858             return float32_default_nan;
1859         }
1860         return packFloat32( zSign, 0xFF, 0 );
1861     }
1862     if ( bExp == 0xFF ) {
1863         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864         return packFloat32( zSign, 0, 0 );
1865     }
1866     if ( bExp == 0 ) {
1867         if ( bSig == 0 ) {
1868             if ( ( aExp | aSig ) == 0 ) {
1869                 float_raise( float_flag_invalid STATUS_VAR);
1870                 return float32_default_nan;
1871             }
1872             float_raise( float_flag_divbyzero STATUS_VAR);
1873             return packFloat32( zSign, 0xFF, 0 );
1874         }
1875         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1876     }
1877     if ( aExp == 0 ) {
1878         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1879         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1880     }
1881     zExp = aExp - bExp + 0x7D;
1882     aSig = ( aSig | 0x00800000 )<<7;
1883     bSig = ( bSig | 0x00800000 )<<8;
1884     if ( bSig <= ( aSig + aSig ) ) {
1885         aSig >>= 1;
1886         ++zExp;
1887     }
1888     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1889     if ( ( zSig & 0x3F ) == 0 ) {
1890         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1891     }
1892     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1893
1894 }
1895
1896 /*----------------------------------------------------------------------------
1897 | Returns the remainder of the single-precision floating-point value `a'
1898 | with respect to the corresponding value `b'.  The operation is performed
1899 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1900 *----------------------------------------------------------------------------*/
1901
1902 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1903 {
1904     flag aSign, bSign, zSign;
1905     int16 aExp, bExp, expDiff;
1906     bits32 aSig, bSig;
1907     bits32 q;
1908     bits64 aSig64, bSig64, q64;
1909     bits32 alternateASig;
1910     sbits32 sigMean;
1911
1912     aSig = extractFloat32Frac( a );
1913     aExp = extractFloat32Exp( a );
1914     aSign = extractFloat32Sign( a );
1915     bSig = extractFloat32Frac( b );
1916     bExp = extractFloat32Exp( b );
1917     bSign = extractFloat32Sign( b );
1918     if ( aExp == 0xFF ) {
1919         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1920             return propagateFloat32NaN( a, b STATUS_VAR );
1921         }
1922         float_raise( float_flag_invalid STATUS_VAR);
1923         return float32_default_nan;
1924     }
1925     if ( bExp == 0xFF ) {
1926         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1927         return a;
1928     }
1929     if ( bExp == 0 ) {
1930         if ( bSig == 0 ) {
1931             float_raise( float_flag_invalid STATUS_VAR);
1932             return float32_default_nan;
1933         }
1934         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1935     }
1936     if ( aExp == 0 ) {
1937         if ( aSig == 0 ) return a;
1938         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1939     }
1940     expDiff = aExp - bExp;
1941     aSig |= 0x00800000;
1942     bSig |= 0x00800000;
1943     if ( expDiff < 32 ) {
1944         aSig <<= 8;
1945         bSig <<= 8;
1946         if ( expDiff < 0 ) {
1947             if ( expDiff < -1 ) return a;
1948             aSig >>= 1;
1949         }
1950         q = ( bSig <= aSig );
1951         if ( q ) aSig -= bSig;
1952         if ( 0 < expDiff ) {
1953             q = ( ( (bits64) aSig )<<32 ) / bSig;
1954             q >>= 32 - expDiff;
1955             bSig >>= 2;
1956             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1957         }
1958         else {
1959             aSig >>= 2;
1960             bSig >>= 2;
1961         }
1962     }
1963     else {
1964         if ( bSig <= aSig ) aSig -= bSig;
1965         aSig64 = ( (bits64) aSig )<<40;
1966         bSig64 = ( (bits64) bSig )<<40;
1967         expDiff -= 64;
1968         while ( 0 < expDiff ) {
1969             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1970             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1971             aSig64 = - ( ( bSig * q64 )<<38 );
1972             expDiff -= 62;
1973         }
1974         expDiff += 64;
1975         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1976         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1977         q = q64>>( 64 - expDiff );
1978         bSig <<= 6;
1979         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1980     }
1981     do {
1982         alternateASig = aSig;
1983         ++q;
1984         aSig -= bSig;
1985     } while ( 0 <= (sbits32) aSig );
1986     sigMean = aSig + alternateASig;
1987     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1988         aSig = alternateASig;
1989     }
1990     zSign = ( (sbits32) aSig < 0 );
1991     if ( zSign ) aSig = - aSig;
1992     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1993
1994 }
1995
1996 /*----------------------------------------------------------------------------
1997 | Returns the square root of the single-precision floating-point value `a'.
1998 | The operation is performed according to the IEC/IEEE Standard for Binary
1999 | Floating-Point Arithmetic.
2000 *----------------------------------------------------------------------------*/
2001
2002 float32 float32_sqrt( float32 a STATUS_PARAM )
2003 {
2004     flag aSign;
2005     int16 aExp, zExp;
2006     bits32 aSig, zSig;
2007     bits64 rem, term;
2008
2009     aSig = extractFloat32Frac( a );
2010     aExp = extractFloat32Exp( a );
2011     aSign = extractFloat32Sign( a );
2012     if ( aExp == 0xFF ) {
2013         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2014         if ( ! aSign ) return a;
2015         float_raise( float_flag_invalid STATUS_VAR);
2016         return float32_default_nan;
2017     }
2018     if ( aSign ) {
2019         if ( ( aExp | aSig ) == 0 ) return a;
2020         float_raise( float_flag_invalid STATUS_VAR);
2021         return float32_default_nan;
2022     }
2023     if ( aExp == 0 ) {
2024         if ( aSig == 0 ) return float32_zero;
2025         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2026     }
2027     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2028     aSig = ( aSig | 0x00800000 )<<8;
2029     zSig = estimateSqrt32( aExp, aSig ) + 2;
2030     if ( ( zSig & 0x7F ) <= 5 ) {
2031         if ( zSig < 2 ) {
2032             zSig = 0x7FFFFFFF;
2033             goto roundAndPack;
2034         }
2035         aSig >>= aExp & 1;
2036         term = ( (bits64) zSig ) * zSig;
2037         rem = ( ( (bits64) aSig )<<32 ) - term;
2038         while ( (sbits64) rem < 0 ) {
2039             --zSig;
2040             rem += ( ( (bits64) zSig )<<1 ) | 1;
2041         }
2042         zSig |= ( rem != 0 );
2043     }
2044     shift32RightJamming( zSig, 1, &zSig );
2045  roundAndPack:
2046     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2047
2048 }
2049
2050 /*----------------------------------------------------------------------------
2051 | Returns 1 if the single-precision floating-point value `a' is equal to
2052 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2053 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2054 *----------------------------------------------------------------------------*/
2055
2056 int float32_eq( float32 a, float32 b STATUS_PARAM )
2057 {
2058
2059     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2060          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2061        ) {
2062         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2063             float_raise( float_flag_invalid STATUS_VAR);
2064         }
2065         return 0;
2066     }
2067     return ( float32_val(a) == float32_val(b) ) ||
2068             ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2069
2070 }
2071
2072 /*----------------------------------------------------------------------------
2073 | Returns 1 if the single-precision floating-point value `a' is less than
2074 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2075 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2076 | Arithmetic.
2077 *----------------------------------------------------------------------------*/
2078
2079 int float32_le( float32 a, float32 b STATUS_PARAM )
2080 {
2081     flag aSign, bSign;
2082     bits32 av, bv;
2083
2084     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2085          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2086        ) {
2087         float_raise( float_flag_invalid STATUS_VAR);
2088         return 0;
2089     }
2090     aSign = extractFloat32Sign( a );
2091     bSign = extractFloat32Sign( b );
2092     av = float32_val(a);
2093     bv = float32_val(b);
2094     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2095     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2096
2097 }
2098
2099 /*----------------------------------------------------------------------------
2100 | Returns 1 if the single-precision floating-point value `a' is less than
2101 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2102 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2103 *----------------------------------------------------------------------------*/
2104
2105 int float32_lt( float32 a, float32 b STATUS_PARAM )
2106 {
2107     flag aSign, bSign;
2108     bits32 av, bv;
2109
2110     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2111          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2112        ) {
2113         float_raise( float_flag_invalid STATUS_VAR);
2114         return 0;
2115     }
2116     aSign = extractFloat32Sign( a );
2117     bSign = extractFloat32Sign( b );
2118     av = float32_val(a);
2119     bv = float32_val(b);
2120     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2121     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2122
2123 }
2124
2125 /*----------------------------------------------------------------------------
2126 | Returns 1 if the single-precision floating-point value `a' is equal to
2127 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2128 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2129 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2130 *----------------------------------------------------------------------------*/
2131
2132 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2133 {
2134     bits32 av, bv;
2135
2136     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2137          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2138        ) {
2139         float_raise( float_flag_invalid STATUS_VAR);
2140         return 0;
2141     }
2142     av = float32_val(a);
2143     bv = float32_val(b);
2144     return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2145
2146 }
2147
2148 /*----------------------------------------------------------------------------
2149 | Returns 1 if the single-precision floating-point value `a' is less than or
2150 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2151 | cause an exception.  Otherwise, the comparison is performed according to the
2152 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2153 *----------------------------------------------------------------------------*/
2154
2155 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2156 {
2157     flag aSign, bSign;
2158     bits32 av, bv;
2159
2160     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2161          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2162        ) {
2163         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2164             float_raise( float_flag_invalid STATUS_VAR);
2165         }
2166         return 0;
2167     }
2168     aSign = extractFloat32Sign( a );
2169     bSign = extractFloat32Sign( b );
2170     av = float32_val(a);
2171     bv = float32_val(b);
2172     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2173     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2174
2175 }
2176
2177 /*----------------------------------------------------------------------------
2178 | Returns 1 if the single-precision floating-point value `a' is less than
2179 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2180 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2181 | Standard for Binary Floating-Point Arithmetic.
2182 *----------------------------------------------------------------------------*/
2183
2184 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2185 {
2186     flag aSign, bSign;
2187     bits32 av, bv;
2188
2189     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2190          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2191        ) {
2192         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2193             float_raise( float_flag_invalid STATUS_VAR);
2194         }
2195         return 0;
2196     }
2197     aSign = extractFloat32Sign( a );
2198     bSign = extractFloat32Sign( b );
2199     av = float32_val(a);
2200     bv = float32_val(b);
2201     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2202     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2203
2204 }
2205
2206 /*----------------------------------------------------------------------------
2207 | Returns the result of converting the double-precision floating-point value
2208 | `a' to the 32-bit two's complement integer format.  The conversion is
2209 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2210 | Arithmetic---which means in particular that the conversion is rounded
2211 | according to the current rounding mode.  If `a' is a NaN, the largest
2212 | positive integer is returned.  Otherwise, if the conversion overflows, the
2213 | largest integer with the same sign as `a' is returned.
2214 *----------------------------------------------------------------------------*/
2215
2216 int32 float64_to_int32( float64 a STATUS_PARAM )
2217 {
2218     flag aSign;
2219     int16 aExp, shiftCount;
2220     bits64 aSig;
2221
2222     aSig = extractFloat64Frac( a );
2223     aExp = extractFloat64Exp( a );
2224     aSign = extractFloat64Sign( a );
2225     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2226     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2227     shiftCount = 0x42C - aExp;
2228     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2229     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2230
2231 }
2232
2233 /*----------------------------------------------------------------------------
2234 | Returns the result of converting the double-precision floating-point value
2235 | `a' to the 32-bit two's complement integer format.  The conversion is
2236 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2237 | Arithmetic, except that the conversion is always rounded toward zero.
2238 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2239 | the conversion overflows, the largest integer with the same sign as `a' is
2240 | returned.
2241 *----------------------------------------------------------------------------*/
2242
2243 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2244 {
2245     flag aSign;
2246     int16 aExp, shiftCount;
2247     bits64 aSig, savedASig;
2248     int32 z;
2249
2250     aSig = extractFloat64Frac( a );
2251     aExp = extractFloat64Exp( a );
2252     aSign = extractFloat64Sign( a );
2253     if ( 0x41E < aExp ) {
2254         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2255         goto invalid;
2256     }
2257     else if ( aExp < 0x3FF ) {
2258         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2259         return 0;
2260     }
2261     aSig |= LIT64( 0x0010000000000000 );
2262     shiftCount = 0x433 - aExp;
2263     savedASig = aSig;
2264     aSig >>= shiftCount;
2265     z = aSig;
2266     if ( aSign ) z = - z;
2267     if ( ( z < 0 ) ^ aSign ) {
2268  invalid:
2269         float_raise( float_flag_invalid STATUS_VAR);
2270         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2271     }
2272     if ( ( aSig<<shiftCount ) != savedASig ) {
2273         STATUS(float_exception_flags) |= float_flag_inexact;
2274     }
2275     return z;
2276
2277 }
2278
2279 /*----------------------------------------------------------------------------
2280 | Returns the result of converting the double-precision floating-point value
2281 | `a' to the 64-bit two's complement integer format.  The conversion is
2282 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2283 | Arithmetic---which means in particular that the conversion is rounded
2284 | according to the current rounding mode.  If `a' is a NaN, the largest
2285 | positive integer is returned.  Otherwise, if the conversion overflows, the
2286 | largest integer with the same sign as `a' is returned.
2287 *----------------------------------------------------------------------------*/
2288
2289 int64 float64_to_int64( float64 a STATUS_PARAM )
2290 {
2291     flag aSign;
2292     int16 aExp, shiftCount;
2293     bits64 aSig, aSigExtra;
2294
2295     aSig = extractFloat64Frac( a );
2296     aExp = extractFloat64Exp( a );
2297     aSign = extractFloat64Sign( a );
2298     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2299     shiftCount = 0x433 - aExp;
2300     if ( shiftCount <= 0 ) {
2301         if ( 0x43E < aExp ) {
2302             float_raise( float_flag_invalid STATUS_VAR);
2303             if (    ! aSign
2304                  || (    ( aExp == 0x7FF )
2305                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2306                ) {
2307                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2308             }
2309             return (sbits64) LIT64( 0x8000000000000000 );
2310         }
2311         aSigExtra = 0;
2312         aSig <<= - shiftCount;
2313     }
2314     else {
2315         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2316     }
2317     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2318
2319 }
2320
2321 /*----------------------------------------------------------------------------
2322 | Returns the result of converting the double-precision floating-point value
2323 | `a' to the 64-bit two's complement integer format.  The conversion is
2324 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2325 | Arithmetic, except that the conversion is always rounded toward zero.
2326 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2327 | the conversion overflows, the largest integer with the same sign as `a' is
2328 | returned.
2329 *----------------------------------------------------------------------------*/
2330
2331 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2332 {
2333     flag aSign;
2334     int16 aExp, shiftCount;
2335     bits64 aSig;
2336     int64 z;
2337
2338     aSig = extractFloat64Frac( a );
2339     aExp = extractFloat64Exp( a );
2340     aSign = extractFloat64Sign( a );
2341     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2342     shiftCount = aExp - 0x433;
2343     if ( 0 <= shiftCount ) {
2344         if ( 0x43E <= aExp ) {
2345             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2346                 float_raise( float_flag_invalid STATUS_VAR);
2347                 if (    ! aSign
2348                      || (    ( aExp == 0x7FF )
2349                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2350                    ) {
2351                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2352                 }
2353             }
2354             return (sbits64) LIT64( 0x8000000000000000 );
2355         }
2356         z = aSig<<shiftCount;
2357     }
2358     else {
2359         if ( aExp < 0x3FE ) {
2360             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2361             return 0;
2362         }
2363         z = aSig>>( - shiftCount );
2364         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2365             STATUS(float_exception_flags) |= float_flag_inexact;
2366         }
2367     }
2368     if ( aSign ) z = - z;
2369     return z;
2370
2371 }
2372
2373 /*----------------------------------------------------------------------------
2374 | Returns the result of converting the double-precision floating-point value
2375 | `a' to the single-precision floating-point format.  The conversion is
2376 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2377 | Arithmetic.
2378 *----------------------------------------------------------------------------*/
2379
2380 float32 float64_to_float32( float64 a STATUS_PARAM )
2381 {
2382     flag aSign;
2383     int16 aExp;
2384     bits64 aSig;
2385     bits32 zSig;
2386
2387     aSig = extractFloat64Frac( a );
2388     aExp = extractFloat64Exp( a );
2389     aSign = extractFloat64Sign( a );
2390     if ( aExp == 0x7FF ) {
2391         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2392         return packFloat32( aSign, 0xFF, 0 );
2393     }
2394     shift64RightJamming( aSig, 22, &aSig );
2395     zSig = aSig;
2396     if ( aExp || zSig ) {
2397         zSig |= 0x40000000;
2398         aExp -= 0x381;
2399     }
2400     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2401
2402 }
2403
#ifdef FLOATX80

/*----------------------------------------------------------------------------
| Widens the double-precision floating-point value `a' to the extended
| double-precision format.  NaNs go through the common-NaN representation;
| subnormal inputs are normalized first.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
{
    flag negative = extractFloat64Sign(a);
    int16 biasedExp = extractFloat64Exp(a);
    bits64 frac = extractFloat64Frac(a);

    if (biasedExp == 0x7FF) {
        if (frac == 0) {
            /* Infinity: maximum exponent with only the explicit integer bit. */
            return packFloatx80(negative, 0x7FFF, LIT64( 0x8000000000000000 ));
        }
        return commonNaNToFloatx80(float64ToCommonNaN(a STATUS_VAR));
    }

    if (biasedExp == 0) {
        if (frac == 0) {
            return packFloatx80(negative, 0, 0);
        }
        normalizeFloat64Subnormal(frac, &biasedExp, &frac);
    }

    /* Re-bias by 0x3C00 (0x3FFF - 0x3FF) and move the 53-bit significand,
       leading 1 included, to the top of the 64-bit field. */
    frac = (frac | LIT64( 0x0010000000000000 )) << 11;
    return packFloatx80(negative, biasedExp + 0x3C00, frac);
}

#endif
2437
#ifdef FLOAT128

/*----------------------------------------------------------------------------
| Widens the double-precision floating-point value `a' to the quadruple-
| precision format.  NaNs go through the common-NaN representation;
| subnormal inputs are normalized first.
*----------------------------------------------------------------------------*/

float128 float64_to_float128( float64 a STATUS_PARAM )
{
    flag negative = extractFloat64Sign(a);
    int16 biasedExp = extractFloat64Exp(a);
    bits64 frac = extractFloat64Frac(a);
    bits64 fracHi, fracLo;

    if (biasedExp == 0x7FF) {
        if (frac == 0) {
            return packFloat128(negative, 0x7FFF, 0, 0);
        }
        return commonNaNToFloat128(float64ToCommonNaN(a STATUS_VAR));
    }

    if (biasedExp == 0) {
        if (frac == 0) {
            return packFloat128(negative, 0, 0, 0);
        }
        normalizeFloat64Subnormal(frac, &biasedExp, &frac);
        /* The leading 1 left in `frac' is not masked off below, so it will be
           folded into the exponent field by packFloat128; compensate here. */
        --biasedExp;
    }

    /* Spread the fraction across the 128-bit pair, shifted right by 4. */
    shift128Right(frac, 0, 4, &fracHi, &fracLo);
    return packFloat128(negative, biasedExp + 0x3C00, fracHi, fracLo);
}

#endif
2471
2472 /*----------------------------------------------------------------------------
2473 | Rounds the double-precision floating-point value `a' to an integer, and
2474 | returns the result as a double-precision floating-point value.  The
2475 | operation is performed according to the IEC/IEEE Standard for Binary
2476 | Floating-Point Arithmetic.
2477 *----------------------------------------------------------------------------*/
2478
2479 float64 float64_round_to_int( float64 a STATUS_PARAM )
2480 {
2481     flag aSign;
2482     int16 aExp;
2483     bits64 lastBitMask, roundBitsMask;
2484     int8 roundingMode;
2485     bits64 z;
2486
2487     aExp = extractFloat64Exp( a );
2488     if ( 0x433 <= aExp ) {
2489         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2490             return propagateFloat64NaN( a, a STATUS_VAR );
2491         }
2492         return a;
2493     }
2494     if ( aExp < 0x3FF ) {
2495         if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2496         STATUS(float_exception_flags) |= float_flag_inexact;
2497         aSign = extractFloat64Sign( a );
2498         switch ( STATUS(float_rounding_mode) ) {
2499          case float_round_nearest_even:
2500             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2501                 return packFloat64( aSign, 0x3FF, 0 );
2502             }
2503             break;
2504          case float_round_down:
2505             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2506          case float_round_up:
2507             return make_float64(
2508             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2509         }
2510         return packFloat64( aSign, 0, 0 );
2511     }
2512     lastBitMask = 1;
2513     lastBitMask <<= 0x433 - aExp;
2514     roundBitsMask = lastBitMask - 1;
2515     z = float64_val(a);
2516     roundingMode = STATUS(float_rounding_mode);
2517     if ( roundingMode == float_round_nearest_even ) {
2518         z += lastBitMask>>1;
2519         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2520     }
2521     else if ( roundingMode != float_round_to_zero ) {
2522         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2523             z += roundBitsMask;
2524         }
2525     }
2526     z &= ~ roundBitsMask;
2527     if ( z != float64_val(a) )
2528         STATUS(float_exception_flags) |= float_flag_inexact;
2529     return make_float64(z);
2530
2531 }
2532
2533 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2534 {
2535     int oldmode;
2536     float64 res;
2537     oldmode = STATUS(float_rounding_mode);
2538     STATUS(float_rounding_mode) = float_round_to_zero;
2539     res = float64_round_to_int(a STATUS_VAR);
2540     STATUS(float_rounding_mode) = oldmode;
2541     return res;
2542 }
2543
2544 /*----------------------------------------------------------------------------
2545 | Returns the result of adding the absolute values of the double-precision
2546 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2547 | before being returned.  `zSign' is ignored if the result is a NaN.
2548 | The addition is performed according to the IEC/IEEE Standard for Binary
2549 | Floating-Point Arithmetic.
2550 *----------------------------------------------------------------------------*/
2551
2552 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2553 {
2554     int16 aExp, bExp, zExp;
2555     bits64 aSig, bSig, zSig;
2556     int16 expDiff;
2557
2558     aSig = extractFloat64Frac( a );
2559     aExp = extractFloat64Exp( a );
2560     bSig = extractFloat64Frac( b );
2561     bExp = extractFloat64Exp( b );
2562     expDiff = aExp - bExp;
2563     aSig <<= 9;
2564     bSig <<= 9;
2565     if ( 0 < expDiff ) {
2566         if ( aExp == 0x7FF ) {
2567             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2568             return a;
2569         }
2570         if ( bExp == 0 ) {
2571             --expDiff;
2572         }
2573         else {
2574             bSig |= LIT64( 0x2000000000000000 );
2575         }
2576         shift64RightJamming( bSig, expDiff, &bSig );
2577         zExp = aExp;
2578     }
2579     else if ( expDiff < 0 ) {
2580         if ( bExp == 0x7FF ) {
2581             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2582             return packFloat64( zSign, 0x7FF, 0 );
2583         }
2584         if ( aExp == 0 ) {
2585             ++expDiff;
2586         }
2587         else {
2588             aSig |= LIT64( 0x2000000000000000 );
2589         }
2590         shift64RightJamming( aSig, - expDiff, &aSig );
2591         zExp = bExp;
2592     }
2593     else {
2594         if ( aExp == 0x7FF ) {
2595             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2596             return a;
2597         }
2598         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2599         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2600         zExp = aExp;
2601         goto roundAndPack;
2602     }
2603     aSig |= LIT64( 0x2000000000000000 );
2604     zSig = ( aSig + bSig )<<1;
2605     --zExp;
2606     if ( (sbits64) zSig < 0 ) {
2607         zSig = aSig + bSig;
2608         ++zExp;
2609     }
2610  roundAndPack:
2611     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2612
2613 }
2614
2615 /*----------------------------------------------------------------------------
2616 | Returns the result of subtracting the absolute values of the double-
2617 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2618 | difference is negated before being returned.  `zSign' is ignored if the
2619 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2620 | Standard for Binary Floating-Point Arithmetic.
2621 *----------------------------------------------------------------------------*/
2622
2623 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2624 {
2625     int16 aExp, bExp, zExp;
2626     bits64 aSig, bSig, zSig;
2627     int16 expDiff;
2628
2629     aSig = extractFloat64Frac( a );
2630     aExp = extractFloat64Exp( a );
2631     bSig = extractFloat64Frac( b );
2632     bExp = extractFloat64Exp( b );
2633     expDiff = aExp - bExp;
2634     aSig <<= 10;
2635     bSig <<= 10;
2636     if ( 0 < expDiff ) goto aExpBigger;
2637     if ( expDiff < 0 ) goto bExpBigger;
2638     if ( aExp == 0x7FF ) {
2639         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2640         float_raise( float_flag_invalid STATUS_VAR);
2641         return float64_default_nan;
2642     }
2643     if ( aExp == 0 ) {
2644         aExp = 1;
2645         bExp = 1;
2646     }
2647     if ( bSig < aSig ) goto aBigger;
2648     if ( aSig < bSig ) goto bBigger;
2649     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2650  bExpBigger:
2651     if ( bExp == 0x7FF ) {
2652         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2653         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2654     }
2655     if ( aExp == 0 ) {
2656         ++expDiff;
2657     }
2658     else {
2659         aSig |= LIT64( 0x4000000000000000 );
2660     }
2661     shift64RightJamming( aSig, - expDiff, &aSig );
2662     bSig |= LIT64( 0x4000000000000000 );
2663  bBigger:
2664     zSig = bSig - aSig;
2665     zExp = bExp;
2666     zSign ^= 1;
2667     goto normalizeRoundAndPack;
2668  aExpBigger:
2669     if ( aExp == 0x7FF ) {
2670         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2671         return a;
2672     }
2673     if ( bExp == 0 ) {
2674         --expDiff;
2675     }
2676     else {
2677         bSig |= LIT64( 0x4000000000000000 );
2678     }
2679     shift64RightJamming( bSig, expDiff, &bSig );
2680     aSig |= LIT64( 0x4000000000000000 );
2681  aBigger:
2682     zSig = aSig - bSig;
2683     zExp = aExp;
2684  normalizeRoundAndPack:
2685     --zExp;
2686     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2687
2688 }
2689
2690 /*----------------------------------------------------------------------------
2691 | Returns the result of adding the double-precision floating-point values `a'
2692 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2693 | Binary Floating-Point Arithmetic.
2694 *----------------------------------------------------------------------------*/
2695
2696 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2697 {
2698     flag aSign, bSign;
2699
2700     aSign = extractFloat64Sign( a );
2701     bSign = extractFloat64Sign( b );
2702     if ( aSign == bSign ) {
2703         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2704     }
2705     else {
2706         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2707     }
2708
2709 }
2710
2711 /*----------------------------------------------------------------------------
2712 | Returns the result of subtracting the double-precision floating-point values
2713 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2714 | for Binary Floating-Point Arithmetic.
2715 *----------------------------------------------------------------------------*/
2716
2717 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2718 {
2719     flag aSign, bSign;
2720
2721     aSign = extractFloat64Sign( a );
2722     bSign = extractFloat64Sign( b );
2723     if ( aSign == bSign ) {
2724         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2725     }
2726     else {
2727         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2728     }
2729
2730 }
2731
2732 /*----------------------------------------------------------------------------
2733 | Returns the result of multiplying the double-precision floating-point values
2734 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2735 | for Binary Floating-Point Arithmetic.
2736 *----------------------------------------------------------------------------*/
2737
2738 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2739 {
2740     flag aSign, bSign, zSign;
2741     int16 aExp, bExp, zExp;
2742     bits64 aSig, bSig, zSig0, zSig1;
2743
2744     aSig = extractFloat64Frac( a );
2745     aExp = extractFloat64Exp( a );
2746     aSign = extractFloat64Sign( a );
2747     bSig = extractFloat64Frac( b );
2748     bExp = extractFloat64Exp( b );
2749     bSign = extractFloat64Sign( b );
2750     zSign = aSign ^ bSign;
2751     if ( aExp == 0x7FF ) {
2752         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2753             return propagateFloat64NaN( a, b STATUS_VAR );
2754         }
2755         if ( ( bExp | bSig ) == 0 ) {
2756             float_raise( float_flag_invalid STATUS_VAR);
2757             return float64_default_nan;
2758         }
2759         return packFloat64( zSign, 0x7FF, 0 );
2760     }
2761     if ( bExp == 0x7FF ) {
2762         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2763         if ( ( aExp | aSig ) == 0 ) {
2764             float_raise( float_flag_invalid STATUS_VAR);
2765             return float64_default_nan;
2766         }
2767         return packFloat64( zSign, 0x7FF, 0 );
2768     }
2769     if ( aExp == 0 ) {
2770         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2771         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2772     }
2773     if ( bExp == 0 ) {
2774         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2775         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2776     }
2777     zExp = aExp + bExp - 0x3FF;
2778     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2779     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2780     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2781     zSig0 |= ( zSig1 != 0 );
2782     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2783         zSig0 <<= 1;
2784         --zExp;
2785     }
2786     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2787
2788 }
2789
2790 /*----------------------------------------------------------------------------
2791 | Returns the result of dividing the double-precision floating-point value `a'
2792 | by the corresponding value `b'.  The operation is performed according to
2793 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2794 *----------------------------------------------------------------------------*/
2795
2796 float64 float64_div( float64 a, float64 b STATUS_PARAM )
2797 {
2798     flag aSign, bSign, zSign;
2799     int16 aExp, bExp, zExp;
2800     bits64 aSig, bSig, zSig;
2801     bits64 rem0, rem1;
2802     bits64 term0, term1;
2803
2804     aSig = extractFloat64Frac( a );
2805     aExp = extractFloat64Exp( a );
2806     aSign = extractFloat64Sign( a );
2807     bSig = extractFloat64Frac( b );
2808     bExp = extractFloat64Exp( b );
2809     bSign = extractFloat64Sign( b );
2810     zSign = aSign ^ bSign;
2811     if ( aExp == 0x7FF ) {
2812         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2813         if ( bExp == 0x7FF ) {
2814             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2815             float_raise( float_flag_invalid STATUS_VAR);
2816             return float64_default_nan;
2817         }
2818         return packFloat64( zSign, 0x7FF, 0 );
2819     }
2820     if ( bExp == 0x7FF ) {
2821         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2822         return packFloat64( zSign, 0, 0 );
2823     }
2824     if ( bExp == 0 ) {
2825         if ( bSig == 0 ) {
2826             if ( ( aExp | aSig ) == 0 ) {
2827                 float_raise( float_flag_invalid STATUS_VAR);
2828                 return float64_default_nan;
2829             }
2830             float_raise( float_flag_divbyzero STATUS_VAR);
2831             return packFloat64( zSign, 0x7FF, 0 );
2832         }
2833         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2834     }
2835     if ( aExp == 0 ) {
2836         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2837         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2838     }
2839     zExp = aExp - bExp + 0x3FD;
2840     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2841     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2842     if ( bSig <= ( aSig + aSig ) ) {
2843         aSig >>= 1;
2844         ++zExp;
2845     }
2846     zSig = estimateDiv128To64( aSig, 0, bSig );
2847     if ( ( zSig & 0x1FF ) <= 2 ) {
2848         mul64To128( bSig, zSig, &term0, &term1 );
2849         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2850         while ( (sbits64) rem0 < 0 ) {
2851             --zSig;
2852             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2853         }
2854         zSig |= ( rem1 != 0 );
2855     }
2856     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2857
2858 }
2859
2860 /*----------------------------------------------------------------------------
2861 | Returns the remainder of the double-precision floating-point value `a'
2862 | with respect to the corresponding value `b'.  The operation is performed
2863 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864 *----------------------------------------------------------------------------*/
2865
2866 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2867 {
2868     flag aSign, bSign, zSign;
2869     int16 aExp, bExp, expDiff;
2870     bits64 aSig, bSig;
2871     bits64 q, alternateASig;
2872     sbits64 sigMean;
2873
2874     aSig = extractFloat64Frac( a );
2875     aExp = extractFloat64Exp( a );
2876     aSign = extractFloat64Sign( a );
2877     bSig = extractFloat64Frac( b );
2878     bExp = extractFloat64Exp( b );
2879     bSign = extractFloat64Sign( b );
2880     if ( aExp == 0x7FF ) {
2881         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2882             return propagateFloat64NaN( a, b STATUS_VAR );
2883         }
2884         float_raise( float_flag_invalid STATUS_VAR);
2885         return float64_default_nan;
2886     }
2887     if ( bExp == 0x7FF ) {
2888         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2889         return a;
2890     }
2891     if ( bExp == 0 ) {
2892         if ( bSig == 0 ) {
2893             float_raise( float_flag_invalid STATUS_VAR);
2894             return float64_default_nan;
2895         }
2896         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2897     }
2898     if ( aExp == 0 ) {
2899         if ( aSig == 0 ) return a;
2900         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2901     }
2902     expDiff = aExp - bExp;
2903     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2904     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2905     if ( expDiff < 0 ) {
2906         if ( expDiff < -1 ) return a;
2907         aSig >>= 1;
2908     }
2909     q = ( bSig <= aSig );
2910     if ( q ) aSig -= bSig;
2911     expDiff -= 64;
2912     while ( 0 < expDiff ) {
2913         q = estimateDiv128To64( aSig, 0, bSig );
2914         q = ( 2 < q ) ? q - 2 : 0;
2915         aSig = - ( ( bSig>>2 ) * q );
2916         expDiff -= 62;
2917     }
2918     expDiff += 64;
2919     if ( 0 < expDiff ) {
2920         q = estimateDiv128To64( aSig, 0, bSig );
2921         q = ( 2 < q ) ? q - 2 : 0;
2922         q >>= 64 - expDiff;
2923         bSig >>= 2;
2924         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2925     }
2926     else {
2927         aSig >>= 2;
2928         bSig >>= 2;
2929     }
2930     do {
2931         alternateASig = aSig;
2932         ++q;
2933         aSig -= bSig;
2934     } while ( 0 <= (sbits64) aSig );
2935     sigMean = aSig + alternateASig;
2936     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2937         aSig = alternateASig;
2938     }
2939     zSign = ( (sbits64) aSig < 0 );
2940     if ( zSign ) aSig = - aSig;
2941     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2942
2943 }
2944
2945 /*----------------------------------------------------------------------------
2946 | Returns the square root of the double-precision floating-point value `a'.
2947 | The operation is performed according to the IEC/IEEE Standard for Binary
2948 | Floating-Point Arithmetic.
2949 *----------------------------------------------------------------------------*/
2950
2951 float64 float64_sqrt( float64 a STATUS_PARAM )
2952 {
2953     flag aSign;
2954     int16 aExp, zExp;
2955     bits64 aSig, zSig, doubleZSig;
2956     bits64 rem0, rem1, term0, term1;
2957
2958     aSig = extractFloat64Frac( a );
2959     aExp = extractFloat64Exp( a );
2960     aSign = extractFloat64Sign( a );
2961     if ( aExp == 0x7FF ) {
2962         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2963         if ( ! aSign ) return a;
2964         float_raise( float_flag_invalid STATUS_VAR);
2965         return float64_default_nan;
2966     }
2967     if ( aSign ) {
2968         if ( ( aExp | aSig ) == 0 ) return a;
2969         float_raise( float_flag_invalid STATUS_VAR);
2970         return float64_default_nan;
2971     }
2972     if ( aExp == 0 ) {
2973         if ( aSig == 0 ) return float64_zero;
2974         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2975     }
2976     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2977     aSig |= LIT64( 0x0010000000000000 );
2978     zSig = estimateSqrt32( aExp, aSig>>21 );
2979     aSig <<= 9 - ( aExp & 1 );
2980     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2981     if ( ( zSig & 0x1FF ) <= 5 ) {
2982         doubleZSig = zSig<<1;
2983         mul64To128( zSig, zSig, &term0, &term1 );
2984         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2985         while ( (sbits64) rem0 < 0 ) {
2986             --zSig;
2987             doubleZSig -= 2;
2988             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2989         }
2990         zSig |= ( ( rem0 | rem1 ) != 0 );
2991     }
2992     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2993
2994 }
2995
2996 /*----------------------------------------------------------------------------
2997 | Returns 1 if the double-precision floating-point value `a' is equal to the
2998 | corresponding value `b', and 0 otherwise.  The comparison is performed
2999 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3000 *----------------------------------------------------------------------------*/
3001
3002 int float64_eq( float64 a, float64 b STATUS_PARAM )
3003 {
3004     bits64 av, bv;
3005
3006     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3007          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3008        ) {
3009         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3010             float_raise( float_flag_invalid STATUS_VAR);
3011         }
3012         return 0;
3013     }
3014     av = float64_val(a);
3015     bv = float64_val(b);
3016     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3017
3018 }
3019
3020 /*----------------------------------------------------------------------------
3021 | Returns 1 if the double-precision floating-point value `a' is less than or
3022 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3023 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3024 | Arithmetic.
3025 *----------------------------------------------------------------------------*/
3026
3027 int float64_le( float64 a, float64 b STATUS_PARAM )
3028 {
3029     flag aSign, bSign;
3030     bits64 av, bv;
3031
3032     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3033          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3034        ) {
3035         float_raise( float_flag_invalid STATUS_VAR);
3036         return 0;
3037     }
3038     aSign = extractFloat64Sign( a );
3039     bSign = extractFloat64Sign( b );
3040     av = float64_val(a);
3041     bv = float64_val(b);
3042     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3043     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3044
3045 }
3046
3047 /*----------------------------------------------------------------------------
3048 | Returns 1 if the double-precision floating-point value `a' is less than
3049 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3050 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3051 *----------------------------------------------------------------------------*/
3052
3053 int float64_lt( float64 a, float64 b STATUS_PARAM )
3054 {
3055     flag aSign, bSign;
3056     bits64 av, bv;
3057
3058     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3059          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3060        ) {
3061         float_raise( float_flag_invalid STATUS_VAR);
3062         return 0;
3063     }
3064     aSign = extractFloat64Sign( a );
3065     bSign = extractFloat64Sign( b );
3066     av = float64_val(a);
3067     bv = float64_val(b);
3068     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3069     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3070
3071 }
3072
3073 /*----------------------------------------------------------------------------
3074 | Returns 1 if the double-precision floating-point value `a' is equal to the
3075 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3076 | if either operand is a NaN.  Otherwise, the comparison is performed
3077 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3078 *----------------------------------------------------------------------------*/
3079
3080 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3081 {
3082     bits64 av, bv;
3083
3084     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3085          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3086        ) {
3087         float_raise( float_flag_invalid STATUS_VAR);
3088         return 0;
3089     }
3090     av = float64_val(a);
3091     bv = float64_val(b);
3092     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3093
3094 }
3095
3096 /*----------------------------------------------------------------------------
3097 | Returns 1 if the double-precision floating-point value `a' is less than or
3098 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3099 | cause an exception.  Otherwise, the comparison is performed according to the
3100 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3101 *----------------------------------------------------------------------------*/
3102
3103 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3104 {
3105     flag aSign, bSign;
3106     bits64 av, bv;
3107
3108     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3109          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3110        ) {
3111         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3112             float_raise( float_flag_invalid STATUS_VAR);
3113         }
3114         return 0;
3115     }
3116     aSign = extractFloat64Sign( a );
3117     bSign = extractFloat64Sign( b );
3118     av = float64_val(a);
3119     bv = float64_val(b);
3120     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3121     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3122
3123 }
3124
3125 /*----------------------------------------------------------------------------
3126 | Returns 1 if the double-precision floating-point value `a' is less than
3127 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3128 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3129 | Standard for Binary Floating-Point Arithmetic.
3130 *----------------------------------------------------------------------------*/
3131
3132 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3133 {
3134     flag aSign, bSign;
3135     bits64 av, bv;
3136
3137     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3138          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3139        ) {
3140         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3141             float_raise( float_flag_invalid STATUS_VAR);
3142         }
3143         return 0;
3144     }
3145     aSign = extractFloat64Sign( a );
3146     bSign = extractFloat64Sign( b );
3147     av = float64_val(a);
3148     bv = float64_val(b);
3149     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3150     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3151
3152 }
3153
3154 #ifdef FLOATX80
3155
3156 /*----------------------------------------------------------------------------
3157 | Returns the result of converting the extended double-precision floating-
3158 | point value `a' to the 32-bit two's complement integer format.  The
3159 | conversion is performed according to the IEC/IEEE Standard for Binary
3160 | Floating-Point Arithmetic---which means in particular that the conversion
3161 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3162 | largest positive integer is returned.  Otherwise, if the conversion
3163 | overflows, the largest integer with the same sign as `a' is returned.
3164 *----------------------------------------------------------------------------*/
3165
3166 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3167 {
3168     flag aSign;
3169     int32 aExp, shiftCount;
3170     bits64 aSig;
3171
3172     aSig = extractFloatx80Frac( a );
3173     aExp = extractFloatx80Exp( a );
3174     aSign = extractFloatx80Sign( a );
3175     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3176     shiftCount = 0x4037 - aExp;
3177     if ( shiftCount <= 0 ) shiftCount = 1;
3178     shift64RightJamming( aSig, shiftCount, &aSig );
3179     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3180
3181 }
3182
3183 /*----------------------------------------------------------------------------
3184 | Returns the result of converting the extended double-precision floating-
3185 | point value `a' to the 32-bit two's complement integer format.  The
3186 | conversion is performed according to the IEC/IEEE Standard for Binary
3187 | Floating-Point Arithmetic, except that the conversion is always rounded
3188 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3189 | Otherwise, if the conversion overflows, the largest integer with the same
3190 | sign as `a' is returned.
     | The invalid flag is raised on the NaN/overflow path; the inexact flag is
     | set when nonzero bits of `a' are discarded by the truncation.
3191 *----------------------------------------------------------------------------*/
3192
3193 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3194 {
3195     flag aSign;
3196     int32 aExp, shiftCount;
3197     bits64 aSig, savedASig;
3198     int32 z;
3199
3200     aSig = extractFloatx80Frac( a );
3201     aExp = extractFloatx80Exp( a );
3202     aSign = extractFloatx80Sign( a );
         /* Exponent field above 0x401E:  always handled as invalid.  For a
            NaN (exponent 0x7FFF with nonzero bits below the top significand
            bit) the sign is cleared first so the positive limit is returned. */
3203     if ( 0x401E < aExp ) {
3204         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3205         goto invalid;
3206     }
         /* Exponent field below 0x3FFF:  the result is 0, and it is inexact
            unless `a' is a zero (exponent and significand both 0). */
3207     else if ( aExp < 0x3FFF ) {
3208         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3209         return 0;
3210     }
         /* Here 0x3FFF <= aExp <= 0x401E, so the shift count is in 32..63.
            The unshifted significand is kept to detect discarded bits. */
3211     shiftCount = 0x403E - aExp;
3212     savedASig = aSig;
3213     aSig >>= shiftCount;
3214     z = aSig;
3215     if ( aSign ) z = - z;
         /* A result whose sign disagrees with `aSign' did not fit in int32.
            The `invalid' label is also the target of the goto above. */
3216     if ( ( z < 0 ) ^ aSign ) {
3217  invalid:
3218         float_raise( float_flag_invalid STATUS_VAR);
3219         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3220     }
         /* Shifting back loses exactly the bits that were truncated. */
3221     if ( ( aSig<<shiftCount ) != savedASig ) {
3222         STATUS(float_exception_flags) |= float_flag_inexact;
3223     }
3224     return z;
3225
3226 }
3227
3228 /*----------------------------------------------------------------------------
3229 | Returns the result of converting the extended double-precision floating-
3230 | point value `a' to the 64-bit two's complement integer format.  The
3231 | conversion is performed according to the IEC/IEEE Standard for Binary
3232 | Floating-Point Arithmetic---which means in particular that the conversion
3233 | is rounded according to the current rounding mode.  If `a' is a NaN,
3234 | the largest positive integer is returned.  Otherwise, if the conversion
3235 | overflows, the largest integer with the same sign as `a' is returned.
3236 *----------------------------------------------------------------------------*/
3237
3238 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3239 {
3240     flag aSign;
3241     int32 aExp, shiftCount;
3242     bits64 aSig, aSigExtra;
3243
3244     aSig = extractFloatx80Frac( a );
3245     aExp = extractFloatx80Exp( a );
3246     aSign = extractFloatx80Sign( a );
3247     shiftCount = 0x403E - aExp;
3248     if ( shiftCount <= 0 ) {
3249         if ( shiftCount ) {
3250             float_raise( float_flag_invalid STATUS_VAR);
3251             if (    ! aSign
3252                  || (    ( aExp == 0x7FFF )
3253                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3254                ) {
3255                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3256             }
3257             return (sbits64) LIT64( 0x8000000000000000 );
3258         }
3259         aSigExtra = 0;
3260     }
3261     else {
3262         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3263     }
3264     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3265
3266 }
3267
3268 /*----------------------------------------------------------------------------
3269 | Returns the result of converting the extended double-precision floating-
3270 | point value `a' to the 64-bit two's complement integer format.  The
3271 | conversion is performed according to the IEC/IEEE Standard for Binary
3272 | Floating-Point Arithmetic, except that the conversion is always rounded
3273 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3274 | Otherwise, if the conversion overflows, the largest integer with the same
3275 | sign as `a' is returned.
     | The invalid flag is raised on the NaN/overflow path; the inexact flag is
     | set when nonzero bits of `a' are discarded by the truncation.
3276 *----------------------------------------------------------------------------*/
3277
3278 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3279 {
3280     flag aSign;
3281     int32 aExp, shiftCount;
3282     bits64 aSig;
3283     int64 z;
3284
3285     aSig = extractFloatx80Frac( a );
3286     aExp = extractFloatx80Exp( a );
3287     aSign = extractFloatx80Sign( a );
         /* Note the sign convention:  shiftCount is negative for values that
            need a right shift and non-negative for exponents >= 0x403E. */
3288     shiftCount = aExp - 0x403E;
3289     if ( 0 <= shiftCount ) {
             /* Drop the top significand bit so only the fraction remains. */
3290         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
             /* Everything except the encoding with high word 0xC03E and zero
                fraction raises invalid.  That one encoding falls through and
                returns the most negative integer without raising any flag. */
3291         if ( ( a.high != 0xC03E ) || aSig ) {
3292             float_raise( float_flag_invalid STATUS_VAR);
                 /* Positive values and NaNs saturate to the positive limit. */
3293             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3294                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3295             }
3296         }
3297         return (sbits64) LIT64( 0x8000000000000000 );
3298     }
         /* Exponent field below 0x3FFF:  the result is 0, and it is inexact
            unless both the exponent and the significand are 0. */
3299     else if ( aExp < 0x3FFF ) {
3300         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3301         return 0;
3302     }
         /* Here shiftCount is in -63..-1.  The right shift keeps the integer
            part; the left shift by ( shiftCount & 63 ) isolates the bits that
            the right shift discarded. */
3303     z = aSig>>( - shiftCount );
3304     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3305         STATUS(float_exception_flags) |= float_flag_inexact;
3306     }
3307     if ( aSign ) z = - z;
3308     return z;
3309
3310 }
3311
3312 /*----------------------------------------------------------------------------
3313 | Returns the result of converting the extended double-precision floating-
3314 | point value `a' to the single-precision floating-point format.  The
3315 | conversion is performed according to the IEC/IEEE Standard for Binary
3316 | Floating-Point Arithmetic.
3317 *----------------------------------------------------------------------------*/
3318
3319 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3320 {
3321     flag aSign;
3322     int32 aExp;
3323     bits64 aSig;
3324
3325     aSig = extractFloatx80Frac( a );
3326     aExp = extractFloatx80Exp( a );
3327     aSign = extractFloatx80Sign( a );
3328     if ( aExp == 0x7FFF ) {
3329         if ( (bits64) ( aSig<<1 ) ) {
3330             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3331         }
3332         return packFloat32( aSign, 0xFF, 0 );
3333     }
3334     shift64RightJamming( aSig, 33, &aSig );
3335     if ( aExp || aSig ) aExp -= 0x3F81;
3336     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3337
3338 }
3339
3340 /*----------------------------------------------------------------------------
3341 | Returns the result of converting the extended double-precision floating-
3342 | point value `a' to the double-precision floating-point format.  The
3343 | conversion is performed according to the IEC/IEEE Standard for Binary
3344 | Floating-Point Arithmetic.
3345 *----------------------------------------------------------------------------*/
3346
3347 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3348 {
3349     flag aSign;
3350     int32 aExp;
3351     bits64 aSig, zSig;
3352
3353     aSig = extractFloatx80Frac( a );
3354     aExp = extractFloatx80Exp( a );
3355     aSign = extractFloatx80Sign( a );
3356     if ( aExp == 0x7FFF ) {
3357         if ( (bits64) ( aSig<<1 ) ) {
3358             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3359         }
3360         return packFloat64( aSign, 0x7FF, 0 );
3361     }
3362     shift64RightJamming( aSig, 1, &zSig );
3363     if ( aExp || aSig ) aExp -= 0x3C01;
3364     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3365
3366 }
3367
3368 #ifdef FLOAT128
3369
3370 /*----------------------------------------------------------------------------
3371 | Returns the result of converting the extended double-precision floating-
3372 | point value `a' to the quadruple-precision floating-point format.  The
3373 | conversion is performed according to the IEC/IEEE Standard for Binary
3374 | Floating-Point Arithmetic.
3375 *----------------------------------------------------------------------------*/
3376
3377 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3378 {
3379     flag aSign;
3380     int16 aExp;
3381     bits64 aSig, zSig0, zSig1;
3382
3383     aSig = extractFloatx80Frac( a );
3384     aExp = extractFloatx80Exp( a );
3385     aSign = extractFloatx80Sign( a );
3386     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3387         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3388     }
3389     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3390     return packFloat128( aSign, aExp, zSig0, zSig1 );
3391
3392 }
3393
3394 #endif
3395
3396 /*----------------------------------------------------------------------------
3397 | Rounds the extended double-precision floating-point value `a' to an integer,
3398 | and returns the result as an extended double-precision floating-point
3399 | value.  The operation is performed according to the IEC/IEEE Standard for
3400 | Binary Floating-Point Arithmetic.
     | Rounding follows the current rounding mode, and the inexact flag is set
     | when the returned value differs from `a'.
3401 *----------------------------------------------------------------------------*/
3402
3403 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3404 {
3405     flag aSign;
3406     int32 aExp;
3407     bits64 lastBitMask, roundBitsMask;
3408     int8 roundingMode;
3409     floatx80 z;
3410
3411     aExp = extractFloatx80Exp( a );
         /* Exponent field >= 0x403E:  returned unchanged, except that a NaN
            goes through the usual NaN propagation. */
3412     if ( 0x403E <= aExp ) {
3413         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3414             return propagateFloatx80NaN( a, a STATUS_VAR );
3415         }
3416         return a;
3417     }
         /* Exponent field below 0x3FFF:  the result is a signed zero or a
            signed value with exponent 0x3FFF and significand
            0x8000000000000000 (magnitude one), chosen by the rounding mode. */
3418     if ( aExp < 0x3FFF ) {
             /* Exponent 0 with no bits set below the top significand bit is
                returned as is, without raising inexact. */
3419         if (    ( aExp == 0 )
3420              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3421             return a;
3422         }
3423         STATUS(float_exception_flags) |= float_flag_inexact;
3424         aSign = extractFloatx80Sign( a );
3425         switch ( STATUS(float_rounding_mode) ) {
3426          case float_round_nearest_even:
                 /* Exponent 0x3FFE with nonzero fraction bits rounds to
                    magnitude one; everything else falls out to signed zero. */
3427             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3428                ) {
3429                 return
3430                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3431             }
3432             break;
3433          case float_round_down:
                 /* Negative values go to minus one, positive values to +0. */
3434             return
3435                   aSign ?
3436                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3437                 : packFloatx80( 0, 0, 0 );
3438          case float_round_up:
                 /* Negative values go to -0, positive values to plus one. */
3439             return
3440                   aSign ? packFloatx80( 1, 0, 0 )
3441                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3442         }
3443         return packFloatx80( aSign, 0, 0 );
3444     }
         /* 0x3FFF <= aExp < 0x403E.  lastBitMask selects the significand bit
            at position 0x403E - aExp (the lowest bit kept in the result);
            roundBitsMask covers all bits below it. */
3445     lastBitMask = 1;
3446     lastBitMask <<= 0x403E - aExp;
3447     roundBitsMask = lastBitMask - 1;
3448     z = a;
3449     roundingMode = STATUS(float_rounding_mode);
3450     if ( roundingMode == float_round_nearest_even ) {
             /* Add half of the last kept bit; if the low bits are then all
                zero (an exact tie), clear the last kept bit to make it even. */
3451         z.low += lastBitMask>>1;
3452         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3453     }
3454     else if ( roundingMode != float_round_to_zero ) {
             /* Directed rounding:  bump the magnitude when the sign bit and
                `mode is round-up' differ (negative under round-down, or
                positive under round-up). */
3455         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3456             z.low += roundBitsMask;
3457         }
3458     }
3459     z.low &= ~ roundBitsMask;
         /* A significand of zero here means the addition above wrapped past
            64 bits:  step to the next exponent with significand
            0x8000000000000000. */
3460     if ( z.low == 0 ) {
3461         ++z.high;
3462         z.low = LIT64( 0x8000000000000000 );
3463     }
3464     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3465     return z;
3466
3467 }
3468
3469 /*----------------------------------------------------------------------------
3470 | Returns the result of adding the absolute values of the extended double-
3471 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3472 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3473 | The addition is performed according to the IEC/IEEE Standard for Binary
3474 | Floating-Point Arithmetic.
3475 *----------------------------------------------------------------------------*/
3476
3477 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3478 {
3479     int32 aExp, bExp, zExp;
3480     bits64 aSig, bSig, zSig0, zSig1;
3481     int32 expDiff;
3482
3483     aSig = extractFloatx80Frac( a );
3484     aExp = extractFloatx80Exp( a );
3485     bSig = extractFloatx80Frac( b );
3486     bExp = extractFloatx80Exp( b );
3487     expDiff = aExp - bExp;
3488     if ( 0 < expDiff ) {
3489         if ( aExp == 0x7FFF ) {
3490             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3491             return a;
3492         }
3493         if ( bExp == 0 ) --expDiff;
3494         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3495         zExp = aExp;
3496     }
3497     else if ( expDiff < 0 ) {
3498         if ( bExp == 0x7FFF ) {
3499             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3500             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3501         }
3502         if ( aExp == 0 ) ++expDiff;
3503         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3504         zExp = bExp;
3505     }
3506     else {
3507         if ( aExp == 0x7FFF ) {
3508             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3509                 return propagateFloatx80NaN( a, b STATUS_VAR );
3510             }
3511             return a;
3512         }
3513         zSig1 = 0;
3514         zSig0 = aSig + bSig;
3515         if ( aExp == 0 ) {
3516             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3517             goto roundAndPack;
3518         }
3519         zExp = aExp;
3520         goto shiftRight1;
3521     }
3522     zSig0 = aSig + bSig;
3523     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3524  shiftRight1:
3525     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3526     zSig0 |= LIT64( 0x8000000000000000 );
3527     ++zExp;
3528  roundAndPack:
3529     return
3530         roundAndPackFloatx80(
3531             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3532
3533 }
3534
3535 /*----------------------------------------------------------------------------
3536 | Returns the result of subtracting the absolute values of the extended
3537 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3538 | difference is negated before being returned.  `zSign' is ignored if the
3539 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3540 | Standard for Binary Floating-Point Arithmetic.
3541 *----------------------------------------------------------------------------*/
3542
3543 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3544 {
3545     int32 aExp, bExp, zExp;
3546     bits64 aSig, bSig, zSig0, zSig1;
3547     int32 expDiff;
3548     floatx80 z;
3549
3550     aSig = extractFloatx80Frac( a );
3551     aExp = extractFloatx80Exp( a );
3552     bSig = extractFloatx80Frac( b );
3553     bExp = extractFloatx80Exp( b );
3554     expDiff = aExp - bExp;
3555     if ( 0 < expDiff ) goto aExpBigger;
3556     if ( expDiff < 0 ) goto bExpBigger;
         /* Equal exponents from here on.  With both at 0x7FFF the result is a
            propagated NaN, or (for two infinities) the default NaN with the
            invalid flag raised. */
3557     if ( aExp == 0x7FFF ) {
3558         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3559             return propagateFloatx80NaN( a, b STATUS_VAR );
3560         }
3561         float_raise( float_flag_invalid STATUS_VAR);
3562         z.low = floatx80_default_nan_low;
3563         z.high = floatx80_default_nan_high;
3564         return z;
3565     }
         /* An exponent field of 0 is treated as exponent 1 for the result. */
3566     if ( aExp == 0 ) {
3567         aExp = 1;
3568         bExp = 1;
3569     }
3570     zSig1 = 0;
3571     if ( bSig < aSig ) goto aBigger;
3572     if ( aSig < bSig ) goto bBigger;
         /* Exact cancellation:  the zero is negative only in round-down mode. */
3573     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3574  bExpBigger:
         /* `b' infinite:  the result is an infinity with the sign flipped. */
3575     if ( bExp == 0x7FFF ) {
3576         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3577         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3578     }
         /* Align `a' to `b'; an exponent field of 0 is aligned as if it were 1. */
3579     if ( aExp == 0 ) ++expDiff;
3580     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3581  bBigger:
         /* |b| > |a|:  compute b - a and flip the result sign. */
3582     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3583     zExp = bExp;
3584     zSign ^= 1;
3585     goto normalizeRoundAndPack;
3586  aExpBigger:
         /* `a' infinite (or NaN):  the result is `a' or a propagated NaN. */
3587     if ( aExp == 0x7FFF ) {
3588         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3589         return a;
3590     }
         /* Align `b' to `a', with the same exponent-0 adjustment as above. */
3591     if ( bExp == 0 ) --expDiff;
3592     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3593  aBigger:
         /* |a| > |b|:  compute a - b; the sign stays `zSign'. */
3594     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3595     zExp = aExp;
3596  normalizeRoundAndPack:
3597     return
3598         normalizeRoundAndPackFloatx80(
3599             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3600
3601 }
3602
3603 /*----------------------------------------------------------------------------
3604 | Returns the result of adding the extended double-precision floating-point
3605 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3606 | Standard for Binary Floating-Point Arithmetic.
3607 *----------------------------------------------------------------------------*/
3608
3609 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3610 {
3611     flag aSign, bSign;
3612
3613     aSign = extractFloatx80Sign( a );
3614     bSign = extractFloatx80Sign( b );
3615     if ( aSign == bSign ) {
3616         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3617     }
3618     else {
3619         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3620     }
3621
3622 }
3623
3624 /*----------------------------------------------------------------------------
3625 | Returns the result of subtracting the extended double-precision floating-
3626 | point values `a' and `b'.  The operation is performed according to the
3627 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3628 *----------------------------------------------------------------------------*/
3629
3630 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3631 {
3632     flag aSign, bSign;
3633
3634     aSign = extractFloatx80Sign( a );
3635     bSign = extractFloatx80Sign( b );
3636     if ( aSign == bSign ) {
3637         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3638     }
3639     else {
3640         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3641     }
3642
3643 }
3644
3645 /*----------------------------------------------------------------------------
3646 | Returns the result of multiplying the extended double-precision floating-
3647 | point values `a' and `b'.  The operation is performed according to the
3648 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3649 *----------------------------------------------------------------------------*/
3650
3651 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3652 {
3653     flag aSign, bSign, zSign;
3654     int32 aExp, bExp, zExp;
3655     bits64 aSig, bSig, zSig0, zSig1;
3656     floatx80 z;
3657
3658     aSig = extractFloatx80Frac( a );
3659     aExp = extractFloatx80Exp( a );
3660     aSign = extractFloatx80Sign( a );
3661     bSig = extractFloatx80Frac( b );
3662     bExp = extractFloatx80Exp( b );
3663     bSign = extractFloatx80Sign( b );
         /* The result sign is the exclusive-or of the operand signs. */
3664     zSign = aSign ^ bSign;
         /* `a' is a NaN or an infinity.  Any NaN operand is propagated;
            infinity times zero is invalid; otherwise the result is infinity. */
3665     if ( aExp == 0x7FFF ) {
3666         if (    (bits64) ( aSig<<1 )
3667              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3668             return propagateFloatx80NaN( a, b STATUS_VAR );
3669         }
3670         if ( ( bExp | bSig ) == 0 ) goto invalid;
3671         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3672     }
         /* Same cases with the roles of `a' and `b' swapped.  The `invalid'
            label below is also the target of the goto above. */
3673     if ( bExp == 0x7FFF ) {
3674         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3675         if ( ( aExp | aSig ) == 0 ) {
3676  invalid:
3677             float_raise( float_flag_invalid STATUS_VAR);
3678             z.low = floatx80_default_nan_low;
3679             z.high = floatx80_default_nan_high;
3680             return z;
3681         }
3682         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3683     }
         /* A zero operand gives a signed zero; an operand with exponent 0 and
            nonzero significand is normalized before multiplying. */
3684     if ( aExp == 0 ) {
3685         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3686         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3687     }
3688     if ( bExp == 0 ) {
3689         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3690         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3691     }
3692     zExp = aExp + bExp - 0x3FFE;
         /* Full 128-bit product of the two 64-bit significands. */
3693     mul64To128( aSig, bSig, &zSig0, &zSig1 );
         /* If the top bit of the high word is clear, shift left by one and
            lower the exponent so the top bit becomes set. */
3694     if ( 0 < (sbits64) zSig0 ) {
3695         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3696         --zExp;
3697     }
3698     return
3699         roundAndPackFloatx80(
3700             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3701
3702 }
3703
3704 /*----------------------------------------------------------------------------
3705 | Returns the result of dividing the extended double-precision floating-point
3706 | value `a' by the corresponding value `b'.  The operation is performed
3707 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3708 *----------------------------------------------------------------------------*/
3709
3710 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3711 {
3712     flag aSign, bSign, zSign;
3713     int32 aExp, bExp, zExp;
3714     bits64 aSig, bSig, zSig0, zSig1;
3715     bits64 rem0, rem1, rem2, term0, term1, term2;
3716     floatx80 z;
3717
3718     aSig = extractFloatx80Frac( a );
3719     aExp = extractFloatx80Exp( a );
3720     aSign = extractFloatx80Sign( a );
3721     bSig = extractFloatx80Frac( b );
3722     bExp = extractFloatx80Exp( b );
3723     bSign = extractFloatx80Sign( b );
         /* The result sign is the exclusive-or of the operand signs. */
3724     zSign = aSign ^ bSign;
         /* `a' is a NaN or an infinity.  NaNs are propagated; infinity divided
            by infinity is invalid; infinity divided by anything else is
            infinity. */
3725     if ( aExp == 0x7FFF ) {
3726         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3727         if ( bExp == 0x7FFF ) {
3728             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3729             goto invalid;
3730         }
3731         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3732     }
         /* A finite value divided by infinity is a signed zero. */
3733     if ( bExp == 0x7FFF ) {
3734         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3735         return packFloatx80( zSign, 0, 0 );
3736     }
         /* Zero divisor:  zero divided by zero is invalid (also the target of
            the goto above); anything else raises divide-by-zero and returns
            infinity.  A nonzero divisor with exponent 0 is normalized. */
3737     if ( bExp == 0 ) {
3738         if ( bSig == 0 ) {
3739             if ( ( aExp | aSig ) == 0 ) {
3740  invalid:
3741                 float_raise( float_flag_invalid STATUS_VAR);
3742                 z.low = floatx80_default_nan_low;
3743                 z.high = floatx80_default_nan_high;
3744                 return z;
3745             }
3746             float_raise( float_flag_divbyzero STATUS_VAR);
3747             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3748         }
3749         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3750     }
3751     if ( aExp == 0 ) {
3752         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3753         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3754     }
3755     zExp = aExp - bExp + 0x3FFE;
3756     rem1 = 0;
         /* When the dividend significand is not smaller than the divisor's,
            halve it (the shifted-out bit goes into rem1) and bump the
            exponent to compensate. */
3757     if ( bSig <= aSig ) {
3758         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3759         ++zExp;
3760     }
         /* High quotient word:  estimate, compute the remainder, and step the
            estimate down while that remainder is negative. */
3761     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3762     mul64To128( bSig, zSig0, &term0, &term1 );
3763     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3764     while ( (sbits64) rem0 < 0 ) {
3765         --zSig0;
3766         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3767     }
         /* Low quotient word, estimated from the remainder. */
3768     zSig1 = estimateDiv128To64( rem1, 0, bSig );
         /* The estimate is corrected exactly only when its value, with the
            top bit shifted out, is <= 8.  NOTE(review): presumably this is
            the range in which the estimate's error could change the rounding
            -- confirm against estimateDiv128To64.  Any nonzero final
            remainder is recorded in the lowest bit as a sticky bit. */
3769     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3770         mul64To128( bSig, zSig1, &term1, &term2 );
3771         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3772         while ( (sbits64) rem1 < 0 ) {
3773             --zSig1;
3774             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3775         }
3776         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3777     }
3778     return
3779         roundAndPackFloatx80(
3780             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3781
3782 }
3783
3784 /*----------------------------------------------------------------------------
3785 | Returns the remainder of the extended double-precision floating-point value
3786 | `a' with respect to the corresponding value `b'.  The operation is performed
3787 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3788 *----------------------------------------------------------------------------*/
3789
3790 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3791 {
3792     flag aSign, bSign, zSign;
3793     int32 aExp, bExp, expDiff;
3794     bits64 aSig0, aSig1, bSig;
3795     bits64 q, term0, term1, alternateASig0, alternateASig1;
3796     floatx80 z;
3797
3798     aSig0 = extractFloatx80Frac( a );
3799     aExp = extractFloatx80Exp( a );
3800     aSign = extractFloatx80Sign( a );
3801     bSig = extractFloatx80Frac( b );
3802     bExp = extractFloatx80Exp( b );
3803     bSign = extractFloatx80Sign( b );
         /* `a' is a NaN or an infinity:  NaNs are propagated, and the
            remainder of an infinity is invalid. */
3804     if ( aExp == 0x7FFF ) {
3805         if (    (bits64) ( aSig0<<1 )
3806              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3807             return propagateFloatx80NaN( a, b STATUS_VAR );
3808         }
3809         goto invalid;
3810     }
         /* A finite `a' with an infinite `b' is returned unchanged. */
3811     if ( bExp == 0x7FFF ) {
3812         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3813         return a;
3814     }
         /* A zero divisor is invalid (also the target of the goto above); a
            nonzero divisor with exponent 0 is normalized. */
3815     if ( bExp == 0 ) {
3816         if ( bSig == 0 ) {
3817  invalid:
3818             float_raise( float_flag_invalid STATUS_VAR);
3819             z.low = floatx80_default_nan_low;
3820             z.high = floatx80_default_nan_high;
3821             return z;
3822         }
3823         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3824     }
         /* `a' with exponent 0 and no bits set below the top significand bit
            is returned unchanged; otherwise it is normalized. */
3825     if ( aExp == 0 ) {
3826         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3827         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3828     }
         /* Force the top bit of the divisor significand on. */
3829     bSig |= LIT64( 0x8000000000000000 );
3830     zSign = aSign;
3831     expDiff = aExp - bExp;
3832     aSig1 = 0;
         /* With an exponent difference below -1, `a' is returned as the
            remainder.  With a difference of exactly -1, `a' is shifted right
            by one and treated as having the divisor's exponent. */
3833     if ( expDiff < 0 ) {
3834         if ( expDiff < -1 ) return a;
3835         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3836         expDiff = 0;
3837     }
         /* First quotient bit:  subtract the divisor once if it fits. */
3838     q = ( bSig <= aSig0 );
3839     if ( q ) aSig0 -= bSig;
         /* Main reduction:  each pass removes 62 bits of exponent difference,
            using an estimate lowered by 2 (never below 0) as the quotient. */
3840     expDiff -= 64;
3841     while ( 0 < expDiff ) {
3842         q = estimateDiv128To64( aSig0, aSig1, bSig );
3843         q = ( 2 < q ) ? q - 2 : 0;
3844         mul64To128( bSig, q, &term0, &term1 );
3845         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3846         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3847         expDiff -= 62;
3848     }
3849     expDiff += 64;
         /* Final partial step for the remaining 1..64 bits.  The lowered
            estimate is truncated to the remaining bits and then corrected
            upward by repeated subtraction of the shifted divisor
            (term0:term1). */
3850     if ( 0 < expDiff ) {
3851         q = estimateDiv128To64( aSig0, aSig1, bSig );
3852         q = ( 2 < q ) ? q - 2 : 0;
3853         q >>= 64 - expDiff;
3854         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3855         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3856         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3857         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3858             ++q;
3859             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3860         }
3861     }
3862     else {
             /* Nothing left to reduce:  the divisor itself is the term used
                for the comparison below. */
3863         term1 = 0;
3864         term0 = bSig;
3865     }
         /* The alternate remainder is (divisor term - remainder).  It is
            chosen, with the result sign flipped, when it is the smaller of
            the two, or on an exact tie when the quotient q is odd. */
3866     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3867     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3868          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3869               && ( q & 1 ) )
3870        ) {
3871         aSig0 = alternateASig0;
3872         aSig1 = alternateASig1;
3873         zSign = ! zSign;
3874     }
         /* The result is always packed with rounding precision 80, not the
            precision from the status. */
3875     return
3876         normalizeRoundAndPackFloatx80(
3877             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3878
3879 }
3880
3881 /*----------------------------------------------------------------------------
3882 | Returns the square root of the extended double-precision floating-point
3883 | value `a'.  The operation is performed according to the IEC/IEEE Standard
3884 | for Binary Floating-Point Arithmetic.
3885 *----------------------------------------------------------------------------*/
3886
3887 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3888 {
3889     flag aSign;
3890     int32 aExp, zExp;
3891     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3892     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3893     floatx80 z;
3894
3895     aSig0 = extractFloatx80Frac( a );
3896     aExp = extractFloatx80Exp( a );
3897     aSign = extractFloatx80Sign( a );
         /* NaNs are propagated, +infinity is returned as is, and -infinity
            is invalid. */
3898     if ( aExp == 0x7FFF ) {
3899         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3900         if ( ! aSign ) return a;
3901         goto invalid;
3902     }
         /* Negative operands:  a negative zero is returned as is; everything
            else raises invalid and yields the default NaN. */
3903     if ( aSign ) {
3904         if ( ( aExp | aSig0 ) == 0 ) return a;
3905  invalid:
3906         float_raise( float_flag_invalid STATUS_VAR);
3907         z.low = floatx80_default_nan_low;
3908         z.high = floatx80_default_nan_high;
3909         return z;
3910     }
         /* Positive zero gives positive zero; other operands with exponent 0
            are normalized first. */
3911     if ( aExp == 0 ) {
3912         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3913         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3914     }
         /* Halve the exponent relative to 0x3FFF. */
3915     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
         /* Initial estimate from the top 32 significand bits, refined with a
            128-by-64 division estimate.  The operand is shifted right by 2 or
            3 bits depending on the low bit of the exponent. */
3916     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3917     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3918     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3919     doubleZSig0 = zSig0<<1;
         /* Remainder = operand - zSig0 squared; step zSig0 down while that
            remainder is negative. */
3920     mul64To128( zSig0, zSig0, &term0, &term1 );
3921     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3922     while ( (sbits64) rem0 < 0 ) {
3923         --zSig0;
3924         doubleZSig0 -= 2;
3925         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3926     }
         /* Second result word, estimated from the remainder. */
3927     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
         /* The estimate is corrected exactly only when its low 62 bits are
            <= 5.  NOTE(review): presumably this is the range in which the
            estimate's error could affect rounding -- confirm against
            estimateDiv128To64.  Any nonzero final remainder is recorded in
            the lowest bit as a sticky bit. */
3928     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3929         if ( zSig1 == 0 ) zSig1 = 1;
3930         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3931         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3932         mul64To128( zSig1, zSig1, &term2, &term3 );
3933         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3934         while ( (sbits64) rem1 < 0 ) {
3935             --zSig1;
3936             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3937             term3 |= 1;
3938             term2 |= doubleZSig0;
3939             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3940         }
3941         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3942     }
         /* Assemble the result:  zSig1 shifted left by one across the word
            pair, with doubleZSig0 merged into the high word. */
3943     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3944     zSig0 |= doubleZSig0;
3945     return
3946         roundAndPackFloatx80(
3947             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3948
3949 }
3950
3951 /*----------------------------------------------------------------------------
3952 | Returns 1 if the extended double-precision floating-point value `a' is
3953 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3954 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3955 | Arithmetic.
3956 *----------------------------------------------------------------------------*/
3957
3958 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3959 {
3960
3961     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3962               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3963          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3964               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3965        ) {
3966         if (    floatx80_is_signaling_nan( a )
3967              || floatx80_is_signaling_nan( b ) ) {
3968             float_raise( float_flag_invalid STATUS_VAR);
3969         }
3970         return 0;
3971     }
3972     return
3973            ( a.low == b.low )
3974         && (    ( a.high == b.high )
3975              || (    ( a.low == 0 )
3976                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3977            );
3978
3979 }
3980
3981 /*----------------------------------------------------------------------------
3982 | Returns 1 if the extended double-precision floating-point value `a' is
3983 | less than or equal to the corresponding value `b', and 0 otherwise.  The
3984 | comparison is performed according to the IEC/IEEE Standard for Binary
3985 | Floating-Point Arithmetic.
3986 *----------------------------------------------------------------------------*/
3987
3988 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3989 {
3990     flag aSign, bSign;
3991
3992     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3993               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3994          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3995               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3996        ) {
3997         float_raise( float_flag_invalid STATUS_VAR);
3998         return 0;
3999     }
4000     aSign = extractFloatx80Sign( a );
4001     bSign = extractFloatx80Sign( b );
4002     if ( aSign != bSign ) {
4003         return
4004                aSign
4005             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4006                  == 0 );
4007     }
4008     return
4009           aSign ? le128( b.high, b.low, a.high, a.low )
4010         : le128( a.high, a.low, b.high, b.low );
4011
4012 }
4013
4014 /*----------------------------------------------------------------------------
4015 | Returns 1 if the extended double-precision floating-point value `a' is
4016 | less than the corresponding value `b', and 0 otherwise.  The comparison
4017 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4018 | Arithmetic.
4019 *----------------------------------------------------------------------------*/
4020
4021 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4022 {
4023     flag aSign, bSign;
4024
4025     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4026               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4027          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4028               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4029        ) {
4030         float_raise( float_flag_invalid STATUS_VAR);
4031         return 0;
4032     }
4033     aSign = extractFloatx80Sign( a );
4034     bSign = extractFloatx80Sign( b );
4035     if ( aSign != bSign ) {
4036         return
4037                aSign
4038             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4039                  != 0 );
4040     }
4041     return
4042           aSign ? lt128( b.high, b.low, a.high, a.low )
4043         : lt128( a.high, a.low, b.high, b.low );
4044
4045 }
4046
4047 /*----------------------------------------------------------------------------
4048 | Returns 1 if the extended double-precision floating-point value `a' is equal
4049 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4050 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4051 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4052 *----------------------------------------------------------------------------*/
4053
4054 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4055 {
4056
4057     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4058               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4059          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4060               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4061        ) {
4062         float_raise( float_flag_invalid STATUS_VAR);
4063         return 0;
4064     }
4065     return
4066            ( a.low == b.low )
4067         && (    ( a.high == b.high )
4068              || (    ( a.low == 0 )
4069                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4070            );
4071
4072 }
4073
4074 /*----------------------------------------------------------------------------
4075 | Returns 1 if the extended double-precision floating-point value `a' is less
4076 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4077 | do not cause an exception.  Otherwise, the comparison is performed according
4078 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4079 *----------------------------------------------------------------------------*/
4080
4081 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4082 {
4083     flag aSign, bSign;
4084
4085     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4086               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4087          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4088               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4089        ) {
4090         if (    floatx80_is_signaling_nan( a )
4091              || floatx80_is_signaling_nan( b ) ) {
4092             float_raise( float_flag_invalid STATUS_VAR);
4093         }
4094         return 0;
4095     }
4096     aSign = extractFloatx80Sign( a );
4097     bSign = extractFloatx80Sign( b );
4098     if ( aSign != bSign ) {
4099         return
4100                aSign
4101             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4102                  == 0 );
4103     }
4104     return
4105           aSign ? le128( b.high, b.low, a.high, a.low )
4106         : le128( a.high, a.low, b.high, b.low );
4107
4108 }
4109
4110 /*----------------------------------------------------------------------------
4111 | Returns 1 if the extended double-precision floating-point value `a' is less
4112 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4113 | an exception.  Otherwise, the comparison is performed according to the
4114 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4115 *----------------------------------------------------------------------------*/
4116
4117 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4118 {
4119     flag aSign, bSign;
4120
4121     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4122               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4123          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4124               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4125        ) {
4126         if (    floatx80_is_signaling_nan( a )
4127              || floatx80_is_signaling_nan( b ) ) {
4128             float_raise( float_flag_invalid STATUS_VAR);
4129         }
4130         return 0;
4131     }
4132     aSign = extractFloatx80Sign( a );
4133     bSign = extractFloatx80Sign( b );
4134     if ( aSign != bSign ) {
4135         return
4136                aSign
4137             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4138                  != 0 );
4139     }
4140     return
4141           aSign ? lt128( b.high, b.low, a.high, a.low )
4142         : lt128( a.high, a.low, b.high, b.low );
4143
4144 }
4145
4146 #endif
4147
4148 #ifdef FLOAT128
4149
4150 /*----------------------------------------------------------------------------
4151 | Returns the result of converting the quadruple-precision floating-point
4152 | value `a' to the 32-bit two's complement integer format.  The conversion
4153 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4154 | Arithmetic---which means in particular that the conversion is rounded
4155 | according to the current rounding mode.  If `a' is a NaN, the largest
4156 | positive integer is returned.  Otherwise, if the conversion overflows, the
4157 | largest integer with the same sign as `a' is returned.
4158 *----------------------------------------------------------------------------*/
4159
4160 int32 float128_to_int32( float128 a STATUS_PARAM )
4161 {
4162     flag aSign;
4163     int32 aExp, shiftCount;
4164     bits64 aSig0, aSig1;
4165
4166     aSig1 = extractFloat128Frac1( a );
4167     aSig0 = extractFloat128Frac0( a );
4168     aExp = extractFloat128Exp( a );
4169     aSign = extractFloat128Sign( a );
4170     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4171     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4172     aSig0 |= ( aSig1 != 0 );
4173     shiftCount = 0x4028 - aExp;
4174     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4175     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4176
4177 }
4178
4179 /*----------------------------------------------------------------------------
4180 | Returns the result of converting the quadruple-precision floating-point
4181 | value `a' to the 32-bit two's complement integer format.  The conversion
4182 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4183 | Arithmetic, except that the conversion is always rounded toward zero.  If
4184 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4185 | conversion overflows, the largest integer with the same sign as `a' is
4186 | returned.
4187 *----------------------------------------------------------------------------*/
4188
4189 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4190 {
4191     flag aSign;
4192     int32 aExp, shiftCount;
4193     bits64 aSig0, aSig1, savedASig;
4194     int32 z;
4195
4196     aSig1 = extractFloat128Frac1( a );
4197     aSig0 = extractFloat128Frac0( a );
4198     aExp = extractFloat128Exp( a );
4199     aSign = extractFloat128Sign( a );
4200     aSig0 |= ( aSig1 != 0 );
4201     if ( 0x401E < aExp ) {
4202         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4203         goto invalid;
4204     }
4205     else if ( aExp < 0x3FFF ) {
4206         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4207         return 0;
4208     }
4209     aSig0 |= LIT64( 0x0001000000000000 );
4210     shiftCount = 0x402F - aExp;
4211     savedASig = aSig0;
4212     aSig0 >>= shiftCount;
4213     z = aSig0;
4214     if ( aSign ) z = - z;
4215     if ( ( z < 0 ) ^ aSign ) {
4216  invalid:
4217         float_raise( float_flag_invalid STATUS_VAR);
4218         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4219     }
4220     if ( ( aSig0<<shiftCount ) != savedASig ) {
4221         STATUS(float_exception_flags) |= float_flag_inexact;
4222     }
4223     return z;
4224
4225 }
4226
4227 /*----------------------------------------------------------------------------
4228 | Returns the result of converting the quadruple-precision floating-point
4229 | value `a' to the 64-bit two's complement integer format.  The conversion
4230 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4231 | Arithmetic---which means in particular that the conversion is rounded
4232 | according to the current rounding mode.  If `a' is a NaN, the largest
4233 | positive integer is returned.  Otherwise, if the conversion overflows, the
4234 | largest integer with the same sign as `a' is returned.
4235 *----------------------------------------------------------------------------*/
4236
4237 int64 float128_to_int64( float128 a STATUS_PARAM )
4238 {
4239     flag aSign;
4240     int32 aExp, shiftCount;
4241     bits64 aSig0, aSig1;
4242
4243     aSig1 = extractFloat128Frac1( a );
4244     aSig0 = extractFloat128Frac0( a );
4245     aExp = extractFloat128Exp( a );
4246     aSign = extractFloat128Sign( a );
4247     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4248     shiftCount = 0x402F - aExp;
4249     if ( shiftCount <= 0 ) {
4250         if ( 0x403E < aExp ) {
4251             float_raise( float_flag_invalid STATUS_VAR);
4252             if (    ! aSign
4253                  || (    ( aExp == 0x7FFF )
4254                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4255                     )
4256                ) {
4257                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4258             }
4259             return (sbits64) LIT64( 0x8000000000000000 );
4260         }
4261         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4262     }
4263     else {
4264         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4265     }
4266     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4267
4268 }
4269
4270 /*----------------------------------------------------------------------------
4271 | Returns the result of converting the quadruple-precision floating-point
4272 | value `a' to the 64-bit two's complement integer format.  The conversion
4273 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4274 | Arithmetic, except that the conversion is always rounded toward zero.
4275 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4276 | the conversion overflows, the largest integer with the same sign as `a' is
4277 | returned.
4278 *----------------------------------------------------------------------------*/
4279
4280 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4281 {
4282     flag aSign;
4283     int32 aExp, shiftCount;
4284     bits64 aSig0, aSig1;
4285     int64 z;
4286
4287     aSig1 = extractFloat128Frac1( a );
4288     aSig0 = extractFloat128Frac0( a );
4289     aExp = extractFloat128Exp( a );
4290     aSign = extractFloat128Sign( a );
4291     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4292     shiftCount = aExp - 0x402F;
4293     if ( 0 < shiftCount ) {
4294         if ( 0x403E <= aExp ) {
4295             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4296             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4297                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4298                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4299             }
4300             else {
4301                 float_raise( float_flag_invalid STATUS_VAR);
4302                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4303                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4304                 }
4305             }
4306             return (sbits64) LIT64( 0x8000000000000000 );
4307         }
4308         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4309         if ( (bits64) ( aSig1<<shiftCount ) ) {
4310             STATUS(float_exception_flags) |= float_flag_inexact;
4311         }
4312     }
4313     else {
4314         if ( aExp < 0x3FFF ) {
4315             if ( aExp | aSig0 | aSig1 ) {
4316                 STATUS(float_exception_flags) |= float_flag_inexact;
4317             }
4318             return 0;
4319         }
4320         z = aSig0>>( - shiftCount );
4321         if (    aSig1
4322              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4323             STATUS(float_exception_flags) |= float_flag_inexact;
4324         }
4325     }
4326     if ( aSign ) z = - z;
4327     return z;
4328
4329 }
4330
4331 /*----------------------------------------------------------------------------
4332 | Returns the result of converting the quadruple-precision floating-point
4333 | value `a' to the single-precision floating-point format.  The conversion
4334 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4335 | Arithmetic.
4336 *----------------------------------------------------------------------------*/
4337
4338 float32 float128_to_float32( float128 a STATUS_PARAM )
4339 {
4340     flag aSign;
4341     int32 aExp;
4342     bits64 aSig0, aSig1;
4343     bits32 zSig;
4344
4345     aSig1 = extractFloat128Frac1( a );
4346     aSig0 = extractFloat128Frac0( a );
4347     aExp = extractFloat128Exp( a );
4348     aSign = extractFloat128Sign( a );
4349     if ( aExp == 0x7FFF ) {
4350         if ( aSig0 | aSig1 ) {
4351             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4352         }
4353         return packFloat32( aSign, 0xFF, 0 );
4354     }
4355     aSig0 |= ( aSig1 != 0 );
4356     shift64RightJamming( aSig0, 18, &aSig0 );
4357     zSig = aSig0;
4358     if ( aExp || zSig ) {
4359         zSig |= 0x40000000;
4360         aExp -= 0x3F81;
4361     }
4362     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4363
4364 }
4365
4366 /*----------------------------------------------------------------------------
4367 | Returns the result of converting the quadruple-precision floating-point
4368 | value `a' to the double-precision floating-point format.  The conversion
4369 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4370 | Arithmetic.
4371 *----------------------------------------------------------------------------*/
4372
4373 float64 float128_to_float64( float128 a STATUS_PARAM )
4374 {
4375     flag aSign;
4376     int32 aExp;
4377     bits64 aSig0, aSig1;
4378
4379     aSig1 = extractFloat128Frac1( a );
4380     aSig0 = extractFloat128Frac0( a );
4381     aExp = extractFloat128Exp( a );
4382     aSign = extractFloat128Sign( a );
4383     if ( aExp == 0x7FFF ) {
4384         if ( aSig0 | aSig1 ) {
4385             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4386         }
4387         return packFloat64( aSign, 0x7FF, 0 );
4388     }
4389     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4390     aSig0 |= ( aSig1 != 0 );
4391     if ( aExp || aSig0 ) {
4392         aSig0 |= LIT64( 0x4000000000000000 );
4393         aExp -= 0x3C01;
4394     }
4395     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4396
4397 }
4398
4399 #ifdef FLOATX80
4400
4401 /*----------------------------------------------------------------------------
4402 | Returns the result of converting the quadruple-precision floating-point
4403 | value `a' to the extended double-precision floating-point format.  The
4404 | conversion is performed according to the IEC/IEEE Standard for Binary
4405 | Floating-Point Arithmetic.
4406 *----------------------------------------------------------------------------*/
4407
4408 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4409 {
4410     flag aSign;
4411     int32 aExp;
4412     bits64 aSig0, aSig1;
4413
4414     aSig1 = extractFloat128Frac1( a );
4415     aSig0 = extractFloat128Frac0( a );
4416     aExp = extractFloat128Exp( a );
4417     aSign = extractFloat128Sign( a );
4418     if ( aExp == 0x7FFF ) {
4419         if ( aSig0 | aSig1 ) {
4420             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4421         }
4422         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4423     }
4424     if ( aExp == 0 ) {
4425         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4426         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4427     }
4428     else {
4429         aSig0 |= LIT64( 0x0001000000000000 );
4430     }
4431     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4432     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4433
4434 }
4435
4436 #endif
4437
4438 /*----------------------------------------------------------------------------
4439 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4440 | returns the result as a quadruple-precision floating-point value.  The
4441 | operation is performed according to the IEC/IEEE Standard for Binary
4442 | Floating-Point Arithmetic.
4443 *----------------------------------------------------------------------------*/
4444
4445 float128 float128_round_to_int( float128 a STATUS_PARAM )
4446 {
4447     flag aSign;
4448     int32 aExp;
4449     bits64 lastBitMask, roundBitsMask;
4450     int8 roundingMode;
4451     float128 z;
4452
4453     aExp = extractFloat128Exp( a );
4454     if ( 0x402F <= aExp ) {
4455         if ( 0x406F <= aExp ) {
4456             if (    ( aExp == 0x7FFF )
4457                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4458                ) {
4459                 return propagateFloat128NaN( a, a STATUS_VAR );
4460             }
4461             return a;
4462         }
4463         lastBitMask = 1;
4464         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4465         roundBitsMask = lastBitMask - 1;
4466         z = a;
4467         roundingMode = STATUS(float_rounding_mode);
4468         if ( roundingMode == float_round_nearest_even ) {
4469             if ( lastBitMask ) {
4470                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4471                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4472             }
4473             else {
4474                 if ( (sbits64) z.low < 0 ) {
4475                     ++z.high;
4476                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4477                 }
4478             }
4479         }
4480         else if ( roundingMode != float_round_to_zero ) {
4481             if (   extractFloat128Sign( z )
4482                  ^ ( roundingMode == float_round_up ) ) {
4483                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4484             }
4485         }
4486         z.low &= ~ roundBitsMask;
4487     }
4488     else {
4489         if ( aExp < 0x3FFF ) {
4490             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4491             STATUS(float_exception_flags) |= float_flag_inexact;
4492             aSign = extractFloat128Sign( a );
4493             switch ( STATUS(float_rounding_mode) ) {
4494              case float_round_nearest_even:
4495                 if (    ( aExp == 0x3FFE )
4496                      && (   extractFloat128Frac0( a )
4497                           | extractFloat128Frac1( a ) )
4498                    ) {
4499                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4500                 }
4501                 break;
4502              case float_round_down:
4503                 return
4504                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4505                     : packFloat128( 0, 0, 0, 0 );
4506              case float_round_up:
4507                 return
4508                       aSign ? packFloat128( 1, 0, 0, 0 )
4509                     : packFloat128( 0, 0x3FFF, 0, 0 );
4510             }
4511             return packFloat128( aSign, 0, 0, 0 );
4512         }
4513         lastBitMask = 1;
4514         lastBitMask <<= 0x402F - aExp;
4515         roundBitsMask = lastBitMask - 1;
4516         z.low = 0;
4517         z.high = a.high;
4518         roundingMode = STATUS(float_rounding_mode);
4519         if ( roundingMode == float_round_nearest_even ) {
4520             z.high += lastBitMask>>1;
4521             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4522                 z.high &= ~ lastBitMask;
4523             }
4524         }
4525         else if ( roundingMode != float_round_to_zero ) {
4526             if (   extractFloat128Sign( z )
4527                  ^ ( roundingMode == float_round_up ) ) {
4528                 z.high |= ( a.low != 0 );
4529                 z.high += roundBitsMask;
4530             }
4531         }
4532         z.high &= ~ roundBitsMask;
4533     }
4534     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4535         STATUS(float_exception_flags) |= float_flag_inexact;
4536     }
4537     return z;
4538
4539 }
4540
4541 /*----------------------------------------------------------------------------
4542 | Returns the result of adding the absolute values of the quadruple-precision
4543 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4544 | before being returned.  `zSign' is ignored if the result is a NaN.
4545 | The addition is performed according to the IEC/IEEE Standard for Binary
4546 | Floating-Point Arithmetic.
4547 *----------------------------------------------------------------------------*/
4548
4549 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4550 {
4551     int32 aExp, bExp, zExp;
4552     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4553     int32 expDiff;
4554
4555     aSig1 = extractFloat128Frac1( a );
4556     aSig0 = extractFloat128Frac0( a );
4557     aExp = extractFloat128Exp( a );
4558     bSig1 = extractFloat128Frac1( b );
4559     bSig0 = extractFloat128Frac0( b );
4560     bExp = extractFloat128Exp( b );
4561     expDiff = aExp - bExp;
4562     if ( 0 < expDiff ) {
4563         if ( aExp == 0x7FFF ) {
4564             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4565             return a;
4566         }
4567         if ( bExp == 0 ) {
4568             --expDiff;
4569         }
4570         else {
4571             bSig0 |= LIT64( 0x0001000000000000 );
4572         }
4573         shift128ExtraRightJamming(
4574             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4575         zExp = aExp;
4576     }
4577     else if ( expDiff < 0 ) {
4578         if ( bExp == 0x7FFF ) {
4579             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4580             return packFloat128( zSign, 0x7FFF, 0, 0 );
4581         }
4582         if ( aExp == 0 ) {
4583             ++expDiff;
4584         }
4585         else {
4586             aSig0 |= LIT64( 0x0001000000000000 );
4587         }
4588         shift128ExtraRightJamming(
4589             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4590         zExp = bExp;
4591     }
4592     else {
4593         if ( aExp == 0x7FFF ) {
4594             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4595                 return propagateFloat128NaN( a, b STATUS_VAR );
4596             }
4597             return a;
4598         }
4599         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4600         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4601         zSig2 = 0;
4602         zSig0 |= LIT64( 0x0002000000000000 );
4603         zExp = aExp;
4604         goto shiftRight1;
4605     }
4606     aSig0 |= LIT64( 0x0001000000000000 );
4607     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4608     --zExp;
4609     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4610     ++zExp;
4611  shiftRight1:
4612     shift128ExtraRightJamming(
4613         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4614  roundAndPack:
4615     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4616
4617 }
4618
4619 /*----------------------------------------------------------------------------
4620 | Returns the result of subtracting the absolute values of the quadruple-
4621 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
4622 | difference is negated before being returned.  `zSign' is ignored if the
4623 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4624 | Standard for Binary Floating-Point Arithmetic.
4625 *----------------------------------------------------------------------------*/
4626
4627 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4628 {
4629     int32 aExp, bExp, zExp;
4630     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4631     int32 expDiff;
4632     float128 z;
4633
4634     aSig1 = extractFloat128Frac1( a );
4635     aSig0 = extractFloat128Frac0( a );
4636     aExp = extractFloat128Exp( a );
4637     bSig1 = extractFloat128Frac1( b );
4638     bSig0 = extractFloat128Frac0( b );
4639     bExp = extractFloat128Exp( b );
4640     expDiff = aExp - bExp;
4641     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4642     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4643     if ( 0 < expDiff ) goto aExpBigger;
4644     if ( expDiff < 0 ) goto bExpBigger;
4645     if ( aExp == 0x7FFF ) {
4646         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4647             return propagateFloat128NaN( a, b STATUS_VAR );
4648         }
4649         float_raise( float_flag_invalid STATUS_VAR);
4650         z.low = float128_default_nan_low;
4651         z.high = float128_default_nan_high;
4652         return z;
4653     }
4654     if ( aExp == 0 ) {
4655         aExp = 1;
4656         bExp = 1;
4657     }
4658     if ( bSig0 < aSig0 ) goto aBigger;
4659     if ( aSig0 < bSig0 ) goto bBigger;
4660     if ( bSig1 < aSig1 ) goto aBigger;
4661     if ( aSig1 < bSig1 ) goto bBigger;
4662     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4663  bExpBigger:
4664     if ( bExp == 0x7FFF ) {
4665         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4666         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4667     }
4668     if ( aExp == 0 ) {
4669         ++expDiff;
4670     }
4671     else {
4672         aSig0 |= LIT64( 0x4000000000000000 );
4673     }
4674     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4675     bSig0 |= LIT64( 0x4000000000000000 );
4676  bBigger:
4677     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4678     zExp = bExp;
4679     zSign ^= 1;
4680     goto normalizeRoundAndPack;
4681  aExpBigger:
4682     if ( aExp == 0x7FFF ) {
4683         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4684         return a;
4685     }
4686     if ( bExp == 0 ) {
4687         --expDiff;
4688     }
4689     else {
4690         bSig0 |= LIT64( 0x4000000000000000 );
4691     }
4692     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4693     aSig0 |= LIT64( 0x4000000000000000 );
4694  aBigger:
4695     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4696     zExp = aExp;
4697  normalizeRoundAndPack:
4698     --zExp;
4699     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4700
4701 }
4702
4703 /*----------------------------------------------------------------------------
4704 | Returns the result of adding the quadruple-precision floating-point values
4705 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4706 | for Binary Floating-Point Arithmetic.
4707 *----------------------------------------------------------------------------*/
4708
4709 float128 float128_add( float128 a, float128 b STATUS_PARAM )
4710 {
4711     flag aSign, bSign;
4712
4713     aSign = extractFloat128Sign( a );
4714     bSign = extractFloat128Sign( b );
4715     if ( aSign == bSign ) {
4716         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4717     }
4718     else {
4719         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4720     }
4721
4722 }
4723
4724 /*----------------------------------------------------------------------------
4725 | Returns the result of subtracting the quadruple-precision floating-point
4726 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4727 | Standard for Binary Floating-Point Arithmetic.
4728 *----------------------------------------------------------------------------*/
4729
4730 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4731 {
4732     flag aSign, bSign;
4733
4734     aSign = extractFloat128Sign( a );
4735     bSign = extractFloat128Sign( b );
4736     if ( aSign == bSign ) {
4737         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4738     }
4739     else {
4740         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4741     }
4742
4743 }
4744
4745 /*----------------------------------------------------------------------------
4746 | Returns the result of multiplying the quadruple-precision floating-point
4747 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4748 | Standard for Binary Floating-Point Arithmetic.
4749 *----------------------------------------------------------------------------*/
4750
4751 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4752 {
4753     flag aSign, bSign, zSign;
4754     int32 aExp, bExp, zExp;
4755     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4756     float128 z;
4757
         /* Unpack each operand into sign, exponent and the high (Sig0) and low
            (Sig1) fraction words.  The result sign is the XOR of the two
            operand signs. */
4758     aSig1 = extractFloat128Frac1( a );
4759     aSig0 = extractFloat128Frac0( a );
4760     aExp = extractFloat128Exp( a );
4761     aSign = extractFloat128Sign( a );
4762     bSig1 = extractFloat128Frac1( b );
4763     bSig0 = extractFloat128Frac0( b );
4764     bExp = extractFloat128Exp( b );
4765     bSign = extractFloat128Sign( b );
4766     zSign = aSign ^ bSign;
         /* `a' has the all-ones exponent.  A non-zero fraction in `a', or `b'
            having the all-ones exponent with a non-zero fraction, goes to NaN
            propagation.  Otherwise an all-zero `b' makes the operation invalid,
            and any other `b' yields exponent 0x7FFF with a zero fraction
            (infinity) carrying zSign. */
4767     if ( aExp == 0x7FFF ) {
4768         if (    ( aSig0 | aSig1 )
4769              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4770             return propagateFloat128NaN( a, b STATUS_VAR );
4771         }
4772         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4773         return packFloat128( zSign, 0x7FFF, 0, 0 );
4774     }
         /* Mirror case: `b' has the all-ones exponent and `a' does not. */
4775     if ( bExp == 0x7FFF ) {
4776         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4777         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                 /* Shared invalid path (also reached by the goto above): raise
                    the invalid flag and return the default NaN. */
4778  invalid:
4779             float_raise( float_flag_invalid STATUS_VAR);
4780             z.low = float128_default_nan_low;
4781             z.high = float128_default_nan_high;
4782             return z;
4783         }
4784         return packFloat128( zSign, 0x7FFF, 0, 0 );
4785     }
         /* Zero exponent: an all-zero fraction gives a zero result with sign
            zSign right away; otherwise normalizeFloat128Subnormal rewrites the
            exponent and fraction words in place. */
4786     if ( aExp == 0 ) {
4787         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4788         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4789     }
4790     if ( bExp == 0 ) {
4791         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4792         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4793     }
4794     zExp = aExp + bExp - 0x4000;
         /* Bit 48 of aSig0 is set (presumably a's implicit leading 1 -- confirm
            against the float128 layout) and b's fraction is shifted left by 16.
            b's leading bit is never set explicitly; adding aSig into the upper
            two product words after the multiply appears to account for it.
            NOTE(review): confirm this reading. */
4795     aSig0 |= LIT64( 0x0001000000000000 );
4796     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4797     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4798     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
         /* Any non-zero bit in the lowest product word is folded into bit 0 of
            zSig2 so it is not lost when zSig3 is dropped. */
4799     zSig2 |= ( zSig3 != 0 );
         /* If the significand has reached bit 49, shift it right by one
            (jamming shifted-out bits) and bump the exponent to compensate. */
4800     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4801         shift128ExtraRightJamming(
4802             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4803         ++zExp;
4804     }
4805     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4806
4807 }
4808
4809 /*----------------------------------------------------------------------------
4810 | Returns the result of dividing the quadruple-precision floating-point value
4811 | `a' by the corresponding value `b'.  The operation is performed according to
4812 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4813 *----------------------------------------------------------------------------*/
4814
4815 float128 float128_div( float128 a, float128 b STATUS_PARAM )
4816 {
4817     flag aSign, bSign, zSign;
4818     int32 aExp, bExp, zExp;
4819     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4820     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4821     float128 z;
4822
         /* Unpack both operands; the quotient sign is the XOR of their signs. */
4823     aSig1 = extractFloat128Frac1( a );
4824     aSig0 = extractFloat128Frac0( a );
4825     aExp = extractFloat128Exp( a );
4826     aSign = extractFloat128Sign( a );
4827     bSig1 = extractFloat128Frac1( b );
4828     bSig0 = extractFloat128Frac0( b );
4829     bExp = extractFloat128Exp( b );
4830     bSign = extractFloat128Sign( b );
4831     zSign = aSign ^ bSign;
         /* `a' has the all-ones exponent: a non-zero fraction goes to NaN
            propagation.  With a zero fraction (infinity), a `b' that also has
            the all-ones exponent is either a NaN to propagate or makes the
            operation invalid; any other `b' gives infinity with sign zSign. */
4832     if ( aExp == 0x7FFF ) {
4833         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4834         if ( bExp == 0x7FFF ) {
4835             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4836             goto invalid;
4837         }
4838         return packFloat128( zSign, 0x7FFF, 0, 0 );
4839     }
         /* Only `b' has the all-ones exponent: propagate a NaN, otherwise the
            result is a zero with sign zSign. */
4840     if ( bExp == 0x7FFF ) {
4841         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4842         return packFloat128( zSign, 0, 0, 0 );
4843     }
         /* `b' has a zero exponent.  If its fraction is zero too, an all-zero
            `a' makes the operation invalid (default NaN); any other `a' raises
            divide-by-zero and returns infinity with sign zSign.  A non-zero
            fraction is normalized in place. */
4844     if ( bExp == 0 ) {
4845         if ( ( bSig0 | bSig1 ) == 0 ) {
4846             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4847  invalid:
4848                 float_raise( float_flag_invalid STATUS_VAR);
4849                 z.low = float128_default_nan_low;
4850                 z.high = float128_default_nan_high;
4851                 return z;
4852             }
4853             float_raise( float_flag_divbyzero STATUS_VAR);
4854             return packFloat128( zSign, 0x7FFF, 0, 0 );
4855         }
4856         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4857     }
         /* `a' has a zero exponent: zero dividend gives a zero with sign zSign,
            otherwise the subnormal is normalized in place. */
4858     if ( aExp == 0 ) {
4859         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4860         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4861     }
4862     zExp = aExp - bExp + 0x3FFD;
         /* Set bit 48 of each high word and shift both significands left by
            15, which moves that bit to bit 63 of the high word. */
4863     shortShift128Left(
4864         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4865     shortShift128Left(
4866         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
         /* When b's significand does not exceed a's, a's is halved and the
            exponent bumped (presumably so the dividend is below the divisor
            for the estimator -- confirm against estimateDiv128To64). */
4867     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4868         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4869         ++zExp;
4870     }
         /* First 64 quotient bits: take an estimate, form the remainder
            a - b*zSig0, and while that remainder is negative decrement the
            estimate and add b back. */
4871     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4872     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4873     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4874     while ( (sbits64) rem0 < 0 ) {
4875         --zSig0;
4876         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4877     }
         /* Second 64 quotient bits, estimated from the remainder.  The exact
            remainder and correction are computed only when the low 14 bits of
            the estimate are <= 4; in that case bit 0 of zSig1 is also set if
            any remainder bit is non-zero. */
4878     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4879     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4880         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4881         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4882         while ( (sbits64) rem1 < 0 ) {
4883             --zSig1;
4884             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4885         }
4886         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4887     }
         /* Shift the 128-bit quotient right by 15, collecting shifted-out bits
            in zSig2, then round and pack. */
4888     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4889     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4890
4891 }
4892
4893 /*----------------------------------------------------------------------------
4894 | Returns the remainder of the quadruple-precision floating-point value `a'
4895 | with respect to the corresponding value `b'.  The operation is performed
4896 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4897 *----------------------------------------------------------------------------*/
4898
4899 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4900 {
4901     flag aSign, bSign, zSign;
4902     int32 aExp, bExp, expDiff;
4903     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4904     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4905     sbits64 sigMean0;
4906     float128 z;
4907
         /* Unpack both operands.  bSign is extracted but not used further in
            this function. */
4908     aSig1 = extractFloat128Frac1( a );
4909     aSig0 = extractFloat128Frac0( a );
4910     aExp = extractFloat128Exp( a );
4911     aSign = extractFloat128Sign( a );
4912     bSig1 = extractFloat128Frac1( b );
4913     bSig0 = extractFloat128Frac0( b );
4914     bExp = extractFloat128Exp( b );
4915     bSign = extractFloat128Sign( b );
         /* `a' has the all-ones exponent: propagate if either operand is a
            NaN, otherwise (a is infinite) the operation is invalid. */
4916     if ( aExp == 0x7FFF ) {
4917         if (    ( aSig0 | aSig1 )
4918              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4919             return propagateFloat128NaN( a, b STATUS_VAR );
4920         }
4921         goto invalid;
4922     }
         /* `b' has the all-ones exponent: propagate a NaN; for an infinite
            `b' the result is `a' unchanged. */
4923     if ( bExp == 0x7FFF ) {
4924         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4925         return a;
4926     }
         /* `b' has a zero exponent: a zero divisor is invalid (default NaN);
            a subnormal divisor is normalized in place. */
4927     if ( bExp == 0 ) {
4928         if ( ( bSig0 | bSig1 ) == 0 ) {
4929  invalid:
4930             float_raise( float_flag_invalid STATUS_VAR);
4931             z.low = float128_default_nan_low;
4932             z.high = float128_default_nan_high;
4933             return z;
4934         }
4935         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4936     }
         /* `a' has a zero exponent: a zero dividend is returned as is; a
            subnormal dividend is normalized in place. */
4937     if ( aExp == 0 ) {
4938         if ( ( aSig0 | aSig1 ) == 0 ) return a;
4939         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4940     }
4941     expDiff = aExp - bExp;
         /* When a's exponent is more than one below b's, `a' is returned
            unchanged. */
4942     if ( expDiff < -1 ) return a;
         /* Set bit 48 of each high word and shift left: b by 15, a by 15, or
            by 14 when expDiff is -1. */
4943     shortShift128Left(
4944         aSig0 | LIT64( 0x0001000000000000 ),
4945         aSig1,
4946         15 - ( expDiff < 0 ),
4947         &aSig0,
4948         &aSig1
4949     );
4950     shortShift128Left(
4951         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
         /* q starts as 1 if b's significand fits into a's (and is then
            subtracted once), else 0. */
4952     q = le128( bSig0, bSig1, aSig0, aSig1 );
4953     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4954     expDiff -= 64;
         /* Main reduction loop: each pass takes a quotient estimate, lowers
            it by 4 (clamped at 0), subtracts q*b from the remainder shifted
            left by 61 bits, and consumes 61 bits of exponent difference. */
4955     while ( 0 < expDiff ) {
4956         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4957         q = ( 4 < q ) ? q - 4 : 0;
4958         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4959         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4960         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4961         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4962         expDiff -= 61;
4963     }
         /* Final partial step: the lowered estimate is shifted right by
            -expDiff, b is shifted right by 12, and a is realigned by
            expDiff + 52 before q*b is subtracted.  If no partial step is
            needed, both significands are simply shifted right by 12. */
4964     if ( -64 < expDiff ) {
4965         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4966         q = ( 4 < q ) ? q - 4 : 0;
4967         q >>= - expDiff;
4968         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4969         expDiff += 52;
4970         if ( expDiff < 0 ) {
4971             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4972         }
4973         else {
4974             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4975         }
4976         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4977         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4978     }
4979     else {
4980         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4981         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4982     }
         /* Subtract b repeatedly, counting in q, until the remainder turns
            negative; alternateASig holds the value from just before the last
            subtraction. */
4983     do {
4984         alternateASig0 = aSig0;
4985         alternateASig1 = aSig1;
4986         ++q;
4987         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4988     } while ( 0 <= (sbits64) aSig0 );
         /* sigMean is the sum of the negative remainder and the previous one.
            The previous remainder is chosen when that sum is negative, or
            when it is zero and q is odd (presumably: pick the remainder of
            smaller magnitude, ties toward an even quotient -- confirm). */
4989     add128(
4990         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4991     if (    ( sigMean0 < 0 )
4992          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4993         aSig0 = alternateASig0;
4994         aSig1 = alternateASig1;
4995     }
         /* A negative remainder is negated and its sign recorded in zSign; the
            result sign is a's sign XOR zSign. */
4996     zSign = ( (sbits64) aSig0 < 0 );
4997     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4998     return
4999         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5000
5001 }
5002
5003 /*----------------------------------------------------------------------------
5004 | Returns the square root of the quadruple-precision floating-point value `a'.
5005 | The operation is performed according to the IEC/IEEE Standard for Binary
5006 | Floating-Point Arithmetic.
5007 *----------------------------------------------------------------------------*/
5008
5009 float128 float128_sqrt( float128 a STATUS_PARAM )
5010 {
5011     flag aSign;
5012     int32 aExp, zExp;
5013     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5014     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5015     float128 z;
5016
5017     aSig1 = extractFloat128Frac1( a );
5018     aSig0 = extractFloat128Frac0( a );
5019     aExp = extractFloat128Exp( a );
5020     aSign = extractFloat128Sign( a );
         /* All-ones exponent: a non-zero fraction goes to NaN propagation
            (with `a' as both arguments); positive infinity is returned as is;
            negative infinity is invalid. */
5021     if ( aExp == 0x7FFF ) {
5022         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5023         if ( ! aSign ) return a;
5024         goto invalid;
5025     }
         /* Negative input: a negative zero is returned unchanged; anything
            else raises invalid and returns the default NaN. */
5026     if ( aSign ) {
5027         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5028  invalid:
5029         float_raise( float_flag_invalid STATUS_VAR);
5030         z.low = float128_default_nan_low;
5031         z.high = float128_default_nan_high;
5032         return z;
5033     }
         /* Zero exponent: zero yields positive zero; a subnormal is normalized
            in place. */
5034     if ( aExp == 0 ) {
5035         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5036         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5037     }
         /* Result exponent: half the unbiased input exponent, rebiased with
            0x3FFE. */
5038     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5039     aSig0 |= LIT64( 0x0001000000000000 );
         /* Initial root estimate from the exponent and the upper significand
            bits, then refined through a 128-by-64 division estimate.  The
            significand is shifted left by 13, or by 12 when aExp is odd. */
5040     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5041     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5042     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5043     doubleZSig0 = zSig0<<1;
         /* Remainder = aSig - zSig0^2; while it is negative, step zSig0 down
            by one and adjust the remainder accordingly. */
5044     mul64To128( zSig0, zSig0, &term0, &term1 );
5045     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5046     while ( (sbits64) rem0 < 0 ) {
5047         --zSig0;
5048         doubleZSig0 -= 2;
5049         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5050     }
         /* Low 64 root bits, estimated from the remainder.  The exact
            remainder and correction are computed only when the low 13 bits of
            the estimate are <= 5; a zero estimate is first replaced by 1, and
            afterwards bit 0 of zSig1 is set if any remainder bit is
            non-zero. */
5051     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5052     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5053         if ( zSig1 == 0 ) zSig1 = 1;
5054         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5055         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5056         mul64To128( zSig1, zSig1, &term2, &term3 );
5057         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5058         while ( (sbits64) rem1 < 0 ) {
5059             --zSig1;
5060             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5061             term3 |= 1;
5062             term2 |= doubleZSig0;
5063             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5064         }
5065         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5066     }
         /* Shift the 128-bit root right by 14, collecting shifted-out bits in
            zSig2, then round and pack with a positive sign. */
5067     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5068     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5069
5070 }
5071
5072 /*----------------------------------------------------------------------------
5073 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5074 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5075 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5076 *----------------------------------------------------------------------------*/
5077
5078 int float128_eq( float128 a, float128 b STATUS_PARAM )
5079 {
5080
5081     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5082               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5083          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5084               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5085        ) {
5086         if (    float128_is_signaling_nan( a )
5087              || float128_is_signaling_nan( b ) ) {
5088             float_raise( float_flag_invalid STATUS_VAR);
5089         }
5090         return 0;
5091     }
5092     return
5093            ( a.low == b.low )
5094         && (    ( a.high == b.high )
5095              || (    ( a.low == 0 )
5096                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5097            );
5098
5099 }
5100
5101 /*----------------------------------------------------------------------------
5102 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5103 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5104 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5105 | Arithmetic.
5106 *----------------------------------------------------------------------------*/
5107
5108 int float128_le( float128 a, float128 b STATUS_PARAM )
5109 {
5110     flag aSign, bSign;
5111
5112     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5113               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5114          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5115               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5116        ) {
5117         float_raise( float_flag_invalid STATUS_VAR);
5118         return 0;
5119     }
5120     aSign = extractFloat128Sign( a );
5121     bSign = extractFloat128Sign( b );
5122     if ( aSign != bSign ) {
5123         return
5124                aSign
5125             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5126                  == 0 );
5127     }
5128     return
5129           aSign ? le128( b.high, b.low, a.high, a.low )
5130         : le128( a.high, a.low, b.high, b.low );
5131
5132 }
5133
5134 /*----------------------------------------------------------------------------
5135 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5136 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5137 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5138 *----------------------------------------------------------------------------*/
5139
5140 int float128_lt( float128 a, float128 b STATUS_PARAM )
5141 {
5142     flag aSign, bSign;
5143
5144     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5145               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5146          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5147               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5148        ) {
5149         float_raise( float_flag_invalid STATUS_VAR);
5150         return 0;
5151     }
5152     aSign = extractFloat128Sign( a );
5153     bSign = extractFloat128Sign( b );
5154     if ( aSign != bSign ) {
5155         return
5156                aSign
5157             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5158                  != 0 );
5159     }
5160     return
5161           aSign ? lt128( b.high, b.low, a.high, a.low )
5162         : lt128( a.high, a.low, b.high, b.low );
5163
5164 }
5165
5166 /*----------------------------------------------------------------------------
5167 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5168 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5169 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5170 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5172
5173 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5174 {
5175
5176     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5177               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5178          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5179               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5180        ) {
5181         float_raise( float_flag_invalid STATUS_VAR);
5182         return 0;
5183     }
5184     return
5185            ( a.low == b.low )
5186         && (    ( a.high == b.high )
5187              || (    ( a.low == 0 )
5188                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5189            );
5190
5191 }
5192
5193 /*----------------------------------------------------------------------------
5194 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5195 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5196 | cause an exception.  Otherwise, the comparison is performed according to the
5197 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5198 *----------------------------------------------------------------------------*/
5199
5200 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5201 {
5202     flag aSign, bSign;
5203
5204     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5205               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5206          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5207               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5208        ) {
5209         if (    float128_is_signaling_nan( a )
5210              || float128_is_signaling_nan( b ) ) {
5211             float_raise( float_flag_invalid STATUS_VAR);
5212         }
5213         return 0;
5214     }
5215     aSign = extractFloat128Sign( a );
5216     bSign = extractFloat128Sign( b );
5217     if ( aSign != bSign ) {
5218         return
5219                aSign
5220             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5221                  == 0 );
5222     }
5223     return
5224           aSign ? le128( b.high, b.low, a.high, a.low )
5225         : le128( a.high, a.low, b.high, b.low );
5226
5227 }
5228
5229 /*----------------------------------------------------------------------------
5230 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5231 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5232 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5233 | Standard for Binary Floating-Point Arithmetic.
5234 *----------------------------------------------------------------------------*/
5235
5236 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5237 {
5238     flag aSign, bSign;
5239
5240     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5241               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5242          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5243               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5244        ) {
5245         if (    float128_is_signaling_nan( a )
5246              || float128_is_signaling_nan( b ) ) {
5247             float_raise( float_flag_invalid STATUS_VAR);
5248         }
5249         return 0;
5250     }
5251     aSign = extractFloat128Sign( a );
5252     bSign = extractFloat128Sign( b );
5253     if ( aSign != bSign ) {
5254         return
5255                aSign
5256             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5257                  != 0 );
5258     }
5259     return
5260           aSign ? lt128( b.high, b.low, a.high, a.low )
5261         : lt128( a.high, a.low, b.high, b.low );
5262
5263 }
5264
5265 #endif
5266
5267 /* misc functions */
5268 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5269 {
5270     return int64_to_float32(a STATUS_VAR);
5271 }
5272
5273 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5274 {
5275     return int64_to_float64(a STATUS_VAR);
5276 }
5277
5278 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5279 {
5280     int64_t v;
5281     unsigned int res;
5282
5283     v = float32_to_int64(a STATUS_VAR);
5284     if (v < 0) {
5285         res = 0;
5286         float_raise( float_flag_invalid STATUS_VAR);
5287     } else if (v > 0xffffffff) {
5288         res = 0xffffffff;
5289         float_raise( float_flag_invalid STATUS_VAR);
5290     } else {
5291         res = v;
5292     }
5293     return res;
5294 }
5295
5296 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5297 {
5298     int64_t v;
5299     unsigned int res;
5300
5301     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5302     if (v < 0) {
5303         res = 0;
5304         float_raise( float_flag_invalid STATUS_VAR);
5305     } else if (v > 0xffffffff) {
5306         res = 0xffffffff;
5307         float_raise( float_flag_invalid STATUS_VAR);
5308     } else {
5309         res = v;
5310     }
5311     return res;
5312 }
5313
5314 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5315 {
5316     int64_t v;
5317     unsigned int res;
5318
5319     v = float64_to_int64(a STATUS_VAR);
5320     if (v < 0) {
5321         res = 0;
5322         float_raise( float_flag_invalid STATUS_VAR);
5323     } else if (v > 0xffffffff) {
5324         res = 0xffffffff;
5325         float_raise( float_flag_invalid STATUS_VAR);
5326     } else {
5327         res = v;
5328     }
5329     return res;
5330 }
5331
5332 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5333 {
5334     int64_t v;
5335     unsigned int res;
5336
5337     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5338     if (v < 0) {
5339         res = 0;
5340         float_raise( float_flag_invalid STATUS_VAR);
5341     } else if (v > 0xffffffff) {
5342         res = 0xffffffff;
5343         float_raise( float_flag_invalid STATUS_VAR);
5344     } else {
5345         res = v;
5346     }
5347     return res;
5348 }
5349
5350 /* FIXME: This looks broken.  */
5351 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5352 {
5353     int64_t v;
5354
5355     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5356     v += float64_val(a);
5357     v = float64_to_int64(make_float64(v) STATUS_VAR);
5358
5359     return v - INT64_MIN;
5360 }
5361
5362 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5363 {
5364     int64_t v;
5365
5366     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5367     v += float64_val(a);
5368     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5369
5370     return v - INT64_MIN;
5371 }
5372
/* COMPARE(s, nan_exp) generates three functions for the s-bit format, where
   nan_exp is the exponent value that, together with a non-zero fraction,
   marks a NaN:
     floatS_compare_internal(a, b, is_quiet)  -- the shared worker;
     floatS_compare(a, b)                     -- worker with is_quiet = 0;
     floatS_compare_quiet(a, b)               -- worker with is_quiet = 1.
   If either operand is a NaN the worker returns float_relation_unordered; the
   invalid flag is raised when is_quiet is 0 or when an operand is a signaling
   NaN.  Otherwise it returns float_relation_equal, or 1 / -1 computed from
   the signs and the raw bit patterns (presumably float_relation_greater and
   float_relation_less -- confirm against the enum in the header).  Operands
   of opposite sign whose bits are all zero apart from the sign compare
   equal.  */
5373 #define COMPARE(s, nan_exp)                                                  \
5374 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5375                                       int is_quiet STATUS_PARAM )            \
5376 {                                                                            \
5377     flag aSign, bSign;                                                       \
5378     bits ## s av, bv;                                                        \
5379                                                                              \
5380     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5381          extractFloat ## s ## Frac( a ) ) ||                                 \
5382         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5383           extractFloat ## s ## Frac( b ) )) {                                \
5384         if (!is_quiet ||                                                     \
5385             float ## s ## _is_signaling_nan( a ) ||                          \
5386             float ## s ## _is_signaling_nan( b ) ) {                         \
5387             float_raise( float_flag_invalid STATUS_VAR);                     \
5388         }                                                                    \
5389         return float_relation_unordered;                                     \
5390     }                                                                        \
5391     aSign = extractFloat ## s ## Sign( a );                                  \
5392     bSign = extractFloat ## s ## Sign( b );                                  \
5393     av = float ## s ## _val(a);                                              \
5394     bv = float ## s ## _val(b);                                              \
5395     if ( aSign != bSign ) {                                                  \
5396         if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5397             /* zero case: +0 and -0 compare equal */                         \
5398             return float_relation_equal;                                     \
5399         } else {                                                             \
5400             return 1 - (2 * aSign);                                          \
5401         }                                                                    \
5402     } else {                                                                 \
5403         if (av == bv) {                                                      \
5404             return float_relation_equal;                                     \
5405         } else {                                                             \
5406             return 1 - 2 * (aSign ^ ( av < bv ));                            \
5407         }                                                                    \
5408     }                                                                        \
5409 }                                                                            \
5410                                                                              \
5411 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5412 {                                                                            \
5413     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5414 }                                                                            \
5415                                                                              \
5416 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5417 {                                                                            \
5418     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5419 }
5420
5421 COMPARE(32, 0xff)
5422 COMPARE(64, 0x7ff)
5423
5424 /* Multiply A by 2 raised to the power N.  */
5425 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5426 {
5427     flag aSign;
5428     int16 aExp;
5429     bits32 aSig;
5430
5431     aSig = extractFloat32Frac( a );
5432     aExp = extractFloat32Exp( a );
5433     aSign = extractFloat32Sign( a );
5434
5435     if ( aExp == 0xFF ) {
5436         return a;
5437     }
5438     aExp += n;
5439     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5440 }
5441
5442 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5443 {
5444     flag aSign;
5445     int16 aExp;
5446     bits64 aSig;
5447
5448     aSig = extractFloat64Frac( a );
5449     aExp = extractFloat64Exp( a );
5450     aSign = extractFloat64Sign( a );
5451
5452     if ( aExp == 0x7FF ) {
5453         return a;
5454     }
5455     aExp += n;
5456     return roundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5457 }
5458
5459 #ifdef FLOATX80
5460 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5461 {
5462     flag aSign;
5463     int16 aExp;
5464     bits64 aSig;
5465
5466     aSig = extractFloatx80Frac( a );
5467     aExp = extractFloatx80Exp( a );
5468     aSign = extractFloatx80Sign( a );
5469
5470     if ( aExp == 0x7FF ) {
5471         return a;
5472     }
5473     aExp += n;
5474     return roundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5475                                  aSign, aExp, aSig, 0 STATUS_VAR );
5476 }
5477 #endif
5478
5479 #ifdef FLOAT128
5480 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5481 {
5482     flag aSign;
5483     int32 aExp;
5484     bits64 aSig0, aSig1;
5485
5486     aSig1 = extractFloat128Frac1( a );
5487     aSig0 = extractFloat128Frac0( a );
5488     aExp = extractFloat128Exp( a );
5489     aSign = extractFloat128Sign( a );
5490     if ( aExp == 0x7FFF ) {
5491         return a;
5492     }
5493     aExp += n;
5494     return roundAndPackFloat128( aSign, aExp, aSig0, aSig1, 0 STATUS_VAR );
5495
5496 }
5497 #endif