fpu/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 /* FIXME: Flush-To-Zero only effects results.  Denormal inputs should also
  34    be flushed to zero.  */
  35 #include "softfloat.h"
  36
  37 /*----------------------------------------------------------------------------
  38 | Primitive arithmetic functions, including multi-word arithmetic, and
  39 | division and square root approximations.  (Can be specialized to target if
  40 | desired.)
  41 *----------------------------------------------------------------------------*/
  42 #include "softfloat-macros.h"
  43
  44 /*----------------------------------------------------------------------------
  45 | Functions and definitions to determine:  (1) whether tininess for underflow
  46 | is detected before or after rounding by default, (2) what (if anything)
  47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  49 | are propagated from function inputs to output.  These details are target-
  50 | specific.
  51 *----------------------------------------------------------------------------*/
  52 #include "softfloat-specialize.h"
  53
  54 void set_float_rounding_mode(int val STATUS_PARAM)
  55 {
  56     STATUS(float_rounding_mode) = val;
  57 }
  58
  59 void set_float_exception_flags(int val STATUS_PARAM)
  60 {
  61     STATUS(float_exception_flags) = val;
  62 }
  63
  64 #ifdef FLOATX80
  65 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  66 {
  67     STATUS(floatx80_rounding_precision) = val;
  68 }
  69 #endif
  70
  71 /*----------------------------------------------------------------------------
  72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  73 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  74 | input.  If `zSign' is 1, the input is negated before being converted to an
  75 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  76 | is simply rounded to an integer, with the inexact exception raised if the
  77 | input cannot be represented exactly as an integer.  However, if the fixed-
  78 | point input is too large, the invalid exception is raised and the largest
  79 | positive or negative integer is returned.
  80 *----------------------------------------------------------------------------*/
  81
  82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
  83 {
  84     int8 roundingMode;
  85     flag roundNearestEven;
  86     int8 roundIncrement, roundBits;
  87     int32 z;
  88
  89     roundingMode = STATUS(float_rounding_mode);
  90     roundNearestEven = ( roundingMode == float_round_nearest_even );
  91     roundIncrement = 0x40;
  92     if ( ! roundNearestEven ) {
  93         if ( roundingMode == float_round_to_zero ) {
  94             roundIncrement = 0;
  95         }
  96         else {
  97             roundIncrement = 0x7F;
  98             if ( zSign ) {
  99                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 100             }
 101             else {
 102                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 103             }
 104         }
 105     }
 106     roundBits = absZ & 0x7F;
 107     absZ = ( absZ + roundIncrement )>>7;
 108     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 109     z = absZ;
 110     if ( zSign ) z = - z;
 111     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 112         float_raise( float_flag_invalid STATUS_VAR);
 113         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 114     }
 115     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 116     return z;
 117
 118 }
 119
 120 /*----------------------------------------------------------------------------
 121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 122 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 123 | and returns the properly rounded 64-bit integer corresponding to the input.
 124 | If `zSign' is 1, the input is negated before being converted to an integer.
 125 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 126 | the inexact exception raised if the input cannot be represented exactly as
 127 | an integer.  However, if the fixed-point input is too large, the invalid
 128 | exception is raised and the largest positive or negative integer is
 129 | returned.
 130 *----------------------------------------------------------------------------*/
 131
 132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 133 {
 134     int8 roundingMode;
 135     flag roundNearestEven, increment;
 136     int64 z;
 137
 138     roundingMode = STATUS(float_rounding_mode);
 139     roundNearestEven = ( roundingMode == float_round_nearest_even );
 140     increment = ( (sbits64) absZ1 < 0 );
 141     if ( ! roundNearestEven ) {
 142         if ( roundingMode == float_round_to_zero ) {
 143             increment = 0;
 144         }
 145         else {
 146             if ( zSign ) {
 147                 increment = ( roundingMode == float_round_down ) && absZ1;
 148             }
 149             else {
 150                 increment = ( roundingMode == float_round_up ) && absZ1;
 151             }
 152         }
 153     }
 154     if ( increment ) {
 155         ++absZ0;
 156         if ( absZ0 == 0 ) goto overflow;
 157         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 158     }
 159     z = absZ0;
 160     if ( zSign ) z = - z;
 161     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 162  overflow:
 163         float_raise( float_flag_invalid STATUS_VAR);
 164         return
 165               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 166             : LIT64( 0x7FFFFFFFFFFFFFFF );
 167     }
 168     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 169     return z;
 170
 171 }
 172
 173 /*----------------------------------------------------------------------------
 174 | Returns the fraction bits of the single-precision floating-point value `a'.
 175 *----------------------------------------------------------------------------*/
 176
 177 INLINE bits32 extractFloat32Frac( float32 a )
 178 {
 179
 180     return float32_val(a) & 0x007FFFFF;
 181
 182 }
 183
 184 /*----------------------------------------------------------------------------
 185 | Returns the exponent bits of the single-precision floating-point value `a'.
 186 *----------------------------------------------------------------------------*/
 187
 188 INLINE int16 extractFloat32Exp( float32 a )
 189 {
 190
 191     return ( float32_val(a)>>23 ) & 0xFF;
 192
 193 }
 194
 195 /*----------------------------------------------------------------------------
 196 | Returns the sign bit of the single-precision floating-point value `a'.
 197 *----------------------------------------------------------------------------*/
 198
 199 INLINE flag extractFloat32Sign( float32 a )
 200 {
 201
 202     return float32_val(a)>>31;
 203
 204 }
 205
 206 /*----------------------------------------------------------------------------
 207 | Normalizes the subnormal single-precision floating-point value represented
 208 | by the denormalized significand `aSig'.  The normalized exponent and
 209 | significand are stored at the locations pointed to by `zExpPtr' and
 210 | `zSigPtr', respectively.
 211 *----------------------------------------------------------------------------*/
 212
 213 static void
 214  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 215 {
 216     int8 shiftCount;
 217
 218     shiftCount = countLeadingZeros32( aSig ) - 8;
 219     *zSigPtr = aSig<<shiftCount;
 220     *zExpPtr = 1 - shiftCount;
 221
 222 }
 223
 224 /*----------------------------------------------------------------------------
 225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 226 | single-precision floating-point value, returning the result.  After being
 227 | shifted into the proper positions, the three fields are simply added
 228 | together to form the result.  This means that any integer portion of `zSig'
 229 | will be added into the exponent.  Since a properly normalized significand
 230 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 231 | than the desired result exponent whenever `zSig' is a complete, normalized
 232 | significand.
 233 *----------------------------------------------------------------------------*/
 234
 235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 236 {
 237
 238     return make_float32(
 239           ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
 240
 241 }
 242
 243 /*----------------------------------------------------------------------------
 244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 245 | and significand `zSig', and returns the proper single-precision floating-
 246 | point value corresponding to the abstract input.  Ordinarily, the abstract
 247 | value is simply rounded and packed into the single-precision format, with
 248 | the inexact exception raised if the abstract input cannot be represented
 249 | exactly.  However, if the abstract value is too large, the overflow and
 250 | inexact exceptions are raised and an infinity or maximal finite value is
 251 | returned.  If the abstract value is too small, the input value is rounded to
 252 | a subnormal number, and the underflow and inexact exceptions are raised if
 253 | the abstract input cannot be represented exactly as a subnormal single-
 254 | precision floating-point number.
 255 |     The input significand `zSig' has its binary point between bits 30
 256 | and 29, which is 7 bits to the left of the usual location.  This shifted
 257 | significand must be normalized or smaller.  If `zSig' is not normalized,
 258 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 259 | and it must not require rounding.  In the usual case that `zSig' is
 260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 261 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 262 | Binary Floating-Point Arithmetic.
 263 *----------------------------------------------------------------------------*/
 264
 265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 266 {
 267     int8 roundingMode;
 268     flag roundNearestEven;
 269     int8 roundIncrement, roundBits;
 270     flag isTiny;
 271
 272     roundingMode = STATUS(float_rounding_mode);
 273     roundNearestEven = ( roundingMode == float_round_nearest_even );
 274     roundIncrement = 0x40;
 275     if ( ! roundNearestEven ) {
 276         if ( roundingMode == float_round_to_zero ) {
 277             roundIncrement = 0;
 278         }
 279         else {
 280             roundIncrement = 0x7F;
 281             if ( zSign ) {
 282                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 283             }
 284             else {
 285                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 286             }
 287         }
 288     }
 289     roundBits = zSig & 0x7F;
 290     if ( 0xFD <= (bits16) zExp ) {
 291         if (    ( 0xFD < zExp )
 292              || (    ( zExp == 0xFD )
 293                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 294            ) {
 295             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 296             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 297         }
 298         if ( zExp < 0 ) {
 299             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 300             isTiny =
 301                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 302                 || ( zExp < -1 )
 303                 || ( zSig + roundIncrement < 0x80000000 );
 304             shift32RightJamming( zSig, - zExp, &zSig );
 305             zExp = 0;
 306             roundBits = zSig & 0x7F;
 307             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 308         }
 309     }
 310     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 311     zSig = ( zSig + roundIncrement )>>7;
 312     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 313     if ( zSig == 0 ) zExp = 0;
 314     return packFloat32( zSign, zExp, zSig );
 315
 316 }
 317
 318 /*----------------------------------------------------------------------------
 319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 320 | and significand `zSig', and returns the proper single-precision floating-
 321 | point value corresponding to the abstract input.  This routine is just like
 322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 324 | floating-point exponent.
 325 *----------------------------------------------------------------------------*/
 326
 327 static float32
 328  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 329 {
 330     int8 shiftCount;
 331
 332     shiftCount = countLeadingZeros32( zSig ) - 1;
 333     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 334
 335 }
 336
 337 /*----------------------------------------------------------------------------
 338 | Returns the fraction bits of the double-precision floating-point value `a'.
 339 *----------------------------------------------------------------------------*/
 340
 341 INLINE bits64 extractFloat64Frac( float64 a )
 342 {
 343
 344     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 345
 346 }
 347
 348 /*----------------------------------------------------------------------------
 349 | Returns the exponent bits of the double-precision floating-point value `a'.
 350 *----------------------------------------------------------------------------*/
 351
 352 INLINE int16 extractFloat64Exp( float64 a )
 353 {
 354
 355     return ( float64_val(a)>>52 ) & 0x7FF;
 356
 357 }
 358
 359 /*----------------------------------------------------------------------------
 360 | Returns the sign bit of the double-precision floating-point value `a'.
 361 *----------------------------------------------------------------------------*/
 362
 363 INLINE flag extractFloat64Sign( float64 a )
 364 {
 365
 366     return float64_val(a)>>63;
 367
 368 }
 369
 370 /*----------------------------------------------------------------------------
 371 | Normalizes the subnormal double-precision floating-point value represented
 372 | by the denormalized significand `aSig'.  The normalized exponent and
 373 | significand are stored at the locations pointed to by `zExpPtr' and
 374 | `zSigPtr', respectively.
 375 *----------------------------------------------------------------------------*/
 376
 377 static void
 378  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 379 {
 380     int8 shiftCount;
 381
 382     shiftCount = countLeadingZeros64( aSig ) - 11;
 383     *zSigPtr = aSig<<shiftCount;
 384     *zExpPtr = 1 - shiftCount;
 385
 386 }
 387
 388 /*----------------------------------------------------------------------------
 389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 390 | double-precision floating-point value, returning the result.  After being
 391 | shifted into the proper positions, the three fields are simply added
 392 | together to form the result.  This means that any integer portion of `zSig'
 393 | will be added into the exponent.  Since a properly normalized significand
 394 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 395 | than the desired result exponent whenever `zSig' is a complete, normalized
 396 | significand.
 397 *----------------------------------------------------------------------------*/
 398
 399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 400 {
 401
 402     return make_float64(
 403         ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
 404
 405 }
 406
 407 /*----------------------------------------------------------------------------
 408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 409 | and significand `zSig', and returns the proper double-precision floating-
 410 | point value corresponding to the abstract input.  Ordinarily, the abstract
 411 | value is simply rounded and packed into the double-precision format, with
 412 | the inexact exception raised if the abstract input cannot be represented
 413 | exactly.  However, if the abstract value is too large, the overflow and
 414 | inexact exceptions are raised and an infinity or maximal finite value is
 415 | returned.  If the abstract value is too small, the input value is rounded
 416 | to a subnormal number, and the underflow and inexact exceptions are raised
 417 | if the abstract input cannot be represented exactly as a subnormal double-
 418 | precision floating-point number.
 419 |     The input significand `zSig' has its binary point between bits 62
 420 | and 61, which is 10 bits to the left of the usual location.  This shifted
 421 | significand must be normalized or smaller.  If `zSig' is not normalized,
 422 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 423 | and it must not require rounding.  In the usual case that `zSig' is
 424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 425 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 426 | Binary Floating-Point Arithmetic.
 427 *----------------------------------------------------------------------------*/
 428
 429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 430 {
 431     int8 roundingMode;
 432     flag roundNearestEven;
 433     int16 roundIncrement, roundBits;
 434     flag isTiny;
 435
 436     roundingMode = STATUS(float_rounding_mode);
 437     roundNearestEven = ( roundingMode == float_round_nearest_even );
 438     roundIncrement = 0x200;
 439     if ( ! roundNearestEven ) {
 440         if ( roundingMode == float_round_to_zero ) {
 441             roundIncrement = 0;
 442         }
 443         else {
 444             roundIncrement = 0x3FF;
 445             if ( zSign ) {
 446                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 447             }
 448             else {
 449                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 450             }
 451         }
 452     }
 453     roundBits = zSig & 0x3FF;
 454     if ( 0x7FD <= (bits16) zExp ) {
 455         if (    ( 0x7FD < zExp )
 456              || (    ( zExp == 0x7FD )
 457                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 458            ) {
 459             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 460             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 461         }
 462         if ( zExp < 0 ) {
 463             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 464             isTiny =
 465                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 466                 || ( zExp < -1 )
 467                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 468             shift64RightJamming( zSig, - zExp, &zSig );
 469             zExp = 0;
 470             roundBits = zSig & 0x3FF;
 471             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 472         }
 473     }
 474     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 475     zSig = ( zSig + roundIncrement )>>10;
 476     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 477     if ( zSig == 0 ) zExp = 0;
 478     return packFloat64( zSign, zExp, zSig );
 479
 480 }
 481
 482 /*----------------------------------------------------------------------------
 483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 484 | and significand `zSig', and returns the proper double-precision floating-
 485 | point value corresponding to the abstract input.  This routine is just like
 486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 488 | floating-point exponent.
 489 *----------------------------------------------------------------------------*/
 490
 491 static float64
 492  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 493 {
 494     int8 shiftCount;
 495
 496     shiftCount = countLeadingZeros64( zSig ) - 1;
 497     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 498
 499 }
 500
 501 #ifdef FLOATX80
 502
 503 /*----------------------------------------------------------------------------
 504 | Returns the fraction bits of the extended double-precision floating-point
 505 | value `a'.
 506 *----------------------------------------------------------------------------*/
 507
 508 INLINE bits64 extractFloatx80Frac( floatx80 a )
 509 {
 510
 511     return a.low;
 512
 513 }
 514
 515 /*----------------------------------------------------------------------------
 516 | Returns the exponent bits of the extended double-precision floating-point
 517 | value `a'.
 518 *----------------------------------------------------------------------------*/
 519
 520 INLINE int32 extractFloatx80Exp( floatx80 a )
 521 {
 522
 523     return a.high & 0x7FFF;
 524
 525 }
 526
 527 /*----------------------------------------------------------------------------
 528 | Returns the sign bit of the extended double-precision floating-point value
 529 | `a'.
 530 *----------------------------------------------------------------------------*/
 531
 532 INLINE flag extractFloatx80Sign( floatx80 a )
 533 {
 534
 535     return a.high>>15;
 536
 537 }
 538
 539 /*----------------------------------------------------------------------------
 540 | Normalizes the subnormal extended double-precision floating-point value
 541 | represented by the denormalized significand `aSig'.  The normalized exponent
 542 | and significand are stored at the locations pointed to by `zExpPtr' and
 543 | `zSigPtr', respectively.
 544 *----------------------------------------------------------------------------*/
 545
 546 static void
 547  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 548 {
 549     int8 shiftCount;
 550
 551     shiftCount = countLeadingZeros64( aSig );
 552     *zSigPtr = aSig<<shiftCount;
 553     *zExpPtr = 1 - shiftCount;
 554
 555 }
 556
 557 /*----------------------------------------------------------------------------
 558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 559 | extended double-precision floating-point value, returning the result.
 560 *----------------------------------------------------------------------------*/
 561
 562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 563 {
 564     floatx80 z;
 565
 566     z.low = zSig;
 567     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 568     return z;
 569
 570 }
 571
 572 /*----------------------------------------------------------------------------
 573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 574 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 575 | and returns the proper extended double-precision floating-point value
 576 | corresponding to the abstract input.  Ordinarily, the abstract value is
 577 | rounded and packed into the extended double-precision format, with the
 578 | inexact exception raised if the abstract input cannot be represented
 579 | exactly.  However, if the abstract value is too large, the overflow and
 580 | inexact exceptions are raised and an infinity or maximal finite value is
 581 | returned.  If the abstract value is too small, the input value is rounded to
 582 | a subnormal number, and the underflow and inexact exceptions are raised if
 583 | the abstract input cannot be represented exactly as a subnormal extended
 584 | double-precision floating-point number.
 585 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 586 | number of bits as single or double precision, respectively.  Otherwise, the
 587 | result is rounded to the full precision of the extended double-precision
 588 | format.
 589 |     The input significand must be normalized or smaller.  If the input
 590 | significand is not normalized, `zExp' must be 0; in that case, the result
 591 | returned is a subnormal number, and it must not require rounding.  The
 592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 593 | Floating-Point Arithmetic.
 594 *----------------------------------------------------------------------------*/
 595
 596 static floatx80
 597  roundAndPackFloatx80(
 598      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 599  STATUS_PARAM)
 600 {
 601     int8 roundingMode;
 602     flag roundNearestEven, increment, isTiny;
 603     int64 roundIncrement, roundMask, roundBits;
 604
 605     roundingMode = STATUS(float_rounding_mode);
 606     roundNearestEven = ( roundingMode == float_round_nearest_even );
 607     if ( roundingPrecision == 80 ) goto precision80;
 608     if ( roundingPrecision == 64 ) {
 609         roundIncrement = LIT64( 0x0000000000000400 );
 610         roundMask = LIT64( 0x00000000000007FF );
 611     }
 612     else if ( roundingPrecision == 32 ) {
 613         roundIncrement = LIT64( 0x0000008000000000 );
 614         roundMask = LIT64( 0x000000FFFFFFFFFF );
 615     }
 616     else {
 617         goto precision80;
 618     }
 619     zSig0 |= ( zSig1 != 0 );
 620     if ( ! roundNearestEven ) {
 621         if ( roundingMode == float_round_to_zero ) {
 622             roundIncrement = 0;
 623         }
 624         else {
 625             roundIncrement = roundMask;
 626             if ( zSign ) {
 627                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 628             }
 629             else {
 630                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 631             }
 632         }
 633     }
 634     roundBits = zSig0 & roundMask;
 635     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 636         if (    ( 0x7FFE < zExp )
 637              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 638            ) {
 639             goto overflow;
 640         }
 641         if ( zExp <= 0 ) {
 642             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
 643             isTiny =
 644                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 645                 || ( zExp < 0 )
 646                 || ( zSig0 <= zSig0 + roundIncrement );
 647             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 648             zExp = 0;
 649             roundBits = zSig0 & roundMask;
 650             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 651             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 652             zSig0 += roundIncrement;
 653             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 654             roundIncrement = roundMask + 1;
 655             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 656                 roundMask |= roundIncrement;
 657             }
 658             zSig0 &= ~ roundMask;
 659             return packFloatx80( zSign, zExp, zSig0 );
 660         }
 661     }
 662     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 663     zSig0 += roundIncrement;
 664     if ( zSig0 < roundIncrement ) {
 665         ++zExp;
 666         zSig0 = LIT64( 0x8000000000000000 );
 667     }
 668     roundIncrement = roundMask + 1;
 669     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 670         roundMask |= roundIncrement;
 671     }
 672     zSig0 &= ~ roundMask;
 673     if ( zSig0 == 0 ) zExp = 0;
 674     return packFloatx80( zSign, zExp, zSig0 );
 675  precision80:
 676     increment = ( (sbits64) zSig1 < 0 );
 677     if ( ! roundNearestEven ) {
 678         if ( roundingMode == float_round_to_zero ) {
 679             increment = 0;
 680         }
 681         else {
 682             if ( zSign ) {
 683                 increment = ( roundingMode == float_round_down ) && zSig1;
 684             }
 685             else {
 686                 increment = ( roundingMode == float_round_up ) && zSig1;
 687             }
 688         }
 689     }
 690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 691         if (    ( 0x7FFE < zExp )
 692              || (    ( zExp == 0x7FFE )
 693                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 694                   && increment
 695                 )
 696            ) {
 697             roundMask = 0;
 698  overflow:
 699             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 700             if (    ( roundingMode == float_round_to_zero )
 701                  || ( zSign && ( roundingMode == float_round_up ) )
 702                  || ( ! zSign && ( roundingMode == float_round_down ) )
 703                ) {
 704                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 705             }
 706             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 707         }
 708         if ( zExp <= 0 ) {
 709             isTiny =
 710                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 711                 || ( zExp < 0 )
 712                 || ! increment
 713                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 714             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 715             zExp = 0;
 716             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 717             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 718             if ( roundNearestEven ) {
 719                 increment = ( (sbits64) zSig1 < 0 );
 720             }
 721             else {
 722                 if ( zSign ) {
 723                     increment = ( roundingMode == float_round_down ) && zSig1;
 724                 }
 725                 else {
 726                     increment = ( roundingMode == float_round_up ) && zSig1;
 727                 }
 728             }
 729             if ( increment ) {
 730                 ++zSig0;
 731                 zSig0 &=
 732                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 733                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 734             }
 735             return packFloatx80( zSign, zExp, zSig0 );
 736         }
 737     }
 738     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 739     if ( increment ) {
 740         ++zSig0;
 741         if ( zSig0 == 0 ) {
 742             ++zExp;
 743             zSig0 = LIT64( 0x8000000000000000 );
 744         }
 745         else {
 746             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 747         }
 748     }
 749     else {
 750         if ( zSig0 == 0 ) zExp = 0;
 751     }
 752     return packFloatx80( zSign, zExp, zSig0 );
 753
 754 }
 755
 756 /*----------------------------------------------------------------------------
 757 | Takes an abstract floating-point value having sign `zSign', exponent
 758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 759 | and returns the proper extended double-precision floating-point value
 760 | corresponding to the abstract input.  This routine is just like
 761 | `roundAndPackFloatx80' except that the input significand does not have to be
 762 | normalized.
 763 *----------------------------------------------------------------------------*/
 764
 765 static floatx80
 766  normalizeRoundAndPackFloatx80(
 767      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 768  STATUS_PARAM)
 769 {
 770     int8 shiftCount;
 771
 772     if ( zSig0 == 0 ) {
 773         zSig0 = zSig1;
 774         zSig1 = 0;
 775         zExp -= 64;
 776     }
 777     shiftCount = countLeadingZeros64( zSig0 );
 778     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 779     zExp -= shiftCount;
 780     return
 781         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 782
 783 }
 784
 785 #endif
 786
 787 #ifdef FLOAT128
 788
 789 /*----------------------------------------------------------------------------
 790 | Returns the least-significant 64 fraction bits of the quadruple-precision
 791 | floating-point value `a'.
 792 *----------------------------------------------------------------------------*/
 793
 794 INLINE bits64 extractFloat128Frac1( float128 a )
 795 {
 796
 797     return a.low;
 798
 799 }
 800
 801 /*----------------------------------------------------------------------------
 802 | Returns the most-significant 48 fraction bits of the quadruple-precision
 803 | floating-point value `a'.
 804 *----------------------------------------------------------------------------*/
 805
 806 INLINE bits64 extractFloat128Frac0( float128 a )
 807 {
 808
 809     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 810
 811 }
 812
 813 /*----------------------------------------------------------------------------
 814 | Returns the exponent bits of the quadruple-precision floating-point value
 815 | `a'.
 816 *----------------------------------------------------------------------------*/
 817
 818 INLINE int32 extractFloat128Exp( float128 a )
 819 {
 820
 821     return ( a.high>>48 ) & 0x7FFF;
 822
 823 }
 824
 825 /*----------------------------------------------------------------------------
 826 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 827 *----------------------------------------------------------------------------*/
 828
 829 INLINE flag extractFloat128Sign( float128 a )
 830 {
 831
 832     return a.high>>63;
 833
 834 }
 835
 836 /*----------------------------------------------------------------------------
 837 | Normalizes the subnormal quadruple-precision floating-point value
 838 | represented by the denormalized significand formed by the concatenation of
 839 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 840 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 841 | significand are stored at the location pointed to by `zSig0Ptr', and the
 842 | least significant 64 bits of the normalized significand are stored at the
 843 | location pointed to by `zSig1Ptr'.
 844 *----------------------------------------------------------------------------*/
 845
 846 static void
 847  normalizeFloat128Subnormal(
 848      bits64 aSig0,
 849      bits64 aSig1,
 850      int32 *zExpPtr,
 851      bits64 *zSig0Ptr,
 852      bits64 *zSig1Ptr
 853  )
 854 {
 855     int8 shiftCount;
 856
 857     if ( aSig0 == 0 ) {
 858         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 859         if ( shiftCount < 0 ) {
 860             *zSig0Ptr = aSig1>>( - shiftCount );
 861             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 862         }
 863         else {
 864             *zSig0Ptr = aSig1<<shiftCount;
 865             *zSig1Ptr = 0;
 866         }
 867         *zExpPtr = - shiftCount - 63;
 868     }
 869     else {
 870         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 871         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 872         *zExpPtr = 1 - shiftCount;
 873     }
 874
 875 }
 876
 877 /*----------------------------------------------------------------------------
 878 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 880 | floating-point value, returning the result.  After being shifted into the
 881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 882 | added together to form the most significant 32 bits of the result.  This
 883 | means that any integer portion of `zSig0' will be added into the exponent.
 884 | Since a properly normalized significand will have an integer portion equal
 885 | to 1, the `zExp' input should be 1 less than the desired result exponent
 886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 887 | significand.
 888 *----------------------------------------------------------------------------*/
 889
 890 INLINE float128
 891  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 892 {
 893     float128 z;
 894
 895     z.low = zSig1;
 896     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 897     return z;
 898
 899 }
 900
 901 /*----------------------------------------------------------------------------
 902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 903 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 904 | and `zSig2', and returns the proper quadruple-precision floating-point value
 905 | corresponding to the abstract input.  Ordinarily, the abstract value is
 906 | simply rounded and packed into the quadruple-precision format, with the
 907 | inexact exception raised if the abstract input cannot be represented
 908 | exactly.  However, if the abstract value is too large, the overflow and
 909 | inexact exceptions are raised and an infinity or maximal finite value is
 910 | returned.  If the abstract value is too small, the input value is rounded to
 911 | a subnormal number, and the underflow and inexact exceptions are raised if
 912 | the abstract input cannot be represented exactly as a subnormal quadruple-
 913 | precision floating-point number.
 914 |     The input significand must be normalized or smaller.  If the input
 915 | significand is not normalized, `zExp' must be 0; in that case, the result
 916 | returned is a subnormal number, and it must not require rounding.  In the
 917 | usual case that the input significand is normalized, `zExp' must be 1 less
 918 | than the ``true'' floating-point exponent.  The handling of underflow and
 919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 920 *----------------------------------------------------------------------------*/
 921
 922 static float128
 923  roundAndPackFloat128(
 924      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 925 {
 926     int8 roundingMode;
 927     flag roundNearestEven, increment, isTiny;
 928
 929     roundingMode = STATUS(float_rounding_mode);
 930     roundNearestEven = ( roundingMode == float_round_nearest_even );
 931     increment = ( (sbits64) zSig2 < 0 );
 932     if ( ! roundNearestEven ) {
 933         if ( roundingMode == float_round_to_zero ) {
 934             increment = 0;
 935         }
 936         else {
 937             if ( zSign ) {
 938                 increment = ( roundingMode == float_round_down ) && zSig2;
 939             }
 940             else {
 941                 increment = ( roundingMode == float_round_up ) && zSig2;
 942             }
 943         }
 944     }
 945     if ( 0x7FFD <= (bits32) zExp ) {
 946         if (    ( 0x7FFD < zExp )
 947              || (    ( zExp == 0x7FFD )
 948                   && eq128(
 949                          LIT64( 0x0001FFFFFFFFFFFF ),
 950                          LIT64( 0xFFFFFFFFFFFFFFFF ),
 951                          zSig0,
 952                          zSig1
 953                      )
 954                   && increment
 955                 )
 956            ) {
 957             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 958             if (    ( roundingMode == float_round_to_zero )
 959                  || ( zSign && ( roundingMode == float_round_up ) )
 960                  || ( ! zSign && ( roundingMode == float_round_down ) )
 961                ) {
 962                 return
 963                     packFloat128(
 964                         zSign,
 965                         0x7FFE,
 966                         LIT64( 0x0000FFFFFFFFFFFF ),
 967                         LIT64( 0xFFFFFFFFFFFFFFFF )
 968                     );
 969             }
 970             return packFloat128( zSign, 0x7FFF, 0, 0 );
 971         }
 972         if ( zExp < 0 ) {
 973             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
 974             isTiny =
 975                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 976                 || ( zExp < -1 )
 977                 || ! increment
 978                 || lt128(
 979                        zSig0,
 980                        zSig1,
 981                        LIT64( 0x0001FFFFFFFFFFFF ),
 982                        LIT64( 0xFFFFFFFFFFFFFFFF )
 983                    );
 984             shift128ExtraRightJamming(
 985                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 986             zExp = 0;
 987             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
 988             if ( roundNearestEven ) {
 989                 increment = ( (sbits64) zSig2 < 0 );
 990             }
 991             else {
 992                 if ( zSign ) {
 993                     increment = ( roundingMode == float_round_down ) && zSig2;
 994                 }
 995                 else {
 996                     increment = ( roundingMode == float_round_up ) && zSig2;
 997                 }
 998             }
 999         }
1000     }
1001     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1002     if ( increment ) {
1003         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1005     }
1006     else {
1007         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1008     }
1009     return packFloat128( zSign, zExp, zSig0, zSig1 );
1010
1011 }
1012
1013 /*----------------------------------------------------------------------------
1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1016 | returns the proper quadruple-precision floating-point value corresponding
1017 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1018 | except that the input significand has fewer bits and does not have to be
1019 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1020 | point exponent.
1021 *----------------------------------------------------------------------------*/
1022
1023 static float128
1024  normalizeRoundAndPackFloat128(
1025      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1026 {
1027     int8 shiftCount;
1028     bits64 zSig2;
1029
1030     if ( zSig0 == 0 ) {
1031         zSig0 = zSig1;
1032         zSig1 = 0;
1033         zExp -= 64;
1034     }
1035     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036     if ( 0 <= shiftCount ) {
1037         zSig2 = 0;
1038         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1039     }
1040     else {
1041         shift128ExtraRightJamming(
1042             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1043     }
1044     zExp -= shiftCount;
1045     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1046
1047 }
1048
1049 #endif
1050
1051 /*----------------------------------------------------------------------------
1052 | Returns the result of converting the 32-bit two's complement integer `a'
1053 | to the single-precision floating-point format.  The conversion is performed
1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055 *----------------------------------------------------------------------------*/
1056
1057 float32 int32_to_float32( int32 a STATUS_PARAM )
1058 {
1059     flag zSign;
1060
1061     if ( a == 0 ) return float32_zero;
1062     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063     zSign = ( a < 0 );
1064     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1065
1066 }
1067
1068 /*----------------------------------------------------------------------------
1069 | Returns the result of converting the 32-bit two's complement integer `a'
1070 | to the double-precision floating-point format.  The conversion is performed
1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072 *----------------------------------------------------------------------------*/
1073
1074 float64 int32_to_float64( int32 a STATUS_PARAM )
1075 {
1076     flag zSign;
1077     uint32 absA;
1078     int8 shiftCount;
1079     bits64 zSig;
1080
1081     if ( a == 0 ) return float64_zero;
1082     zSign = ( a < 0 );
1083     absA = zSign ? - a : a;
1084     shiftCount = countLeadingZeros32( absA ) + 21;
1085     zSig = absA;
1086     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1087
1088 }
1089
1090 #ifdef FLOATX80
1091
1092 /*----------------------------------------------------------------------------
1093 | Returns the result of converting the 32-bit two's complement integer `a'
1094 | to the extended double-precision floating-point format.  The conversion
1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096 | Arithmetic.
1097 *----------------------------------------------------------------------------*/
1098
1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1100 {
1101     flag zSign;
1102     uint32 absA;
1103     int8 shiftCount;
1104     bits64 zSig;
1105
1106     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107     zSign = ( a < 0 );
1108     absA = zSign ? - a : a;
1109     shiftCount = countLeadingZeros32( absA ) + 32;
1110     zSig = absA;
1111     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1112
1113 }
1114
1115 #endif
1116
1117 #ifdef FLOAT128
1118
1119 /*----------------------------------------------------------------------------
1120 | Returns the result of converting the 32-bit two's complement integer `a' to
1121 | the quadruple-precision floating-point format.  The conversion is performed
1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123 *----------------------------------------------------------------------------*/
1124
1125 float128 int32_to_float128( int32 a STATUS_PARAM )
1126 {
1127     flag zSign;
1128     uint32 absA;
1129     int8 shiftCount;
1130     bits64 zSig0;
1131
1132     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133     zSign = ( a < 0 );
1134     absA = zSign ? - a : a;
1135     shiftCount = countLeadingZeros32( absA ) + 17;
1136     zSig0 = absA;
1137     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1138
1139 }
1140
1141 #endif
1142
1143 /*----------------------------------------------------------------------------
1144 | Returns the result of converting the 64-bit two's complement integer `a'
1145 | to the single-precision floating-point format.  The conversion is performed
1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147 *----------------------------------------------------------------------------*/
1148
1149 float32 int64_to_float32( int64 a STATUS_PARAM )
1150 {
1151     flag zSign;
1152     uint64 absA;
1153     int8 shiftCount;
1154
1155     if ( a == 0 ) return float32_zero;
1156     zSign = ( a < 0 );
1157     absA = zSign ? - a : a;
1158     shiftCount = countLeadingZeros64( absA ) - 40;
1159     if ( 0 <= shiftCount ) {
1160         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1161     }
1162     else {
1163         shiftCount += 7;
1164         if ( shiftCount < 0 ) {
1165             shift64RightJamming( absA, - shiftCount, &absA );
1166         }
1167         else {
1168             absA <<= shiftCount;
1169         }
1170         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1171     }
1172
1173 }
1174
1175 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1176 {
1177     int8 shiftCount;
1178
1179     if ( a == 0 ) return float32_zero;
1180     shiftCount = countLeadingZeros64( a ) - 40;
1181     if ( 0 <= shiftCount ) {
1182         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1183     }
1184     else {
1185         shiftCount += 7;
1186         if ( shiftCount < 0 ) {
1187             shift64RightJamming( a, - shiftCount, &a );
1188         }
1189         else {
1190             a <<= shiftCount;
1191         }
1192         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1193     }
1194 }
1195
1196 /*----------------------------------------------------------------------------
1197 | Returns the result of converting the 64-bit two's complement integer `a'
1198 | to the double-precision floating-point format.  The conversion is performed
1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200 *----------------------------------------------------------------------------*/
1201
1202 float64 int64_to_float64( int64 a STATUS_PARAM )
1203 {
1204     flag zSign;
1205
1206     if ( a == 0 ) return float64_zero;
1207     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208         return packFloat64( 1, 0x43E, 0 );
1209     }
1210     zSign = ( a < 0 );
1211     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1212
1213 }
1214
1215 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1216 {
1217     if ( a == 0 ) return float64_zero;
1218     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1219
1220 }
1221
1222 #ifdef FLOATX80
1223
1224 /*----------------------------------------------------------------------------
1225 | Returns the result of converting the 64-bit two's complement integer `a'
1226 | to the extended double-precision floating-point format.  The conversion
1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228 | Arithmetic.
1229 *----------------------------------------------------------------------------*/
1230
1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1232 {
1233     flag zSign;
1234     uint64 absA;
1235     int8 shiftCount;
1236
1237     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238     zSign = ( a < 0 );
1239     absA = zSign ? - a : a;
1240     shiftCount = countLeadingZeros64( absA );
1241     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1242
1243 }
1244
1245 #endif
1246
1247 #ifdef FLOAT128
1248
1249 /*----------------------------------------------------------------------------
1250 | Returns the result of converting the 64-bit two's complement integer `a' to
1251 | the quadruple-precision floating-point format.  The conversion is performed
1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253 *----------------------------------------------------------------------------*/
1254
1255 float128 int64_to_float128( int64 a STATUS_PARAM )
1256 {
1257     flag zSign;
1258     uint64 absA;
1259     int8 shiftCount;
1260     int32 zExp;
1261     bits64 zSig0, zSig1;
1262
1263     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264     zSign = ( a < 0 );
1265     absA = zSign ? - a : a;
1266     shiftCount = countLeadingZeros64( absA ) + 49;
1267     zExp = 0x406E - shiftCount;
1268     if ( 64 <= shiftCount ) {
1269         zSig1 = 0;
1270         zSig0 = absA;
1271         shiftCount -= 64;
1272     }
1273     else {
1274         zSig1 = absA;
1275         zSig0 = 0;
1276     }
1277     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278     return packFloat128( zSign, zExp, zSig0, zSig1 );
1279
1280 }
1281
1282 #endif
1283
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the single-precision floating-point value
1286 | `a' to the 32-bit two's complement integer format.  The conversion is
1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic---which means in particular that the conversion is rounded
1289 | according to the current rounding mode.  If `a' is a NaN, the largest
1290 | positive integer is returned.  Otherwise, if the conversion overflows, the
1291 | largest integer with the same sign as `a' is returned.
1292 *----------------------------------------------------------------------------*/
1293
1294 int32 float32_to_int32( float32 a STATUS_PARAM )
1295 {
1296     flag aSign;
1297     int16 aExp, shiftCount;
1298     bits32 aSig;
1299     bits64 aSig64;
1300
1301     aSig = extractFloat32Frac( a );
1302     aExp = extractFloat32Exp( a );
1303     aSign = extractFloat32Sign( a );
1304     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305     if ( aExp ) aSig |= 0x00800000;
1306     shiftCount = 0xAF - aExp;
1307     aSig64 = aSig;
1308     aSig64 <<= 32;
1309     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1311
1312 }
1313
1314 /*----------------------------------------------------------------------------
1315 | Returns the result of converting the single-precision floating-point value
1316 | `a' to the 32-bit two's complement integer format.  The conversion is
1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 | Arithmetic, except that the conversion is always rounded toward zero.
1319 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1320 | the conversion overflows, the largest integer with the same sign as `a' is
1321 | returned.
1322 *----------------------------------------------------------------------------*/
1323
1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1325 {
1326     flag aSign;
1327     int16 aExp, shiftCount;
1328     bits32 aSig;
1329     int32 z;
1330
1331     aSig = extractFloat32Frac( a );
1332     aExp = extractFloat32Exp( a );
1333     aSign = extractFloat32Sign( a );
1334     shiftCount = aExp - 0x9E;
1335     if ( 0 <= shiftCount ) {
1336         if ( float32_val(a) != 0xCF000000 ) {
1337             float_raise( float_flag_invalid STATUS_VAR);
1338             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1339         }
1340         return (sbits32) 0x80000000;
1341     }
1342     else if ( aExp <= 0x7E ) {
1343         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1344         return 0;
1345     }
1346     aSig = ( aSig | 0x00800000 )<<8;
1347     z = aSig>>( - shiftCount );
1348     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349         STATUS(float_exception_flags) |= float_flag_inexact;
1350     }
1351     if ( aSign ) z = - z;
1352     return z;
1353
1354 }
1355
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 64-bit two's complement integer format.  The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic---which means in particular that the conversion is rounded
1361 | according to the current rounding mode.  If `a' is a NaN, the largest
1362 | positive integer is returned.  Otherwise, if the conversion overflows, the
1363 | largest integer with the same sign as `a' is returned.
1364 *----------------------------------------------------------------------------*/
1365
1366 int64 float32_to_int64( float32 a STATUS_PARAM )
1367 {
1368     flag aSign;
1369     int16 aExp, shiftCount;
1370     bits32 aSig;
1371     bits64 aSig64, aSigExtra;
1372
1373     aSig = extractFloat32Frac( a );
1374     aExp = extractFloat32Exp( a );
1375     aSign = extractFloat32Sign( a );
1376     shiftCount = 0xBE - aExp;
1377     if ( shiftCount < 0 ) {
1378         float_raise( float_flag_invalid STATUS_VAR);
1379         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1380             return LIT64( 0x7FFFFFFFFFFFFFFF );
1381         }
1382         return (sbits64) LIT64( 0x8000000000000000 );
1383     }
1384     if ( aExp ) aSig |= 0x00800000;
1385     aSig64 = aSig;
1386     aSig64 <<= 40;
1387     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1388     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1389
1390 }
1391
1392 /*----------------------------------------------------------------------------
1393 | Returns the result of converting the single-precision floating-point value
1394 | `a' to the 64-bit two's complement integer format.  The conversion is
1395 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1396 | Arithmetic, except that the conversion is always rounded toward zero.  If
1397 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1398 | conversion overflows, the largest integer with the same sign as `a' is
1399 | returned.
1400 *----------------------------------------------------------------------------*/
1401
1402 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1403 {
1404     flag aSign;
1405     int16 aExp, shiftCount;
1406     bits32 aSig;
1407     bits64 aSig64;
1408     int64 z;
1409
1410     aSig = extractFloat32Frac( a );
1411     aExp = extractFloat32Exp( a );
1412     aSign = extractFloat32Sign( a );
1413     shiftCount = aExp - 0xBE;
1414     if ( 0 <= shiftCount ) {
1415         if ( float32_val(a) != 0xDF000000 ) {
1416             float_raise( float_flag_invalid STATUS_VAR);
1417             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1418                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1419             }
1420         }
1421         return (sbits64) LIT64( 0x8000000000000000 );
1422     }
1423     else if ( aExp <= 0x7E ) {
1424         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1425         return 0;
1426     }
1427     aSig64 = aSig | 0x00800000;
1428     aSig64 <<= 40;
1429     z = aSig64>>( - shiftCount );
1430     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1431         STATUS(float_exception_flags) |= float_flag_inexact;
1432     }
1433     if ( aSign ) z = - z;
1434     return z;
1435
1436 }
1437
1438 /*----------------------------------------------------------------------------
1439 | Returns the result of converting the single-precision floating-point value
1440 | `a' to the double-precision floating-point format.  The conversion is
1441 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1442 | Arithmetic.
1443 *----------------------------------------------------------------------------*/
1444
1445 float64 float32_to_float64( float32 a STATUS_PARAM )
1446 {
1447     flag aSign;
1448     int16 aExp;
1449     bits32 aSig;
1450
1451     aSig = extractFloat32Frac( a );
1452     aExp = extractFloat32Exp( a );
1453     aSign = extractFloat32Sign( a );
1454     if ( aExp == 0xFF ) {
1455         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1456         return packFloat64( aSign, 0x7FF, 0 );
1457     }
1458     if ( aExp == 0 ) {
1459         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1460         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1461         --aExp;
1462     }
1463     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1464
1465 }
1466
1467 #ifdef FLOATX80
1468
1469 /*----------------------------------------------------------------------------
1470 | Returns the result of converting the single-precision floating-point value
1471 | `a' to the extended double-precision floating-point format.  The conversion
1472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1473 | Arithmetic.
1474 *----------------------------------------------------------------------------*/
1475
1476 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1477 {
1478     flag aSign;
1479     int16 aExp;
1480     bits32 aSig;
1481
1482     aSig = extractFloat32Frac( a );
1483     aExp = extractFloat32Exp( a );
1484     aSign = extractFloat32Sign( a );
1485     if ( aExp == 0xFF ) {
1486         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1487         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1488     }
1489     if ( aExp == 0 ) {
1490         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1491         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1492     }
1493     aSig |= 0x00800000;
1494     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1495
1496 }
1497
1498 #endif
1499
1500 #ifdef FLOAT128
1501
1502 /*----------------------------------------------------------------------------
1503 | Returns the result of converting the single-precision floating-point value
1504 | `a' to the double-precision floating-point format.  The conversion is
1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1506 | Arithmetic.
1507 *----------------------------------------------------------------------------*/
1508
1509 float128 float32_to_float128( float32 a STATUS_PARAM )
1510 {
1511     flag aSign;
1512     int16 aExp;
1513     bits32 aSig;
1514
1515     aSig = extractFloat32Frac( a );
1516     aExp = extractFloat32Exp( a );
1517     aSign = extractFloat32Sign( a );
1518     if ( aExp == 0xFF ) {
1519         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1520         return packFloat128( aSign, 0x7FFF, 0, 0 );
1521     }
1522     if ( aExp == 0 ) {
1523         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1524         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525         --aExp;
1526     }
1527     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1528
1529 }
1530
1531 #endif
1532
1533 /*----------------------------------------------------------------------------
1534 | Rounds the single-precision floating-point value `a' to an integer, and
1535 | returns the result as a single-precision floating-point value.  The
1536 | operation is performed according to the IEC/IEEE Standard for Binary
1537 | Floating-Point Arithmetic.
1538 *----------------------------------------------------------------------------*/
1539
1540 float32 float32_round_to_int( float32 a STATUS_PARAM)
1541 {
1542     flag aSign;
1543     int16 aExp;
1544     bits32 lastBitMask, roundBitsMask;
1545     int8 roundingMode;
1546     bits32 z;
1547
1548     aExp = extractFloat32Exp( a );
1549     if ( 0x96 <= aExp ) {
1550         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1551             return propagateFloat32NaN( a, a STATUS_VAR );
1552         }
1553         return a;
1554     }
1555     if ( aExp <= 0x7E ) {
1556         if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1557         STATUS(float_exception_flags) |= float_flag_inexact;
1558         aSign = extractFloat32Sign( a );
1559         switch ( STATUS(float_rounding_mode) ) {
1560          case float_round_nearest_even:
1561             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1562                 return packFloat32( aSign, 0x7F, 0 );
1563             }
1564             break;
1565          case float_round_down:
1566             return make_float32(aSign ? 0xBF800000 : 0);
1567          case float_round_up:
1568             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1569         }
1570         return packFloat32( aSign, 0, 0 );
1571     }
1572     lastBitMask = 1;
1573     lastBitMask <<= 0x96 - aExp;
1574     roundBitsMask = lastBitMask - 1;
1575     z = float32_val(a);
1576     roundingMode = STATUS(float_rounding_mode);
1577     if ( roundingMode == float_round_nearest_even ) {
1578         z += lastBitMask>>1;
1579         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1580     }
1581     else if ( roundingMode != float_round_to_zero ) {
1582         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1583             z += roundBitsMask;
1584         }
1585     }
1586     z &= ~ roundBitsMask;
1587     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1588     return make_float32(z);
1589
1590 }
1591
1592 /*----------------------------------------------------------------------------
1593 | Returns the result of adding the absolute values of the single-precision
1594 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1595 | before being returned.  `zSign' is ignored if the result is a NaN.
1596 | The addition is performed according to the IEC/IEEE Standard for Binary
1597 | Floating-Point Arithmetic.
1598 *----------------------------------------------------------------------------*/
1599
1600 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1601 {
1602     int16 aExp, bExp, zExp;
1603     bits32 aSig, bSig, zSig;
1604     int16 expDiff;
1605
1606     aSig = extractFloat32Frac( a );
1607     aExp = extractFloat32Exp( a );
1608     bSig = extractFloat32Frac( b );
1609     bExp = extractFloat32Exp( b );
1610     expDiff = aExp - bExp;
1611     aSig <<= 6;
1612     bSig <<= 6;
1613     if ( 0 < expDiff ) {
1614         if ( aExp == 0xFF ) {
1615             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1616             return a;
1617         }
1618         if ( bExp == 0 ) {
1619             --expDiff;
1620         }
1621         else {
1622             bSig |= 0x20000000;
1623         }
1624         shift32RightJamming( bSig, expDiff, &bSig );
1625         zExp = aExp;
1626     }
1627     else if ( expDiff < 0 ) {
1628         if ( bExp == 0xFF ) {
1629             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1630             return packFloat32( zSign, 0xFF, 0 );
1631         }
1632         if ( aExp == 0 ) {
1633             ++expDiff;
1634         }
1635         else {
1636             aSig |= 0x20000000;
1637         }
1638         shift32RightJamming( aSig, - expDiff, &aSig );
1639         zExp = bExp;
1640     }
1641     else {
1642         if ( aExp == 0xFF ) {
1643             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1644             return a;
1645         }
1646         if ( aExp == 0 ) {
1647             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1648             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1649         }
1650         zSig = 0x40000000 + aSig + bSig;
1651         zExp = aExp;
1652         goto roundAndPack;
1653     }
1654     aSig |= 0x20000000;
1655     zSig = ( aSig + bSig )<<1;
1656     --zExp;
1657     if ( (sbits32) zSig < 0 ) {
1658         zSig = aSig + bSig;
1659         ++zExp;
1660     }
1661  roundAndPack:
1662     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1663
1664 }
1665
1666 /*----------------------------------------------------------------------------
1667 | Returns the result of subtracting the absolute values of the single-
1668 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1669 | difference is negated before being returned.  `zSign' is ignored if the
1670 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1671 | Standard for Binary Floating-Point Arithmetic.
1672 *----------------------------------------------------------------------------*/
1673
1674 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1675 {
1676     int16 aExp, bExp, zExp;
1677     bits32 aSig, bSig, zSig;
1678     int16 expDiff;
1679
1680     aSig = extractFloat32Frac( a );
1681     aExp = extractFloat32Exp( a );
1682     bSig = extractFloat32Frac( b );
1683     bExp = extractFloat32Exp( b );
1684     expDiff = aExp - bExp;
1685     aSig <<= 7;
1686     bSig <<= 7;
1687     if ( 0 < expDiff ) goto aExpBigger;
1688     if ( expDiff < 0 ) goto bExpBigger;
1689     if ( aExp == 0xFF ) {
1690         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1691         float_raise( float_flag_invalid STATUS_VAR);
1692         return float32_default_nan;
1693     }
1694     if ( aExp == 0 ) {
1695         aExp = 1;
1696         bExp = 1;
1697     }
1698     if ( bSig < aSig ) goto aBigger;
1699     if ( aSig < bSig ) goto bBigger;
1700     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1701  bExpBigger:
1702     if ( bExp == 0xFF ) {
1703         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1704         return packFloat32( zSign ^ 1, 0xFF, 0 );
1705     }
1706     if ( aExp == 0 ) {
1707         ++expDiff;
1708     }
1709     else {
1710         aSig |= 0x40000000;
1711     }
1712     shift32RightJamming( aSig, - expDiff, &aSig );
1713     bSig |= 0x40000000;
1714  bBigger:
1715     zSig = bSig - aSig;
1716     zExp = bExp;
1717     zSign ^= 1;
1718     goto normalizeRoundAndPack;
1719  aExpBigger:
1720     if ( aExp == 0xFF ) {
1721         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1722         return a;
1723     }
1724     if ( bExp == 0 ) {
1725         --expDiff;
1726     }
1727     else {
1728         bSig |= 0x40000000;
1729     }
1730     shift32RightJamming( bSig, expDiff, &bSig );
1731     aSig |= 0x40000000;
1732  aBigger:
1733     zSig = aSig - bSig;
1734     zExp = aExp;
1735  normalizeRoundAndPack:
1736     --zExp;
1737     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1738
1739 }
1740
1741 /*----------------------------------------------------------------------------
1742 | Returns the result of adding the single-precision floating-point values `a'
1743 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1744 | Binary Floating-Point Arithmetic.
1745 *----------------------------------------------------------------------------*/
1746
1747 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1748 {
1749     flag aSign, bSign;
1750
1751     aSign = extractFloat32Sign( a );
1752     bSign = extractFloat32Sign( b );
1753     if ( aSign == bSign ) {
1754         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1755     }
1756     else {
1757         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1758     }
1759
1760 }
1761
1762 /*----------------------------------------------------------------------------
1763 | Returns the result of subtracting the single-precision floating-point values
1764 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1765 | for Binary Floating-Point Arithmetic.
1766 *----------------------------------------------------------------------------*/
1767
1768 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1769 {
1770     flag aSign, bSign;
1771
1772     aSign = extractFloat32Sign( a );
1773     bSign = extractFloat32Sign( b );
1774     if ( aSign == bSign ) {
1775         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1776     }
1777     else {
1778         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1779     }
1780
1781 }
1782
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of multiplying the single-precision floating-point values
1785 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1786 | for Binary Floating-Point Arithmetic.
1787 *----------------------------------------------------------------------------*/
1788
1789 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1790 {
1791     flag aSign, bSign, zSign;
1792     int16 aExp, bExp, zExp;
1793     bits32 aSig, bSig;
1794     bits64 zSig64;
1795     bits32 zSig;
1796
1797     aSig = extractFloat32Frac( a );
1798     aExp = extractFloat32Exp( a );
1799     aSign = extractFloat32Sign( a );
1800     bSig = extractFloat32Frac( b );
1801     bExp = extractFloat32Exp( b );
1802     bSign = extractFloat32Sign( b );
1803     zSign = aSign ^ bSign;
1804     if ( aExp == 0xFF ) {
1805         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1806             return propagateFloat32NaN( a, b STATUS_VAR );
1807         }
1808         if ( ( bExp | bSig ) == 0 ) {
1809             float_raise( float_flag_invalid STATUS_VAR);
1810             return float32_default_nan;
1811         }
1812         return packFloat32( zSign, 0xFF, 0 );
1813     }
1814     if ( bExp == 0xFF ) {
1815         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816         if ( ( aExp | aSig ) == 0 ) {
1817             float_raise( float_flag_invalid STATUS_VAR);
1818             return float32_default_nan;
1819         }
1820         return packFloat32( zSign, 0xFF, 0 );
1821     }
1822     if ( aExp == 0 ) {
1823         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1824         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1825     }
1826     if ( bExp == 0 ) {
1827         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1828         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1829     }
1830     zExp = aExp + bExp - 0x7F;
1831     aSig = ( aSig | 0x00800000 )<<7;
1832     bSig = ( bSig | 0x00800000 )<<8;
1833     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1834     zSig = zSig64;
1835     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1836         zSig <<= 1;
1837         --zExp;
1838     }
1839     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1840
1841 }
1842
1843 /*----------------------------------------------------------------------------
1844 | Returns the result of dividing the single-precision floating-point value `a'
1845 | by the corresponding value `b'.  The operation is performed according to the
1846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847 *----------------------------------------------------------------------------*/
1848
1849 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1850 {
1851     flag aSign, bSign, zSign;
1852     int16 aExp, bExp, zExp;
1853     bits32 aSig, bSig, zSig;
1854
1855     aSig = extractFloat32Frac( a );
1856     aExp = extractFloat32Exp( a );
1857     aSign = extractFloat32Sign( a );
1858     bSig = extractFloat32Frac( b );
1859     bExp = extractFloat32Exp( b );
1860     bSign = extractFloat32Sign( b );
1861     zSign = aSign ^ bSign;
1862     if ( aExp == 0xFF ) {
1863         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864         if ( bExp == 0xFF ) {
1865             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1866             float_raise( float_flag_invalid STATUS_VAR);
1867             return float32_default_nan;
1868         }
1869         return packFloat32( zSign, 0xFF, 0 );
1870     }
1871     if ( bExp == 0xFF ) {
1872         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1873         return packFloat32( zSign, 0, 0 );
1874     }
1875     if ( bExp == 0 ) {
1876         if ( bSig == 0 ) {
1877             if ( ( aExp | aSig ) == 0 ) {
1878                 float_raise( float_flag_invalid STATUS_VAR);
1879                 return float32_default_nan;
1880             }
1881             float_raise( float_flag_divbyzero STATUS_VAR);
1882             return packFloat32( zSign, 0xFF, 0 );
1883         }
1884         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1885     }
1886     if ( aExp == 0 ) {
1887         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1889     }
1890     zExp = aExp - bExp + 0x7D;
1891     aSig = ( aSig | 0x00800000 )<<7;
1892     bSig = ( bSig | 0x00800000 )<<8;
1893     if ( bSig <= ( aSig + aSig ) ) {
1894         aSig >>= 1;
1895         ++zExp;
1896     }
1897     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1898     if ( ( zSig & 0x3F ) == 0 ) {
1899         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1900     }
1901     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1902
1903 }
1904
1905 /*----------------------------------------------------------------------------
1906 | Returns the remainder of the single-precision floating-point value `a'
1907 | with respect to the corresponding value `b'.  The operation is performed
1908 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909 *----------------------------------------------------------------------------*/
1910
1911 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1912 {
1913     flag aSign, bSign, zSign;
1914     int16 aExp, bExp, expDiff;
1915     bits32 aSig, bSig;
1916     bits32 q;
1917     bits64 aSig64, bSig64, q64;
1918     bits32 alternateASig;
1919     sbits32 sigMean;
1920
1921     aSig = extractFloat32Frac( a );
1922     aExp = extractFloat32Exp( a );
1923     aSign = extractFloat32Sign( a );
1924     bSig = extractFloat32Frac( b );
1925     bExp = extractFloat32Exp( b );
1926     bSign = extractFloat32Sign( b );
1927     if ( aExp == 0xFF ) {
1928         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1929             return propagateFloat32NaN( a, b STATUS_VAR );
1930         }
1931         float_raise( float_flag_invalid STATUS_VAR);
1932         return float32_default_nan;
1933     }
1934     if ( bExp == 0xFF ) {
1935         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1936         return a;
1937     }
1938     if ( bExp == 0 ) {
1939         if ( bSig == 0 ) {
1940             float_raise( float_flag_invalid STATUS_VAR);
1941             return float32_default_nan;
1942         }
1943         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1944     }
1945     if ( aExp == 0 ) {
1946         if ( aSig == 0 ) return a;
1947         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1948     }
1949     expDiff = aExp - bExp;
1950     aSig |= 0x00800000;
1951     bSig |= 0x00800000;
1952     if ( expDiff < 32 ) {
1953         aSig <<= 8;
1954         bSig <<= 8;
1955         if ( expDiff < 0 ) {
1956             if ( expDiff < -1 ) return a;
1957             aSig >>= 1;
1958         }
1959         q = ( bSig <= aSig );
1960         if ( q ) aSig -= bSig;
1961         if ( 0 < expDiff ) {
1962             q = ( ( (bits64) aSig )<<32 ) / bSig;
1963             q >>= 32 - expDiff;
1964             bSig >>= 2;
1965             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1966         }
1967         else {
1968             aSig >>= 2;
1969             bSig >>= 2;
1970         }
1971     }
1972     else {
1973         if ( bSig <= aSig ) aSig -= bSig;
1974         aSig64 = ( (bits64) aSig )<<40;
1975         bSig64 = ( (bits64) bSig )<<40;
1976         expDiff -= 64;
1977         while ( 0 < expDiff ) {
1978             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1979             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1980             aSig64 = - ( ( bSig * q64 )<<38 );
1981             expDiff -= 62;
1982         }
1983         expDiff += 64;
1984         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1985         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1986         q = q64>>( 64 - expDiff );
1987         bSig <<= 6;
1988         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1989     }
1990     do {
1991         alternateASig = aSig;
1992         ++q;
1993         aSig -= bSig;
1994     } while ( 0 <= (sbits32) aSig );
1995     sigMean = aSig + alternateASig;
1996     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1997         aSig = alternateASig;
1998     }
1999     zSign = ( (sbits32) aSig < 0 );
2000     if ( zSign ) aSig = - aSig;
2001     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2002
2003 }
2004
2005 /*----------------------------------------------------------------------------
2006 | Returns the square root of the single-precision floating-point value `a'.
2007 | The operation is performed according to the IEC/IEEE Standard for Binary
2008 | Floating-Point Arithmetic.
2009 *----------------------------------------------------------------------------*/
2010
2011 float32 float32_sqrt( float32 a STATUS_PARAM )
2012 {
2013     flag aSign;
2014     int16 aExp, zExp;
2015     bits32 aSig, zSig;
2016     bits64 rem, term;
2017
2018     aSig = extractFloat32Frac( a );
2019     aExp = extractFloat32Exp( a );
2020     aSign = extractFloat32Sign( a );
2021     if ( aExp == 0xFF ) {
2022         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2023         if ( ! aSign ) return a;
2024         float_raise( float_flag_invalid STATUS_VAR);
2025         return float32_default_nan;
2026     }
2027     if ( aSign ) {
2028         if ( ( aExp | aSig ) == 0 ) return a;
2029         float_raise( float_flag_invalid STATUS_VAR);
2030         return float32_default_nan;
2031     }
2032     if ( aExp == 0 ) {
2033         if ( aSig == 0 ) return float32_zero;
2034         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2035     }
2036     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2037     aSig = ( aSig | 0x00800000 )<<8;
2038     zSig = estimateSqrt32( aExp, aSig ) + 2;
2039     if ( ( zSig & 0x7F ) <= 5 ) {
2040         if ( zSig < 2 ) {
2041             zSig = 0x7FFFFFFF;
2042             goto roundAndPack;
2043         }
2044         aSig >>= aExp & 1;
2045         term = ( (bits64) zSig ) * zSig;
2046         rem = ( ( (bits64) aSig )<<32 ) - term;
2047         while ( (sbits64) rem < 0 ) {
2048             --zSig;
2049             rem += ( ( (bits64) zSig )<<1 ) | 1;
2050         }
2051         zSig |= ( rem != 0 );
2052     }
2053     shift32RightJamming( zSig, 1, &zSig );
2054  roundAndPack:
2055     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2056
2057 }
2058
2059 /*----------------------------------------------------------------------------
2060 | Returns 1 if the single-precision floating-point value `a' is equal to
2061 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2062 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2063 *----------------------------------------------------------------------------*/
2064
2065 int float32_eq( float32 a, float32 b STATUS_PARAM )
2066 {
2067
2068     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2069          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2070        ) {
2071         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2072             float_raise( float_flag_invalid STATUS_VAR);
2073         }
2074         return 0;
2075     }
2076     return ( float32_val(a) == float32_val(b) ) ||
2077             ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2078
2079 }
2080
2081 /*----------------------------------------------------------------------------
2082 | Returns 1 if the single-precision floating-point value `a' is less than
2083 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2084 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2085 | Arithmetic.
2086 *----------------------------------------------------------------------------*/
2087
2088 int float32_le( float32 a, float32 b STATUS_PARAM )
2089 {
2090     flag aSign, bSign;
2091     bits32 av, bv;
2092
2093     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2094          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2095        ) {
2096         float_raise( float_flag_invalid STATUS_VAR);
2097         return 0;
2098     }
2099     aSign = extractFloat32Sign( a );
2100     bSign = extractFloat32Sign( b );
2101     av = float32_val(a);
2102     bv = float32_val(b);
2103     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2104     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2105
2106 }
2107
2108 /*----------------------------------------------------------------------------
2109 | Returns 1 if the single-precision floating-point value `a' is less than
2110 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2111 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2112 *----------------------------------------------------------------------------*/
2113
2114 int float32_lt( float32 a, float32 b STATUS_PARAM )
2115 {
2116     flag aSign, bSign;
2117     bits32 av, bv;
2118
2119     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2120          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2121        ) {
2122         float_raise( float_flag_invalid STATUS_VAR);
2123         return 0;
2124     }
2125     aSign = extractFloat32Sign( a );
2126     bSign = extractFloat32Sign( b );
2127     av = float32_val(a);
2128     bv = float32_val(b);
2129     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2130     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2131
2132 }
2133
2134 /*----------------------------------------------------------------------------
2135 | Returns 1 if the single-precision floating-point value `a' is equal to
2136 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2137 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2139 *----------------------------------------------------------------------------*/
2140
2141 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2142 {
2143     bits32 av, bv;
2144
2145     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2146          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2147        ) {
2148         float_raise( float_flag_invalid STATUS_VAR);
2149         return 0;
2150     }
2151     av = float32_val(a);
2152     bv = float32_val(b);
2153     return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2154
2155 }
2156
2157 /*----------------------------------------------------------------------------
2158 | Returns 1 if the single-precision floating-point value `a' is less than or
2159 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2160 | cause an exception.  Otherwise, the comparison is performed according to the
2161 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2162 *----------------------------------------------------------------------------*/
2163
2164 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2165 {
2166     flag aSign, bSign;
2167     bits32 av, bv;
2168
2169     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2170          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2171        ) {
2172         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2173             float_raise( float_flag_invalid STATUS_VAR);
2174         }
2175         return 0;
2176     }
2177     aSign = extractFloat32Sign( a );
2178     bSign = extractFloat32Sign( b );
2179     av = float32_val(a);
2180     bv = float32_val(b);
2181     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2182     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2183
2184 }
2185
2186 /*----------------------------------------------------------------------------
2187 | Returns 1 if the single-precision floating-point value `a' is less than
2188 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2189 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2190 | Standard for Binary Floating-Point Arithmetic.
2191 *----------------------------------------------------------------------------*/
2192
2193 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2194 {
2195     flag aSign, bSign;
2196     bits32 av, bv;
2197
2198     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2199          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2200        ) {
2201         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2202             float_raise( float_flag_invalid STATUS_VAR);
2203         }
2204         return 0;
2205     }
2206     aSign = extractFloat32Sign( a );
2207     bSign = extractFloat32Sign( b );
2208     av = float32_val(a);
2209     bv = float32_val(b);
2210     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2211     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2212
2213 }
2214
2215 /*----------------------------------------------------------------------------
2216 | Returns the result of converting the double-precision floating-point value
2217 | `a' to the 32-bit two's complement integer format.  The conversion is
2218 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2219 | Arithmetic---which means in particular that the conversion is rounded
2220 | according to the current rounding mode.  If `a' is a NaN, the largest
2221 | positive integer is returned.  Otherwise, if the conversion overflows, the
2222 | largest integer with the same sign as `a' is returned.
2223 *----------------------------------------------------------------------------*/
2224
2225 int32 float64_to_int32( float64 a STATUS_PARAM )
2226 {
2227     flag aSign;
2228     int16 aExp, shiftCount;
2229     bits64 aSig;
2230
2231     aSig = extractFloat64Frac( a );
2232     aExp = extractFloat64Exp( a );
2233     aSign = extractFloat64Sign( a );
2234     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2235     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2236     shiftCount = 0x42C - aExp;
2237     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2238     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2239
2240 }
2241
2242 /*----------------------------------------------------------------------------
2243 | Returns the result of converting the double-precision floating-point value
2244 | `a' to the 32-bit two's complement integer format.  The conversion is
2245 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2246 | Arithmetic, except that the conversion is always rounded toward zero.
2247 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2248 | the conversion overflows, the largest integer with the same sign as `a' is
2249 | returned.
2250 *----------------------------------------------------------------------------*/
2251
2252 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2253 {
2254     flag aSign;
2255     int16 aExp, shiftCount;
2256     bits64 aSig, savedASig;
2257     int32 z;
2258
2259     aSig = extractFloat64Frac( a );
2260     aExp = extractFloat64Exp( a );
2261     aSign = extractFloat64Sign( a );
2262     if ( 0x41E < aExp ) {
2263         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2264         goto invalid;
2265     }
2266     else if ( aExp < 0x3FF ) {
2267         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2268         return 0;
2269     }
2270     aSig |= LIT64( 0x0010000000000000 );
2271     shiftCount = 0x433 - aExp;
2272     savedASig = aSig;
2273     aSig >>= shiftCount;
2274     z = aSig;
2275     if ( aSign ) z = - z;
2276     if ( ( z < 0 ) ^ aSign ) {
2277  invalid:
2278         float_raise( float_flag_invalid STATUS_VAR);
2279         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2280     }
2281     if ( ( aSig<<shiftCount ) != savedASig ) {
2282         STATUS(float_exception_flags) |= float_flag_inexact;
2283     }
2284     return z;
2285
2286 }
2287
2288 /*----------------------------------------------------------------------------
2289 | Returns the result of converting the double-precision floating-point value
2290 | `a' to the 64-bit two's complement integer format.  The conversion is
2291 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2292 | Arithmetic---which means in particular that the conversion is rounded
2293 | according to the current rounding mode.  If `a' is a NaN, the largest
2294 | positive integer is returned.  Otherwise, if the conversion overflows, the
2295 | largest integer with the same sign as `a' is returned.
2296 *----------------------------------------------------------------------------*/
2297
2298 int64 float64_to_int64( float64 a STATUS_PARAM )
2299 {
2300     flag aSign;
2301     int16 aExp, shiftCount;
2302     bits64 aSig, aSigExtra;
2303
2304     aSig = extractFloat64Frac( a );
2305     aExp = extractFloat64Exp( a );
2306     aSign = extractFloat64Sign( a );
2307     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2308     shiftCount = 0x433 - aExp;
2309     if ( shiftCount <= 0 ) {
2310         if ( 0x43E < aExp ) {
2311             float_raise( float_flag_invalid STATUS_VAR);
2312             if (    ! aSign
2313                  || (    ( aExp == 0x7FF )
2314                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2315                ) {
2316                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2317             }
2318             return (sbits64) LIT64( 0x8000000000000000 );
2319         }
2320         aSigExtra = 0;
2321         aSig <<= - shiftCount;
2322     }
2323     else {
2324         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2325     }
2326     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2327
2328 }
2329
2330 /*----------------------------------------------------------------------------
2331 | Returns the result of converting the double-precision floating-point value
2332 | `a' to the 64-bit two's complement integer format.  The conversion is
2333 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2334 | Arithmetic, except that the conversion is always rounded toward zero.
2335 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2336 | the conversion overflows, the largest integer with the same sign as `a' is
2337 | returned.
2338 *----------------------------------------------------------------------------*/
2339
2340 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2341 {
2342     flag aSign;
2343     int16 aExp, shiftCount;
2344     bits64 aSig;
2345     int64 z;
2346
2347     aSig = extractFloat64Frac( a );
2348     aExp = extractFloat64Exp( a );
2349     aSign = extractFloat64Sign( a );
2350     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2351     shiftCount = aExp - 0x433;
2352     if ( 0 <= shiftCount ) {
2353         if ( 0x43E <= aExp ) {
2354             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2355                 float_raise( float_flag_invalid STATUS_VAR);
2356                 if (    ! aSign
2357                      || (    ( aExp == 0x7FF )
2358                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2359                    ) {
2360                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2361                 }
2362             }
2363             return (sbits64) LIT64( 0x8000000000000000 );
2364         }
2365         z = aSig<<shiftCount;
2366     }
2367     else {
2368         if ( aExp < 0x3FE ) {
2369             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2370             return 0;
2371         }
2372         z = aSig>>( - shiftCount );
2373         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2374             STATUS(float_exception_flags) |= float_flag_inexact;
2375         }
2376     }
2377     if ( aSign ) z = - z;
2378     return z;
2379
2380 }
2381
2382 /*----------------------------------------------------------------------------
2383 | Returns the result of converting the double-precision floating-point value
2384 | `a' to the single-precision floating-point format.  The conversion is
2385 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2386 | Arithmetic.
2387 *----------------------------------------------------------------------------*/
2388
2389 float32 float64_to_float32( float64 a STATUS_PARAM )
2390 {
2391     flag aSign;
2392     int16 aExp;
2393     bits64 aSig;
2394     bits32 zSig;
2395
2396     aSig = extractFloat64Frac( a );
2397     aExp = extractFloat64Exp( a );
2398     aSign = extractFloat64Sign( a );
2399     if ( aExp == 0x7FF ) {
2400         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2401         return packFloat32( aSign, 0xFF, 0 );
2402     }
2403     shift64RightJamming( aSig, 22, &aSig );
2404     zSig = aSig;
2405     if ( aExp || zSig ) {
2406         zSig |= 0x40000000;
2407         aExp -= 0x381;
2408     }
2409     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2410
2411 }
2412
2413 #ifdef FLOATX80
2414
2415 /*----------------------------------------------------------------------------
2416 | Returns the result of converting the double-precision floating-point value
2417 | `a' to the extended double-precision floating-point format.  The conversion
2418 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2419 | Arithmetic.
2420 *----------------------------------------------------------------------------*/
2421
2422 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2423 {
2424     flag aSign;
2425     int16 aExp;
2426     bits64 aSig;
2427
2428     aSig = extractFloat64Frac( a );
2429     aExp = extractFloat64Exp( a );
2430     aSign = extractFloat64Sign( a );
2431     if ( aExp == 0x7FF ) {
2432         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2433         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2434     }
2435     if ( aExp == 0 ) {
2436         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2437         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2438     }
2439     return
2440         packFloatx80(
2441             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2442
2443 }
2444
2445 #endif
2446
2447 #ifdef FLOAT128
2448
2449 /*----------------------------------------------------------------------------
2450 | Returns the result of converting the double-precision floating-point value
2451 | `a' to the quadruple-precision floating-point format.  The conversion is
2452 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2453 | Arithmetic.
2454 *----------------------------------------------------------------------------*/
2455
2456 float128 float64_to_float128( float64 a STATUS_PARAM )
2457 {
2458     flag aSign;
2459     int16 aExp;
2460     bits64 aSig, zSig0, zSig1;
2461
2462     aSig = extractFloat64Frac( a );
2463     aExp = extractFloat64Exp( a );
2464     aSign = extractFloat64Sign( a );
2465     if ( aExp == 0x7FF ) {
2466         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2467         return packFloat128( aSign, 0x7FFF, 0, 0 );
2468     }
2469     if ( aExp == 0 ) {
2470         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2471         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2472         --aExp;
2473     }
2474     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2475     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2476
2477 }
2478
2479 #endif
2480
2481 /*----------------------------------------------------------------------------
2482 | Rounds the double-precision floating-point value `a' to an integer, and
2483 | returns the result as a double-precision floating-point value.  The
2484 | operation is performed according to the IEC/IEEE Standard for Binary
2485 | Floating-Point Arithmetic.
2486 *----------------------------------------------------------------------------*/
2487
2488 float64 float64_round_to_int( float64 a STATUS_PARAM )
2489 {
2490     flag aSign;
2491     int16 aExp;
2492     bits64 lastBitMask, roundBitsMask;
2493     int8 roundingMode;
2494     bits64 z;
2495
2496     aExp = extractFloat64Exp( a );
2497     if ( 0x433 <= aExp ) {
2498         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2499             return propagateFloat64NaN( a, a STATUS_VAR );
2500         }
2501         return a;
2502     }
2503     if ( aExp < 0x3FF ) {
2504         if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2505         STATUS(float_exception_flags) |= float_flag_inexact;
2506         aSign = extractFloat64Sign( a );
2507         switch ( STATUS(float_rounding_mode) ) {
2508          case float_round_nearest_even:
2509             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2510                 return packFloat64( aSign, 0x3FF, 0 );
2511             }
2512             break;
2513          case float_round_down:
2514             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2515          case float_round_up:
2516             return make_float64(
2517             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2518         }
2519         return packFloat64( aSign, 0, 0 );
2520     }
2521     lastBitMask = 1;
2522     lastBitMask <<= 0x433 - aExp;
2523     roundBitsMask = lastBitMask - 1;
2524     z = float64_val(a);
2525     roundingMode = STATUS(float_rounding_mode);
2526     if ( roundingMode == float_round_nearest_even ) {
2527         z += lastBitMask>>1;
2528         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2529     }
2530     else if ( roundingMode != float_round_to_zero ) {
2531         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2532             z += roundBitsMask;
2533         }
2534     }
2535     z &= ~ roundBitsMask;
2536     if ( z != float64_val(a) )
2537         STATUS(float_exception_flags) |= float_flag_inexact;
2538     return make_float64(z);
2539
2540 }
2541
2542 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2543 {
2544     int oldmode;
2545     float64 res;
2546     oldmode = STATUS(float_rounding_mode);
2547     STATUS(float_rounding_mode) = float_round_to_zero;
2548     res = float64_round_to_int(a STATUS_VAR);
2549     STATUS(float_rounding_mode) = oldmode;
2550     return res;
2551 }
2552
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of adding the absolute values of the double-precision
2555 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2556 | before being returned.  `zSign' is ignored if the result is a NaN.
2557 | The addition is performed according to the IEC/IEEE Standard for Binary
2558 | Floating-Point Arithmetic.
2559 *----------------------------------------------------------------------------*/
2560
2561 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2562 {
2563     int16 aExp, bExp, zExp;
2564     bits64 aSig, bSig, zSig;
2565     int16 expDiff;
2566
2567     aSig = extractFloat64Frac( a );
2568     aExp = extractFloat64Exp( a );
2569     bSig = extractFloat64Frac( b );
2570     bExp = extractFloat64Exp( b );
2571     expDiff = aExp - bExp;
2572     aSig <<= 9;
2573     bSig <<= 9;
2574     if ( 0 < expDiff ) {
2575         if ( aExp == 0x7FF ) {
2576             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2577             return a;
2578         }
2579         if ( bExp == 0 ) {
2580             --expDiff;
2581         }
2582         else {
2583             bSig |= LIT64( 0x2000000000000000 );
2584         }
2585         shift64RightJamming( bSig, expDiff, &bSig );
2586         zExp = aExp;
2587     }
2588     else if ( expDiff < 0 ) {
2589         if ( bExp == 0x7FF ) {
2590             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2591             return packFloat64( zSign, 0x7FF, 0 );
2592         }
2593         if ( aExp == 0 ) {
2594             ++expDiff;
2595         }
2596         else {
2597             aSig |= LIT64( 0x2000000000000000 );
2598         }
2599         shift64RightJamming( aSig, - expDiff, &aSig );
2600         zExp = bExp;
2601     }
2602     else {
2603         if ( aExp == 0x7FF ) {
2604             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2605             return a;
2606         }
2607         if ( aExp == 0 ) {
2608             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2609             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2610         }
2611         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2612         zExp = aExp;
2613         goto roundAndPack;
2614     }
2615     aSig |= LIT64( 0x2000000000000000 );
2616     zSig = ( aSig + bSig )<<1;
2617     --zExp;
2618     if ( (sbits64) zSig < 0 ) {
2619         zSig = aSig + bSig;
2620         ++zExp;
2621     }
2622  roundAndPack:
2623     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2624
2625 }
2626
2627 /*----------------------------------------------------------------------------
2628 | Returns the result of subtracting the absolute values of the double-
2629 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2630 | difference is negated before being returned.  `zSign' is ignored if the
2631 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2632 | Standard for Binary Floating-Point Arithmetic.
2633 *----------------------------------------------------------------------------*/
2634
2635 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2636 {
2637     int16 aExp, bExp, zExp;
2638     bits64 aSig, bSig, zSig;
2639     int16 expDiff;
2640
2641     aSig = extractFloat64Frac( a );
2642     aExp = extractFloat64Exp( a );
2643     bSig = extractFloat64Frac( b );
2644     bExp = extractFloat64Exp( b );
2645     expDiff = aExp - bExp;
2646     aSig <<= 10;
2647     bSig <<= 10;
2648     if ( 0 < expDiff ) goto aExpBigger;
2649     if ( expDiff < 0 ) goto bExpBigger;
2650     if ( aExp == 0x7FF ) {
2651         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2652         float_raise( float_flag_invalid STATUS_VAR);
2653         return float64_default_nan;
2654     }
2655     if ( aExp == 0 ) {
2656         aExp = 1;
2657         bExp = 1;
2658     }
2659     if ( bSig < aSig ) goto aBigger;
2660     if ( aSig < bSig ) goto bBigger;
2661     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2662  bExpBigger:
2663     if ( bExp == 0x7FF ) {
2664         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2665         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2666     }
2667     if ( aExp == 0 ) {
2668         ++expDiff;
2669     }
2670     else {
2671         aSig |= LIT64( 0x4000000000000000 );
2672     }
2673     shift64RightJamming( aSig, - expDiff, &aSig );
2674     bSig |= LIT64( 0x4000000000000000 );
2675  bBigger:
2676     zSig = bSig - aSig;
2677     zExp = bExp;
2678     zSign ^= 1;
2679     goto normalizeRoundAndPack;
2680  aExpBigger:
2681     if ( aExp == 0x7FF ) {
2682         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2683         return a;
2684     }
2685     if ( bExp == 0 ) {
2686         --expDiff;
2687     }
2688     else {
2689         bSig |= LIT64( 0x4000000000000000 );
2690     }
2691     shift64RightJamming( bSig, expDiff, &bSig );
2692     aSig |= LIT64( 0x4000000000000000 );
2693  aBigger:
2694     zSig = aSig - bSig;
2695     zExp = aExp;
2696  normalizeRoundAndPack:
2697     --zExp;
2698     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2699
2700 }
2701
2702 /*----------------------------------------------------------------------------
2703 | Returns the result of adding the double-precision floating-point values `a'
2704 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2705 | Binary Floating-Point Arithmetic.
2706 *----------------------------------------------------------------------------*/
2707
2708 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2709 {
2710     flag aSign, bSign;
2711
2712     aSign = extractFloat64Sign( a );
2713     bSign = extractFloat64Sign( b );
2714     if ( aSign == bSign ) {
2715         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2716     }
2717     else {
2718         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2719     }
2720
2721 }
2722
2723 /*----------------------------------------------------------------------------
2724 | Returns the result of subtracting the double-precision floating-point values
2725 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2726 | for Binary Floating-Point Arithmetic.
2727 *----------------------------------------------------------------------------*/
2728
2729 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2730 {
2731     flag aSign, bSign;
2732
2733     aSign = extractFloat64Sign( a );
2734     bSign = extractFloat64Sign( b );
2735     if ( aSign == bSign ) {
2736         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2737     }
2738     else {
2739         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2740     }
2741
2742 }
2743
2744 /*----------------------------------------------------------------------------
2745 | Returns the result of multiplying the double-precision floating-point values
2746 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2747 | for Binary Floating-Point Arithmetic.
2748 *----------------------------------------------------------------------------*/
2749
2750 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2751 {
2752     flag aSign, bSign, zSign;
2753     int16 aExp, bExp, zExp;
2754     bits64 aSig, bSig, zSig0, zSig1;
2755
2756     aSig = extractFloat64Frac( a );
2757     aExp = extractFloat64Exp( a );
2758     aSign = extractFloat64Sign( a );
2759     bSig = extractFloat64Frac( b );
2760     bExp = extractFloat64Exp( b );
2761     bSign = extractFloat64Sign( b );
2762     zSign = aSign ^ bSign;
2763     if ( aExp == 0x7FF ) {
2764         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2765             return propagateFloat64NaN( a, b STATUS_VAR );
2766         }
2767         if ( ( bExp | bSig ) == 0 ) {
2768             float_raise( float_flag_invalid STATUS_VAR);
2769             return float64_default_nan;
2770         }
2771         return packFloat64( zSign, 0x7FF, 0 );
2772     }
2773     if ( bExp == 0x7FF ) {
2774         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2775         if ( ( aExp | aSig ) == 0 ) {
2776             float_raise( float_flag_invalid STATUS_VAR);
2777             return float64_default_nan;
2778         }
2779         return packFloat64( zSign, 0x7FF, 0 );
2780     }
2781     if ( aExp == 0 ) {
2782         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2783         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2784     }
2785     if ( bExp == 0 ) {
2786         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2787         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2788     }
2789     zExp = aExp + bExp - 0x3FF;
2790     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2791     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2792     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2793     zSig0 |= ( zSig1 != 0 );
2794     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2795         zSig0 <<= 1;
2796         --zExp;
2797     }
2798     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2799
2800 }
2801
2802 /*----------------------------------------------------------------------------
2803 | Returns the result of dividing the double-precision floating-point value `a'
2804 | by the corresponding value `b'.  The operation is performed according to
2805 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2806 *----------------------------------------------------------------------------*/
2807
2808 float64 float64_div( float64 a, float64 b STATUS_PARAM )
2809 {
2810     flag aSign, bSign, zSign;
2811     int16 aExp, bExp, zExp;
2812     bits64 aSig, bSig, zSig;
2813     bits64 rem0, rem1;
2814     bits64 term0, term1;
2815
2816     aSig = extractFloat64Frac( a );
2817     aExp = extractFloat64Exp( a );
2818     aSign = extractFloat64Sign( a );
2819     bSig = extractFloat64Frac( b );
2820     bExp = extractFloat64Exp( b );
2821     bSign = extractFloat64Sign( b );
2822     zSign = aSign ^ bSign;
2823     if ( aExp == 0x7FF ) {
2824         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2825         if ( bExp == 0x7FF ) {
2826             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2827             float_raise( float_flag_invalid STATUS_VAR);
2828             return float64_default_nan;
2829         }
2830         return packFloat64( zSign, 0x7FF, 0 );
2831     }
2832     if ( bExp == 0x7FF ) {
2833         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2834         return packFloat64( zSign, 0, 0 );
2835     }
2836     if ( bExp == 0 ) {
2837         if ( bSig == 0 ) {
2838             if ( ( aExp | aSig ) == 0 ) {
2839                 float_raise( float_flag_invalid STATUS_VAR);
2840                 return float64_default_nan;
2841             }
2842             float_raise( float_flag_divbyzero STATUS_VAR);
2843             return packFloat64( zSign, 0x7FF, 0 );
2844         }
2845         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2846     }
2847     if ( aExp == 0 ) {
2848         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2849         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2850     }
2851     zExp = aExp - bExp + 0x3FD;
2852     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2853     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2854     if ( bSig <= ( aSig + aSig ) ) {
2855         aSig >>= 1;
2856         ++zExp;
2857     }
2858     zSig = estimateDiv128To64( aSig, 0, bSig );
2859     if ( ( zSig & 0x1FF ) <= 2 ) {
2860         mul64To128( bSig, zSig, &term0, &term1 );
2861         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2862         while ( (sbits64) rem0 < 0 ) {
2863             --zSig;
2864             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2865         }
2866         zSig |= ( rem1 != 0 );
2867     }
2868     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2869
2870 }
2871
2872 /*----------------------------------------------------------------------------
2873 | Returns the remainder of the double-precision floating-point value `a'
2874 | with respect to the corresponding value `b'.  The operation is performed
2875 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2876 *----------------------------------------------------------------------------*/
2877
2878 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2879 {
2880     flag aSign, bSign, zSign;
2881     int16 aExp, bExp, expDiff;
2882     bits64 aSig, bSig;
2883     bits64 q, alternateASig;
2884     sbits64 sigMean;
2885
2886     aSig = extractFloat64Frac( a );
2887     aExp = extractFloat64Exp( a );
2888     aSign = extractFloat64Sign( a );
2889     bSig = extractFloat64Frac( b );
2890     bExp = extractFloat64Exp( b );
2891     bSign = extractFloat64Sign( b );
2892     if ( aExp == 0x7FF ) {
2893         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2894             return propagateFloat64NaN( a, b STATUS_VAR );
2895         }
2896         float_raise( float_flag_invalid STATUS_VAR);
2897         return float64_default_nan;
2898     }
2899     if ( bExp == 0x7FF ) {
2900         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2901         return a;
2902     }
2903     if ( bExp == 0 ) {
2904         if ( bSig == 0 ) {
2905             float_raise( float_flag_invalid STATUS_VAR);
2906             return float64_default_nan;
2907         }
2908         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2909     }
2910     if ( aExp == 0 ) {
2911         if ( aSig == 0 ) return a;
2912         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2913     }
2914     expDiff = aExp - bExp;
2915     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2916     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2917     if ( expDiff < 0 ) {
2918         if ( expDiff < -1 ) return a;
2919         aSig >>= 1;
2920     }
2921     q = ( bSig <= aSig );
2922     if ( q ) aSig -= bSig;
2923     expDiff -= 64;
2924     while ( 0 < expDiff ) {
2925         q = estimateDiv128To64( aSig, 0, bSig );
2926         q = ( 2 < q ) ? q - 2 : 0;
2927         aSig = - ( ( bSig>>2 ) * q );
2928         expDiff -= 62;
2929     }
2930     expDiff += 64;
2931     if ( 0 < expDiff ) {
2932         q = estimateDiv128To64( aSig, 0, bSig );
2933         q = ( 2 < q ) ? q - 2 : 0;
2934         q >>= 64 - expDiff;
2935         bSig >>= 2;
2936         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2937     }
2938     else {
2939         aSig >>= 2;
2940         bSig >>= 2;
2941     }
2942     do {
2943         alternateASig = aSig;
2944         ++q;
2945         aSig -= bSig;
2946     } while ( 0 <= (sbits64) aSig );
2947     sigMean = aSig + alternateASig;
2948     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2949         aSig = alternateASig;
2950     }
2951     zSign = ( (sbits64) aSig < 0 );
2952     if ( zSign ) aSig = - aSig;
2953     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2954
2955 }
2956
2957 /*----------------------------------------------------------------------------
2958 | Returns the square root of the double-precision floating-point value `a'.
2959 | The operation is performed according to the IEC/IEEE Standard for Binary
2960 | Floating-Point Arithmetic.
2961 *----------------------------------------------------------------------------*/
2962
2963 float64 float64_sqrt( float64 a STATUS_PARAM )
2964 {
2965     flag aSign;
2966     int16 aExp, zExp;
2967     bits64 aSig, zSig, doubleZSig;
2968     bits64 rem0, rem1, term0, term1;
2969
2970     aSig = extractFloat64Frac( a );
2971     aExp = extractFloat64Exp( a );
2972     aSign = extractFloat64Sign( a );
2973     if ( aExp == 0x7FF ) {
2974         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2975         if ( ! aSign ) return a;
2976         float_raise( float_flag_invalid STATUS_VAR);
2977         return float64_default_nan;
2978     }
2979     if ( aSign ) {
2980         if ( ( aExp | aSig ) == 0 ) return a;
2981         float_raise( float_flag_invalid STATUS_VAR);
2982         return float64_default_nan;
2983     }
2984     if ( aExp == 0 ) {
2985         if ( aSig == 0 ) return float64_zero;
2986         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2987     }
2988     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2989     aSig |= LIT64( 0x0010000000000000 );
2990     zSig = estimateSqrt32( aExp, aSig>>21 );
2991     aSig <<= 9 - ( aExp & 1 );
2992     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2993     if ( ( zSig & 0x1FF ) <= 5 ) {
2994         doubleZSig = zSig<<1;
2995         mul64To128( zSig, zSig, &term0, &term1 );
2996         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2997         while ( (sbits64) rem0 < 0 ) {
2998             --zSig;
2999             doubleZSig -= 2;
3000             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3001         }
3002         zSig |= ( ( rem0 | rem1 ) != 0 );
3003     }
3004     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3005
3006 }
3007
3008 /*----------------------------------------------------------------------------
3009 | Returns 1 if the double-precision floating-point value `a' is equal to the
3010 | corresponding value `b', and 0 otherwise.  The comparison is performed
3011 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3012 *----------------------------------------------------------------------------*/
3013
3014 int float64_eq( float64 a, float64 b STATUS_PARAM )
3015 {
3016     bits64 av, bv;
3017
3018     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3019          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3020        ) {
3021         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3022             float_raise( float_flag_invalid STATUS_VAR);
3023         }
3024         return 0;
3025     }
3026     av = float64_val(a);
3027     bv = float64_val(b);
3028     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3029
3030 }
3031
3032 /*----------------------------------------------------------------------------
3033 | Returns 1 if the double-precision floating-point value `a' is less than or
3034 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3035 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3036 | Arithmetic.
3037 *----------------------------------------------------------------------------*/
3038
3039 int float64_le( float64 a, float64 b STATUS_PARAM )
3040 {
3041     flag aSign, bSign;
3042     bits64 av, bv;
3043
3044     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3045          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3046        ) {
3047         float_raise( float_flag_invalid STATUS_VAR);
3048         return 0;
3049     }
3050     aSign = extractFloat64Sign( a );
3051     bSign = extractFloat64Sign( b );
3052     av = float64_val(a);
3053     bv = float64_val(b);
3054     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3055     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3056
3057 }
3058
3059 /*----------------------------------------------------------------------------
3060 | Returns 1 if the double-precision floating-point value `a' is less than
3061 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3062 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3063 *----------------------------------------------------------------------------*/
3064
3065 int float64_lt( float64 a, float64 b STATUS_PARAM )
3066 {
3067     flag aSign, bSign;
3068     bits64 av, bv;
3069
3070     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3071          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3072        ) {
3073         float_raise( float_flag_invalid STATUS_VAR);
3074         return 0;
3075     }
3076     aSign = extractFloat64Sign( a );
3077     bSign = extractFloat64Sign( b );
3078     av = float64_val(a);
3079     bv = float64_val(b);
3080     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3081     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3082
3083 }
3084
3085 /*----------------------------------------------------------------------------
3086 | Returns 1 if the double-precision floating-point value `a' is equal to the
3087 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3088 | if either operand is a NaN.  Otherwise, the comparison is performed
3089 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3090 *----------------------------------------------------------------------------*/
3091
3092 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3093 {
3094     bits64 av, bv;
3095
3096     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3097          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3098        ) {
3099         float_raise( float_flag_invalid STATUS_VAR);
3100         return 0;
3101     }
3102     av = float64_val(a);
3103     bv = float64_val(b);
3104     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3105
3106 }
3107
3108 /*----------------------------------------------------------------------------
3109 | Returns 1 if the double-precision floating-point value `a' is less than or
3110 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3111 | cause an exception.  Otherwise, the comparison is performed according to the
3112 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3113 *----------------------------------------------------------------------------*/
3114
3115 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3116 {
3117     flag aSign, bSign;
3118     bits64 av, bv;
3119
3120     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3121          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3122        ) {
3123         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3124             float_raise( float_flag_invalid STATUS_VAR);
3125         }
3126         return 0;
3127     }
3128     aSign = extractFloat64Sign( a );
3129     bSign = extractFloat64Sign( b );
3130     av = float64_val(a);
3131     bv = float64_val(b);
3132     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3133     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3134
3135 }
3136
3137 /*----------------------------------------------------------------------------
3138 | Returns 1 if the double-precision floating-point value `a' is less than
3139 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3140 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3141 | Standard for Binary Floating-Point Arithmetic.
3142 *----------------------------------------------------------------------------*/
3143
3144 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3145 {
3146     flag aSign, bSign;
3147     bits64 av, bv;
3148
3149     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3150          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3151        ) {
3152         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3153             float_raise( float_flag_invalid STATUS_VAR);
3154         }
3155         return 0;
3156     }
3157     aSign = extractFloat64Sign( a );
3158     bSign = extractFloat64Sign( b );
3159     av = float64_val(a);
3160     bv = float64_val(b);
3161     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3162     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3163
3164 }
3165
3166 #ifdef FLOATX80
3167
3168 /*----------------------------------------------------------------------------
3169 | Returns the result of converting the extended double-precision floating-
3170 | point value `a' to the 32-bit two's complement integer format.  The
3171 | conversion is performed according to the IEC/IEEE Standard for Binary
3172 | Floating-Point Arithmetic---which means in particular that the conversion
3173 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3174 | largest positive integer is returned.  Otherwise, if the conversion
3175 | overflows, the largest integer with the same sign as `a' is returned.
3176 *----------------------------------------------------------------------------*/
3177
3178 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3179 {
3180     flag aSign;
3181     int32 aExp, shiftCount;
3182     bits64 aSig;
3183
3184     aSig = extractFloatx80Frac( a );
3185     aExp = extractFloatx80Exp( a );
3186     aSign = extractFloatx80Sign( a );
3187     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3188     shiftCount = 0x4037 - aExp;
3189     if ( shiftCount <= 0 ) shiftCount = 1;
3190     shift64RightJamming( aSig, shiftCount, &aSig );
3191     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3192
3193 }
3194
3195 /*----------------------------------------------------------------------------
3196 | Returns the result of converting the extended double-precision floating-
3197 | point value `a' to the 32-bit two's complement integer format.  The
3198 | conversion is performed according to the IEC/IEEE Standard for Binary
3199 | Floating-Point Arithmetic, except that the conversion is always rounded
3200 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3201 | Otherwise, if the conversion overflows, the largest integer with the same
3202 | sign as `a' is returned.
3203 *----------------------------------------------------------------------------*/
3204
3205 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3206 {
3207     flag aSign;
3208     int32 aExp, shiftCount;
3209     bits64 aSig, savedASig;
3210     int32 z;
3211
3212     aSig = extractFloatx80Frac( a );
3213     aExp = extractFloatx80Exp( a );
3214     aSign = extractFloatx80Sign( a );
3215     if ( 0x401E < aExp ) {
3216         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3217         goto invalid;
3218     }
3219     else if ( aExp < 0x3FFF ) {
3220         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3221         return 0;
3222     }
3223     shiftCount = 0x403E - aExp;
3224     savedASig = aSig;
3225     aSig >>= shiftCount;
3226     z = aSig;
3227     if ( aSign ) z = - z;
3228     if ( ( z < 0 ) ^ aSign ) {
3229  invalid:
3230         float_raise( float_flag_invalid STATUS_VAR);
3231         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3232     }
3233     if ( ( aSig<<shiftCount ) != savedASig ) {
3234         STATUS(float_exception_flags) |= float_flag_inexact;
3235     }
3236     return z;
3237
3238 }
3239
3240 /*----------------------------------------------------------------------------
3241 | Returns the result of converting the extended double-precision floating-
3242 | point value `a' to the 64-bit two's complement integer format.  The
3243 | conversion is performed according to the IEC/IEEE Standard for Binary
3244 | Floating-Point Arithmetic---which means in particular that the conversion
3245 | is rounded according to the current rounding mode.  If `a' is a NaN,
3246 | the largest positive integer is returned.  Otherwise, if the conversion
3247 | overflows, the largest integer with the same sign as `a' is returned.
3248 *----------------------------------------------------------------------------*/
3249
3250 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3251 {
3252     flag aSign;
3253     int32 aExp, shiftCount;
3254     bits64 aSig, aSigExtra;
3255
3256     aSig = extractFloatx80Frac( a );
3257     aExp = extractFloatx80Exp( a );
3258     aSign = extractFloatx80Sign( a );
3259     shiftCount = 0x403E - aExp;
3260     if ( shiftCount <= 0 ) {
3261         if ( shiftCount ) {
3262             float_raise( float_flag_invalid STATUS_VAR);
3263             if (    ! aSign
3264                  || (    ( aExp == 0x7FFF )
3265                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3266                ) {
3267                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3268             }
3269             return (sbits64) LIT64( 0x8000000000000000 );
3270         }
3271         aSigExtra = 0;
3272     }
3273     else {
3274         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3275     }
3276     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3277
3278 }
3279
3280 /*----------------------------------------------------------------------------
3281 | Returns the result of converting the extended double-precision floating-
3282 | point value `a' to the 64-bit two's complement integer format.  The
3283 | conversion is performed according to the IEC/IEEE Standard for Binary
3284 | Floating-Point Arithmetic, except that the conversion is always rounded
3285 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3286 | Otherwise, if the conversion overflows, the largest integer with the same
3287 | sign as `a' is returned.
3288 *----------------------------------------------------------------------------*/
3289
3290 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3291 {
3292     flag aSign;
3293     int32 aExp, shiftCount;
3294     bits64 aSig;
3295     int64 z;
3296
3297     aSig = extractFloatx80Frac( a );
3298     aExp = extractFloatx80Exp( a );
3299     aSign = extractFloatx80Sign( a );
3300     shiftCount = aExp - 0x403E;
3301     if ( 0 <= shiftCount ) {
3302         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3303         if ( ( a.high != 0xC03E ) || aSig ) {
3304             float_raise( float_flag_invalid STATUS_VAR);
3305             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3306                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3307             }
3308         }
3309         return (sbits64) LIT64( 0x8000000000000000 );
3310     }
3311     else if ( aExp < 0x3FFF ) {
3312         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3313         return 0;
3314     }
3315     z = aSig>>( - shiftCount );
3316     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3317         STATUS(float_exception_flags) |= float_flag_inexact;
3318     }
3319     if ( aSign ) z = - z;
3320     return z;
3321
3322 }
3323
3324 /*----------------------------------------------------------------------------
3325 | Returns the result of converting the extended double-precision floating-
3326 | point value `a' to the single-precision floating-point format.  The
3327 | conversion is performed according to the IEC/IEEE Standard for Binary
3328 | Floating-Point Arithmetic.
3329 *----------------------------------------------------------------------------*/
3330
3331 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3332 {
3333     flag aSign;
3334     int32 aExp;
3335     bits64 aSig;
3336
3337     aSig = extractFloatx80Frac( a );
3338     aExp = extractFloatx80Exp( a );
3339     aSign = extractFloatx80Sign( a );
3340     if ( aExp == 0x7FFF ) {
3341         if ( (bits64) ( aSig<<1 ) ) {
3342             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3343         }
3344         return packFloat32( aSign, 0xFF, 0 );
3345     }
3346     shift64RightJamming( aSig, 33, &aSig );
3347     if ( aExp || aSig ) aExp -= 0x3F81;
3348     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3349
3350 }
3351
3352 /*----------------------------------------------------------------------------
3353 | Returns the result of converting the extended double-precision floating-
3354 | point value `a' to the double-precision floating-point format.  The
3355 | conversion is performed according to the IEC/IEEE Standard for Binary
3356 | Floating-Point Arithmetic.
3357 *----------------------------------------------------------------------------*/
3358
3359 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3360 {
3361     flag aSign;
3362     int32 aExp;
3363     bits64 aSig, zSig;
3364
3365     aSig = extractFloatx80Frac( a );
3366     aExp = extractFloatx80Exp( a );
3367     aSign = extractFloatx80Sign( a );
3368     if ( aExp == 0x7FFF ) {
3369         if ( (bits64) ( aSig<<1 ) ) {
3370             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3371         }
3372         return packFloat64( aSign, 0x7FF, 0 );
3373     }
3374     shift64RightJamming( aSig, 1, &zSig );
3375     if ( aExp || aSig ) aExp -= 0x3C01;
3376     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3377
3378 }
3379
3380 #ifdef FLOAT128
3381
3382 /*----------------------------------------------------------------------------
3383 | Returns the result of converting the extended double-precision floating-
3384 | point value `a' to the quadruple-precision floating-point format.  The
3385 | conversion is performed according to the IEC/IEEE Standard for Binary
3386 | Floating-Point Arithmetic.
3387 *----------------------------------------------------------------------------*/
3388
3389 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3390 {
3391     flag aSign;
3392     int16 aExp;
3393     bits64 aSig, zSig0, zSig1;
3394
3395     aSig = extractFloatx80Frac( a );
3396     aExp = extractFloatx80Exp( a );
3397     aSign = extractFloatx80Sign( a );
3398     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3399         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3400     }
3401     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3402     return packFloat128( aSign, aExp, zSig0, zSig1 );
3403
3404 }
3405
3406 #endif
3407
3408 /*----------------------------------------------------------------------------
3409 | Rounds the extended double-precision floating-point value `a' to an integer,
3410 | and returns the result as an extended quadruple-precision floating-point
3411 | value.  The operation is performed according to the IEC/IEEE Standard for
3412 | Binary Floating-Point Arithmetic.
3413 *----------------------------------------------------------------------------*/
3414
3415 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3416 {
3417     flag aSign;
3418     int32 aExp;
3419     bits64 lastBitMask, roundBitsMask;
3420     int8 roundingMode;
3421     floatx80 z;
3422
3423     aExp = extractFloatx80Exp( a );
3424     if ( 0x403E <= aExp ) {
3425         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3426             return propagateFloatx80NaN( a, a STATUS_VAR );
3427         }
3428         return a;
3429     }
3430     if ( aExp < 0x3FFF ) {
3431         if (    ( aExp == 0 )
3432              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3433             return a;
3434         }
3435         STATUS(float_exception_flags) |= float_flag_inexact;
3436         aSign = extractFloatx80Sign( a );
3437         switch ( STATUS(float_rounding_mode) ) {
3438          case float_round_nearest_even:
3439             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3440                ) {
3441                 return
3442                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3443             }
3444             break;
3445          case float_round_down:
3446             return
3447                   aSign ?
3448                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3449                 : packFloatx80( 0, 0, 0 );
3450          case float_round_up:
3451             return
3452                   aSign ? packFloatx80( 1, 0, 0 )
3453                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3454         }
3455         return packFloatx80( aSign, 0, 0 );
3456     }
3457     lastBitMask = 1;
3458     lastBitMask <<= 0x403E - aExp;
3459     roundBitsMask = lastBitMask - 1;
3460     z = a;
3461     roundingMode = STATUS(float_rounding_mode);
3462     if ( roundingMode == float_round_nearest_even ) {
3463         z.low += lastBitMask>>1;
3464         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3465     }
3466     else if ( roundingMode != float_round_to_zero ) {
3467         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3468             z.low += roundBitsMask;
3469         }
3470     }
3471     z.low &= ~ roundBitsMask;
3472     if ( z.low == 0 ) {
3473         ++z.high;
3474         z.low = LIT64( 0x8000000000000000 );
3475     }
3476     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3477     return z;
3478
3479 }
3480
3481 /*----------------------------------------------------------------------------
3482 | Returns the result of adding the absolute values of the extended double-
3483 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3484 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3485 | The addition is performed according to the IEC/IEEE Standard for Binary
3486 | Floating-Point Arithmetic.
3487 *----------------------------------------------------------------------------*/
3488
3489 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3490 {
3491     int32 aExp, bExp, zExp;
3492     bits64 aSig, bSig, zSig0, zSig1;
3493     int32 expDiff;
3494
3495     aSig = extractFloatx80Frac( a );
3496     aExp = extractFloatx80Exp( a );
3497     bSig = extractFloatx80Frac( b );
3498     bExp = extractFloatx80Exp( b );
3499     expDiff = aExp - bExp;
3500     if ( 0 < expDiff ) {
3501         if ( aExp == 0x7FFF ) {
3502             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3503             return a;
3504         }
3505         if ( bExp == 0 ) --expDiff;
3506         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3507         zExp = aExp;
3508     }
3509     else if ( expDiff < 0 ) {
3510         if ( bExp == 0x7FFF ) {
3511             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3512             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3513         }
3514         if ( aExp == 0 ) ++expDiff;
3515         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3516         zExp = bExp;
3517     }
3518     else {
3519         if ( aExp == 0x7FFF ) {
3520             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3521                 return propagateFloatx80NaN( a, b STATUS_VAR );
3522             }
3523             return a;
3524         }
3525         zSig1 = 0;
3526         zSig0 = aSig + bSig;
3527         if ( aExp == 0 ) {
3528             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3529             goto roundAndPack;
3530         }
3531         zExp = aExp;
3532         goto shiftRight1;
3533     }
3534     zSig0 = aSig + bSig;
3535     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3536  shiftRight1:
3537     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3538     zSig0 |= LIT64( 0x8000000000000000 );
3539     ++zExp;
3540  roundAndPack:
3541     return
3542         roundAndPackFloatx80(
3543             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3544
3545 }
3546
3547 /*----------------------------------------------------------------------------
3548 | Returns the result of subtracting the absolute values of the extended
3549 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3550 | difference is negated before being returned.  `zSign' is ignored if the
3551 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3552 | Standard for Binary Floating-Point Arithmetic.
3553 *----------------------------------------------------------------------------*/
3554
3555 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3556 {
3557     int32 aExp, bExp, zExp;
3558     bits64 aSig, bSig, zSig0, zSig1;
3559     int32 expDiff;
3560     floatx80 z;
3561
3562     aSig = extractFloatx80Frac( a );
3563     aExp = extractFloatx80Exp( a );
3564     bSig = extractFloatx80Frac( b );
3565     bExp = extractFloatx80Exp( b );
3566     expDiff = aExp - bExp;
3567     if ( 0 < expDiff ) goto aExpBigger;
3568     if ( expDiff < 0 ) goto bExpBigger;
3569     if ( aExp == 0x7FFF ) {
3570         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3571             return propagateFloatx80NaN( a, b STATUS_VAR );
3572         }
3573         float_raise( float_flag_invalid STATUS_VAR);
3574         z.low = floatx80_default_nan_low;
3575         z.high = floatx80_default_nan_high;
3576         return z;
3577     }
3578     if ( aExp == 0 ) {
3579         aExp = 1;
3580         bExp = 1;
3581     }
3582     zSig1 = 0;
3583     if ( bSig < aSig ) goto aBigger;
3584     if ( aSig < bSig ) goto bBigger;
3585     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3586  bExpBigger:
3587     if ( bExp == 0x7FFF ) {
3588         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3589         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3590     }
3591     if ( aExp == 0 ) ++expDiff;
3592     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3593  bBigger:
3594     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3595     zExp = bExp;
3596     zSign ^= 1;
3597     goto normalizeRoundAndPack;
3598  aExpBigger:
3599     if ( aExp == 0x7FFF ) {
3600         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3601         return a;
3602     }
3603     if ( bExp == 0 ) --expDiff;
3604     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3605  aBigger:
3606     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3607     zExp = aExp;
3608  normalizeRoundAndPack:
3609     return
3610         normalizeRoundAndPackFloatx80(
3611             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3612
3613 }
3614
3615 /*----------------------------------------------------------------------------
3616 | Returns the result of adding the extended double-precision floating-point
3617 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3618 | Standard for Binary Floating-Point Arithmetic.
3619 *----------------------------------------------------------------------------*/
3620
3621 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3622 {
3623     flag aSign, bSign;
3624
3625     aSign = extractFloatx80Sign( a );
3626     bSign = extractFloatx80Sign( b );
3627     if ( aSign == bSign ) {
3628         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3629     }
3630     else {
3631         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3632     }
3633
3634 }
3635
3636 /*----------------------------------------------------------------------------
3637 | Returns the result of subtracting the extended double-precision floating-
3638 | point values `a' and `b'.  The operation is performed according to the
3639 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3640 *----------------------------------------------------------------------------*/
3641
3642 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3643 {
3644     flag aSign, bSign;
3645
3646     aSign = extractFloatx80Sign( a );
3647     bSign = extractFloatx80Sign( b );
3648     if ( aSign == bSign ) {
3649         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3650     }
3651     else {
3652         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3653     }
3654
3655 }
3656
3657 /*----------------------------------------------------------------------------
3658 | Returns the result of multiplying the extended double-precision floating-
3659 | point values `a' and `b'.  The operation is performed according to the
3660 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3661 *----------------------------------------------------------------------------*/
3662
3663 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3664 {
3665     flag aSign, bSign, zSign;
3666     int32 aExp, bExp, zExp;
3667     bits64 aSig, bSig, zSig0, zSig1;
3668     floatx80 z;
3669
3670     aSig = extractFloatx80Frac( a );
3671     aExp = extractFloatx80Exp( a );
3672     aSign = extractFloatx80Sign( a );
3673     bSig = extractFloatx80Frac( b );
3674     bExp = extractFloatx80Exp( b );
3675     bSign = extractFloatx80Sign( b );
3676     zSign = aSign ^ bSign;
3677     if ( aExp == 0x7FFF ) {
3678         if (    (bits64) ( aSig<<1 )
3679              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3680             return propagateFloatx80NaN( a, b STATUS_VAR );
3681         }
3682         if ( ( bExp | bSig ) == 0 ) goto invalid;
3683         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3684     }
3685     if ( bExp == 0x7FFF ) {
3686         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3687         if ( ( aExp | aSig ) == 0 ) {
3688  invalid:
3689             float_raise( float_flag_invalid STATUS_VAR);
3690             z.low = floatx80_default_nan_low;
3691             z.high = floatx80_default_nan_high;
3692             return z;
3693         }
3694         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3695     }
3696     if ( aExp == 0 ) {
3697         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3698         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3699     }
3700     if ( bExp == 0 ) {
3701         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3702         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3703     }
3704     zExp = aExp + bExp - 0x3FFE;
3705     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3706     if ( 0 < (sbits64) zSig0 ) {
3707         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3708         --zExp;
3709     }
3710     return
3711         roundAndPackFloatx80(
3712             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3713
3714 }
3715
3716 /*----------------------------------------------------------------------------
3717 | Returns the result of dividing the extended double-precision floating-point
3718 | value `a' by the corresponding value `b'.  The operation is performed
3719 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3720 *----------------------------------------------------------------------------*/
3721
3722 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3723 {
3724     flag aSign, bSign, zSign;
3725     int32 aExp, bExp, zExp;
3726     bits64 aSig, bSig, zSig0, zSig1;
3727     bits64 rem0, rem1, rem2, term0, term1, term2;
3728     floatx80 z;
3729
3730     aSig = extractFloatx80Frac( a );
3731     aExp = extractFloatx80Exp( a );
3732     aSign = extractFloatx80Sign( a );
3733     bSig = extractFloatx80Frac( b );
3734     bExp = extractFloatx80Exp( b );
3735     bSign = extractFloatx80Sign( b );
3736     zSign = aSign ^ bSign;
3737     if ( aExp == 0x7FFF ) {
3738         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3739         if ( bExp == 0x7FFF ) {
3740             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3741             goto invalid;
3742         }
3743         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3744     }
3745     if ( bExp == 0x7FFF ) {
3746         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3747         return packFloatx80( zSign, 0, 0 );
3748     }
3749     if ( bExp == 0 ) {
3750         if ( bSig == 0 ) {
3751             if ( ( aExp | aSig ) == 0 ) {
3752  invalid:
3753                 float_raise( float_flag_invalid STATUS_VAR);
3754                 z.low = floatx80_default_nan_low;
3755                 z.high = floatx80_default_nan_high;
3756                 return z;
3757             }
3758             float_raise( float_flag_divbyzero STATUS_VAR);
3759             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3760         }
3761         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3762     }
3763     if ( aExp == 0 ) {
3764         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3765         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3766     }
3767     zExp = aExp - bExp + 0x3FFE;
3768     rem1 = 0;
3769     if ( bSig <= aSig ) {
3770         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3771         ++zExp;
3772     }
3773     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3774     mul64To128( bSig, zSig0, &term0, &term1 );
3775     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3776     while ( (sbits64) rem0 < 0 ) {
3777         --zSig0;
3778         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3779     }
3780     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3781     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3782         mul64To128( bSig, zSig1, &term1, &term2 );
3783         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3784         while ( (sbits64) rem1 < 0 ) {
3785             --zSig1;
3786             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3787         }
3788         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3789     }
3790     return
3791         roundAndPackFloatx80(
3792             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3793
3794 }
3795
3796 /*----------------------------------------------------------------------------
3797 | Returns the remainder of the extended double-precision floating-point value
3798 | `a' with respect to the corresponding value `b'.  The operation is performed
3799 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3800 *----------------------------------------------------------------------------*/
3801
3802 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3803 {
3804     flag aSign, bSign, zSign;
3805     int32 aExp, bExp, expDiff;
3806     bits64 aSig0, aSig1, bSig;
3807     bits64 q, term0, term1, alternateASig0, alternateASig1;
3808     floatx80 z;
3809
3810     aSig0 = extractFloatx80Frac( a );
3811     aExp = extractFloatx80Exp( a );
3812     aSign = extractFloatx80Sign( a );
3813     bSig = extractFloatx80Frac( b );
3814     bExp = extractFloatx80Exp( b );
3815     bSign = extractFloatx80Sign( b );
3816     if ( aExp == 0x7FFF ) {
3817         if (    (bits64) ( aSig0<<1 )
3818              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3819             return propagateFloatx80NaN( a, b STATUS_VAR );
3820         }
3821         goto invalid;
3822     }
3823     if ( bExp == 0x7FFF ) {
3824         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3825         return a;
3826     }
3827     if ( bExp == 0 ) {
3828         if ( bSig == 0 ) {
3829  invalid:
3830             float_raise( float_flag_invalid STATUS_VAR);
3831             z.low = floatx80_default_nan_low;
3832             z.high = floatx80_default_nan_high;
3833             return z;
3834         }
3835         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3836     }
3837     if ( aExp == 0 ) {
3838         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3839         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3840     }
3841     bSig |= LIT64( 0x8000000000000000 );
3842     zSign = aSign;
3843     expDiff = aExp - bExp;
3844     aSig1 = 0;
3845     if ( expDiff < 0 ) {
3846         if ( expDiff < -1 ) return a;
3847         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3848         expDiff = 0;
3849     }
3850     q = ( bSig <= aSig0 );
3851     if ( q ) aSig0 -= bSig;
3852     expDiff -= 64;
3853     while ( 0 < expDiff ) {
3854         q = estimateDiv128To64( aSig0, aSig1, bSig );
3855         q = ( 2 < q ) ? q - 2 : 0;
3856         mul64To128( bSig, q, &term0, &term1 );
3857         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3858         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3859         expDiff -= 62;
3860     }
3861     expDiff += 64;
3862     if ( 0 < expDiff ) {
3863         q = estimateDiv128To64( aSig0, aSig1, bSig );
3864         q = ( 2 < q ) ? q - 2 : 0;
3865         q >>= 64 - expDiff;
3866         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3867         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3868         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3869         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3870             ++q;
3871             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3872         }
3873     }
3874     else {
3875         term1 = 0;
3876         term0 = bSig;
3877     }
3878     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3879     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3880          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3881               && ( q & 1 ) )
3882        ) {
3883         aSig0 = alternateASig0;
3884         aSig1 = alternateASig1;
3885         zSign = ! zSign;
3886     }
3887     return
3888         normalizeRoundAndPackFloatx80(
3889             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3890
3891 }
3892
3893 /*----------------------------------------------------------------------------
3894 | Returns the square root of the extended double-precision floating-point
3895 | value `a'.  The operation is performed according to the IEC/IEEE Standard
3896 | for Binary Floating-Point Arithmetic.
3897 *----------------------------------------------------------------------------*/
3898
3899 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3900 {
3901     flag aSign;
3902     int32 aExp, zExp;
3903     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3904     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3905     floatx80 z;
3906
3907     aSig0 = extractFloatx80Frac( a );
3908     aExp = extractFloatx80Exp( a );
3909     aSign = extractFloatx80Sign( a );
3910     if ( aExp == 0x7FFF ) {
3911         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3912         if ( ! aSign ) return a;
3913         goto invalid;
3914     }
3915     if ( aSign ) {
3916         if ( ( aExp | aSig0 ) == 0 ) return a;
3917  invalid:
3918         float_raise( float_flag_invalid STATUS_VAR);
3919         z.low = floatx80_default_nan_low;
3920         z.high = floatx80_default_nan_high;
3921         return z;
3922     }
3923     if ( aExp == 0 ) {
3924         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3925         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3926     }
3927     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3928     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3929     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3930     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3931     doubleZSig0 = zSig0<<1;
3932     mul64To128( zSig0, zSig0, &term0, &term1 );
3933     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3934     while ( (sbits64) rem0 < 0 ) {
3935         --zSig0;
3936         doubleZSig0 -= 2;
3937         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3938     }
3939     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3940     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3941         if ( zSig1 == 0 ) zSig1 = 1;
3942         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3943         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3944         mul64To128( zSig1, zSig1, &term2, &term3 );
3945         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3946         while ( (sbits64) rem1 < 0 ) {
3947             --zSig1;
3948             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3949             term3 |= 1;
3950             term2 |= doubleZSig0;
3951             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3952         }
3953         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3954     }
3955     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3956     zSig0 |= doubleZSig0;
3957     return
3958         roundAndPackFloatx80(
3959             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3960
3961 }
3962
3963 /*----------------------------------------------------------------------------
3964 | Returns 1 if the extended double-precision floating-point value `a' is
3965 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3966 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3967 | Arithmetic.
3968 *----------------------------------------------------------------------------*/
3969
3970 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3971 {
3972
3973     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3974               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3975          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3976               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3977        ) {
3978         if (    floatx80_is_signaling_nan( a )
3979              || floatx80_is_signaling_nan( b ) ) {
3980             float_raise( float_flag_invalid STATUS_VAR);
3981         }
3982         return 0;
3983     }
3984     return
3985            ( a.low == b.low )
3986         && (    ( a.high == b.high )
3987              || (    ( a.low == 0 )
3988                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3989            );
3990
3991 }
3992
3993 /*----------------------------------------------------------------------------
3994 | Returns 1 if the extended double-precision floating-point value `a' is
3995 | less than or equal to the corresponding value `b', and 0 otherwise.  The
3996 | comparison is performed according to the IEC/IEEE Standard for Binary
3997 | Floating-Point Arithmetic.
3998 *----------------------------------------------------------------------------*/
3999
4000 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4001 {
4002     flag aSign, bSign;
4003
4004     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4005               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4006          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4007               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4008        ) {
4009         float_raise( float_flag_invalid STATUS_VAR);
4010         return 0;
4011     }
4012     aSign = extractFloatx80Sign( a );
4013     bSign = extractFloatx80Sign( b );
4014     if ( aSign != bSign ) {
4015         return
4016                aSign
4017             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4018                  == 0 );
4019     }
4020     return
4021           aSign ? le128( b.high, b.low, a.high, a.low )
4022         : le128( a.high, a.low, b.high, b.low );
4023
4024 }
4025
4026 /*----------------------------------------------------------------------------
4027 | Returns 1 if the extended double-precision floating-point value `a' is
4028 | less than the corresponding value `b', and 0 otherwise.  The comparison
4029 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4030 | Arithmetic.
4031 *----------------------------------------------------------------------------*/
4032
4033 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4034 {
4035     flag aSign, bSign;
4036
4037     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4038               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4039          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4040               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4041        ) {
4042         float_raise( float_flag_invalid STATUS_VAR);
4043         return 0;
4044     }
4045     aSign = extractFloatx80Sign( a );
4046     bSign = extractFloatx80Sign( b );
4047     if ( aSign != bSign ) {
4048         return
4049                aSign
4050             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4051                  != 0 );
4052     }
4053     return
4054           aSign ? lt128( b.high, b.low, a.high, a.low )
4055         : lt128( a.high, a.low, b.high, b.low );
4056
4057 }
4058
4059 /*----------------------------------------------------------------------------
4060 | Returns 1 if the extended double-precision floating-point value `a' is equal
4061 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4062 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4064 *----------------------------------------------------------------------------*/
4065
4066 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4067 {
4068
4069     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4070               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4071          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4072               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4073        ) {
4074         float_raise( float_flag_invalid STATUS_VAR);
4075         return 0;
4076     }
4077     return
4078            ( a.low == b.low )
4079         && (    ( a.high == b.high )
4080              || (    ( a.low == 0 )
4081                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4082            );
4083
4084 }
4085
4086 /*----------------------------------------------------------------------------
4087 | Returns 1 if the extended double-precision floating-point value `a' is less
4088 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4089 | do not cause an exception.  Otherwise, the comparison is performed according
4090 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4091 *----------------------------------------------------------------------------*/
4092
4093 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4094 {
4095     flag aSign, bSign;
4096
4097     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4098               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4099          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4100               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4101        ) {
4102         if (    floatx80_is_signaling_nan( a )
4103              || floatx80_is_signaling_nan( b ) ) {
4104             float_raise( float_flag_invalid STATUS_VAR);
4105         }
4106         return 0;
4107     }
4108     aSign = extractFloatx80Sign( a );
4109     bSign = extractFloatx80Sign( b );
4110     if ( aSign != bSign ) {
4111         return
4112                aSign
4113             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4114                  == 0 );
4115     }
4116     return
4117           aSign ? le128( b.high, b.low, a.high, a.low )
4118         : le128( a.high, a.low, b.high, b.low );
4119
4120 }
4121
4122 /*----------------------------------------------------------------------------
4123 | Returns 1 if the extended double-precision floating-point value `a' is less
4124 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4125 | an exception.  Otherwise, the comparison is performed according to the
4126 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4127 *----------------------------------------------------------------------------*/
4128
4129 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4130 {
4131     flag aSign, bSign;
4132
4133     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4134               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4135          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4136               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4137        ) {
4138         if (    floatx80_is_signaling_nan( a )
4139              || floatx80_is_signaling_nan( b ) ) {
4140             float_raise( float_flag_invalid STATUS_VAR);
4141         }
4142         return 0;
4143     }
4144     aSign = extractFloatx80Sign( a );
4145     bSign = extractFloatx80Sign( b );
4146     if ( aSign != bSign ) {
4147         return
4148                aSign
4149             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4150                  != 0 );
4151     }
4152     return
4153           aSign ? lt128( b.high, b.low, a.high, a.low )
4154         : lt128( a.high, a.low, b.high, b.low );
4155
4156 }
4157
4158 #endif
4159
4160 #ifdef FLOAT128
4161
4162 /*----------------------------------------------------------------------------
4163 | Returns the result of converting the quadruple-precision floating-point
4164 | value `a' to the 32-bit two's complement integer format.  The conversion
4165 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4166 | Arithmetic---which means in particular that the conversion is rounded
4167 | according to the current rounding mode.  If `a' is a NaN, the largest
4168 | positive integer is returned.  Otherwise, if the conversion overflows, the
4169 | largest integer with the same sign as `a' is returned.
4170 *----------------------------------------------------------------------------*/
4171
4172 int32 float128_to_int32( float128 a STATUS_PARAM )
4173 {
4174     flag aSign;
4175     int32 aExp, shiftCount;
4176     bits64 aSig0, aSig1;
4177
4178     aSig1 = extractFloat128Frac1( a );
4179     aSig0 = extractFloat128Frac0( a );
4180     aExp = extractFloat128Exp( a );
4181     aSign = extractFloat128Sign( a );
4182     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4183     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4184     aSig0 |= ( aSig1 != 0 );
4185     shiftCount = 0x4028 - aExp;
4186     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4187     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4188
4189 }
4190
4191 /*----------------------------------------------------------------------------
4192 | Returns the result of converting the quadruple-precision floating-point
4193 | value `a' to the 32-bit two's complement integer format.  The conversion
4194 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4195 | Arithmetic, except that the conversion is always rounded toward zero.  If
4196 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4197 | conversion overflows, the largest integer with the same sign as `a' is
4198 | returned.
4199 *----------------------------------------------------------------------------*/
4200
4201 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4202 {
4203     flag aSign;
4204     int32 aExp, shiftCount;
4205     bits64 aSig0, aSig1, savedASig;
4206     int32 z;
4207
4208     aSig1 = extractFloat128Frac1( a );
4209     aSig0 = extractFloat128Frac0( a );
4210     aExp = extractFloat128Exp( a );
4211     aSign = extractFloat128Sign( a );
4212     aSig0 |= ( aSig1 != 0 );
4213     if ( 0x401E < aExp ) {
4214         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4215         goto invalid;
4216     }
4217     else if ( aExp < 0x3FFF ) {
4218         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4219         return 0;
4220     }
4221     aSig0 |= LIT64( 0x0001000000000000 );
4222     shiftCount = 0x402F - aExp;
4223     savedASig = aSig0;
4224     aSig0 >>= shiftCount;
4225     z = aSig0;
4226     if ( aSign ) z = - z;
4227     if ( ( z < 0 ) ^ aSign ) {
4228  invalid:
4229         float_raise( float_flag_invalid STATUS_VAR);
4230         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4231     }
4232     if ( ( aSig0<<shiftCount ) != savedASig ) {
4233         STATUS(float_exception_flags) |= float_flag_inexact;
4234     }
4235     return z;
4236
4237 }
4238
4239 /*----------------------------------------------------------------------------
4240 | Returns the result of converting the quadruple-precision floating-point
4241 | value `a' to the 64-bit two's complement integer format.  The conversion
4242 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4243 | Arithmetic---which means in particular that the conversion is rounded
4244 | according to the current rounding mode.  If `a' is a NaN, the largest
4245 | positive integer is returned.  Otherwise, if the conversion overflows, the
4246 | largest integer with the same sign as `a' is returned.
4247 *----------------------------------------------------------------------------*/
4248
4249 int64 float128_to_int64( float128 a STATUS_PARAM )
4250 {
4251     flag aSign;
4252     int32 aExp, shiftCount;
4253     bits64 aSig0, aSig1;
4254
4255     aSig1 = extractFloat128Frac1( a );
4256     aSig0 = extractFloat128Frac0( a );
4257     aExp = extractFloat128Exp( a );
4258     aSign = extractFloat128Sign( a );
4259     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4260     shiftCount = 0x402F - aExp;
4261     if ( shiftCount <= 0 ) {
4262         if ( 0x403E < aExp ) {
4263             float_raise( float_flag_invalid STATUS_VAR);
4264             if (    ! aSign
4265                  || (    ( aExp == 0x7FFF )
4266                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4267                     )
4268                ) {
4269                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4270             }
4271             return (sbits64) LIT64( 0x8000000000000000 );
4272         }
4273         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4274     }
4275     else {
4276         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4277     }
4278     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4279
4280 }
4281
4282 /*----------------------------------------------------------------------------
4283 | Returns the result of converting the quadruple-precision floating-point
4284 | value `a' to the 64-bit two's complement integer format.  The conversion
4285 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4286 | Arithmetic, except that the conversion is always rounded toward zero.
4287 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4288 | the conversion overflows, the largest integer with the same sign as `a' is
4289 | returned.
4290 *----------------------------------------------------------------------------*/
4291
4292 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4293 {
4294     flag aSign;
4295     int32 aExp, shiftCount;
4296     bits64 aSig0, aSig1;
4297     int64 z;
4298
4299     aSig1 = extractFloat128Frac1( a );
4300     aSig0 = extractFloat128Frac0( a );
4301     aExp = extractFloat128Exp( a );
4302     aSign = extractFloat128Sign( a );
4303     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4304     shiftCount = aExp - 0x402F;
4305     if ( 0 < shiftCount ) {
4306         if ( 0x403E <= aExp ) {
4307             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4308             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4309                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4310                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4311             }
4312             else {
4313                 float_raise( float_flag_invalid STATUS_VAR);
4314                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4315                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4316                 }
4317             }
4318             return (sbits64) LIT64( 0x8000000000000000 );
4319         }
4320         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4321         if ( (bits64) ( aSig1<<shiftCount ) ) {
4322             STATUS(float_exception_flags) |= float_flag_inexact;
4323         }
4324     }
4325     else {
4326         if ( aExp < 0x3FFF ) {
4327             if ( aExp | aSig0 | aSig1 ) {
4328                 STATUS(float_exception_flags) |= float_flag_inexact;
4329             }
4330             return 0;
4331         }
4332         z = aSig0>>( - shiftCount );
4333         if (    aSig1
4334              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4335             STATUS(float_exception_flags) |= float_flag_inexact;
4336         }
4337     }
4338     if ( aSign ) z = - z;
4339     return z;
4340
4341 }
4342
4343 /*----------------------------------------------------------------------------
4344 | Returns the result of converting the quadruple-precision floating-point
4345 | value `a' to the single-precision floating-point format.  The conversion
4346 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4347 | Arithmetic.
4348 *----------------------------------------------------------------------------*/
4349
4350 float32 float128_to_float32( float128 a STATUS_PARAM )
4351 {
4352     flag aSign;
4353     int32 aExp;
4354     bits64 aSig0, aSig1;
4355     bits32 zSig;
4356
4357     aSig1 = extractFloat128Frac1( a );
4358     aSig0 = extractFloat128Frac0( a );
4359     aExp = extractFloat128Exp( a );
4360     aSign = extractFloat128Sign( a );
4361     if ( aExp == 0x7FFF ) {
4362         if ( aSig0 | aSig1 ) {
4363             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4364         }
4365         return packFloat32( aSign, 0xFF, 0 );
4366     }
4367     aSig0 |= ( aSig1 != 0 );
4368     shift64RightJamming( aSig0, 18, &aSig0 );
4369     zSig = aSig0;
4370     if ( aExp || zSig ) {
4371         zSig |= 0x40000000;
4372         aExp -= 0x3F81;
4373     }
4374     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4375
4376 }
4377
4378 /*----------------------------------------------------------------------------
4379 | Returns the result of converting the quadruple-precision floating-point
4380 | value `a' to the double-precision floating-point format.  The conversion
4381 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4382 | Arithmetic.
4383 *----------------------------------------------------------------------------*/
4384
4385 float64 float128_to_float64( float128 a STATUS_PARAM )
4386 {
4387     flag aSign;
4388     int32 aExp;
4389     bits64 aSig0, aSig1;
4390
4391     aSig1 = extractFloat128Frac1( a );
4392     aSig0 = extractFloat128Frac0( a );
4393     aExp = extractFloat128Exp( a );
4394     aSign = extractFloat128Sign( a );
4395     if ( aExp == 0x7FFF ) {
4396         if ( aSig0 | aSig1 ) {
4397             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4398         }
4399         return packFloat64( aSign, 0x7FF, 0 );
4400     }
4401     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4402     aSig0 |= ( aSig1 != 0 );
4403     if ( aExp || aSig0 ) {
4404         aSig0 |= LIT64( 0x4000000000000000 );
4405         aExp -= 0x3C01;
4406     }
4407     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4408
4409 }
4410
4411 #ifdef FLOATX80
4412
4413 /*----------------------------------------------------------------------------
4414 | Returns the result of converting the quadruple-precision floating-point
4415 | value `a' to the extended double-precision floating-point format.  The
4416 | conversion is performed according to the IEC/IEEE Standard for Binary
4417 | Floating-Point Arithmetic.
4418 *----------------------------------------------------------------------------*/
4419
4420 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4421 {
4422     flag aSign;
4423     int32 aExp;
4424     bits64 aSig0, aSig1;
4425
4426     aSig1 = extractFloat128Frac1( a );
4427     aSig0 = extractFloat128Frac0( a );
4428     aExp = extractFloat128Exp( a );
4429     aSign = extractFloat128Sign( a );
4430     if ( aExp == 0x7FFF ) {
4431         if ( aSig0 | aSig1 ) {
4432             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4433         }
4434         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4435     }
4436     if ( aExp == 0 ) {
4437         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4438         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4439     }
4440     else {
4441         aSig0 |= LIT64( 0x0001000000000000 );
4442     }
4443     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4444     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4445
4446 }
4447
4448 #endif
4449
4450 /*----------------------------------------------------------------------------
4451 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4452 | returns the result as a quadruple-precision floating-point value.  The
4453 | operation is performed according to the IEC/IEEE Standard for Binary
4454 | Floating-Point Arithmetic.
4455 *----------------------------------------------------------------------------*/
4456
4457 float128 float128_round_to_int( float128 a STATUS_PARAM )
4458 {
4459     flag aSign;
4460     int32 aExp;
4461     bits64 lastBitMask, roundBitsMask;
4462     int8 roundingMode;
4463     float128 z;
4464
4465     aExp = extractFloat128Exp( a );
4466     if ( 0x402F <= aExp ) {
4467         if ( 0x406F <= aExp ) {
4468             if (    ( aExp == 0x7FFF )
4469                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4470                ) {
4471                 return propagateFloat128NaN( a, a STATUS_VAR );
4472             }
4473             return a;
4474         }
4475         lastBitMask = 1;
4476         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4477         roundBitsMask = lastBitMask - 1;
4478         z = a;
4479         roundingMode = STATUS(float_rounding_mode);
4480         if ( roundingMode == float_round_nearest_even ) {
4481             if ( lastBitMask ) {
4482                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4483                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4484             }
4485             else {
4486                 if ( (sbits64) z.low < 0 ) {
4487                     ++z.high;
4488                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4489                 }
4490             }
4491         }
4492         else if ( roundingMode != float_round_to_zero ) {
4493             if (   extractFloat128Sign( z )
4494                  ^ ( roundingMode == float_round_up ) ) {
4495                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4496             }
4497         }
4498         z.low &= ~ roundBitsMask;
4499     }
4500     else {
4501         if ( aExp < 0x3FFF ) {
4502             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4503             STATUS(float_exception_flags) |= float_flag_inexact;
4504             aSign = extractFloat128Sign( a );
4505             switch ( STATUS(float_rounding_mode) ) {
4506              case float_round_nearest_even:
4507                 if (    ( aExp == 0x3FFE )
4508                      && (   extractFloat128Frac0( a )
4509                           | extractFloat128Frac1( a ) )
4510                    ) {
4511                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4512                 }
4513                 break;
4514              case float_round_down:
4515                 return
4516                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4517                     : packFloat128( 0, 0, 0, 0 );
4518              case float_round_up:
4519                 return
4520                       aSign ? packFloat128( 1, 0, 0, 0 )
4521                     : packFloat128( 0, 0x3FFF, 0, 0 );
4522             }
4523             return packFloat128( aSign, 0, 0, 0 );
4524         }
4525         lastBitMask = 1;
4526         lastBitMask <<= 0x402F - aExp;
4527         roundBitsMask = lastBitMask - 1;
4528         z.low = 0;
4529         z.high = a.high;
4530         roundingMode = STATUS(float_rounding_mode);
4531         if ( roundingMode == float_round_nearest_even ) {
4532             z.high += lastBitMask>>1;
4533             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4534                 z.high &= ~ lastBitMask;
4535             }
4536         }
4537         else if ( roundingMode != float_round_to_zero ) {
4538             if (   extractFloat128Sign( z )
4539                  ^ ( roundingMode == float_round_up ) ) {
4540                 z.high |= ( a.low != 0 );
4541                 z.high += roundBitsMask;
4542             }
4543         }
4544         z.high &= ~ roundBitsMask;
4545     }
4546     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4547         STATUS(float_exception_flags) |= float_flag_inexact;
4548     }
4549     return z;
4550
4551 }
4552
4553 /*----------------------------------------------------------------------------
4554 | Returns the result of adding the absolute values of the quadruple-precision
4555 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4556 | before being returned.  `zSign' is ignored if the result is a NaN.
4557 | The addition is performed according to the IEC/IEEE Standard for Binary
4558 | Floating-Point Arithmetic.
4559 *----------------------------------------------------------------------------*/
4560
4561 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4562 {
4563     int32 aExp, bExp, zExp;
4564     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4565     int32 expDiff;
4566
4567     aSig1 = extractFloat128Frac1( a );
4568     aSig0 = extractFloat128Frac0( a );
4569     aExp = extractFloat128Exp( a );
4570     bSig1 = extractFloat128Frac1( b );
4571     bSig0 = extractFloat128Frac0( b );
4572     bExp = extractFloat128Exp( b );
4573     expDiff = aExp - bExp;
4574     if ( 0 < expDiff ) {
4575         if ( aExp == 0x7FFF ) {
4576             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4577             return a;
4578         }
4579         if ( bExp == 0 ) {
4580             --expDiff;
4581         }
4582         else {
4583             bSig0 |= LIT64( 0x0001000000000000 );
4584         }
4585         shift128ExtraRightJamming(
4586             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4587         zExp = aExp;
4588     }
4589     else if ( expDiff < 0 ) {
4590         if ( bExp == 0x7FFF ) {
4591             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4592             return packFloat128( zSign, 0x7FFF, 0, 0 );
4593         }
4594         if ( aExp == 0 ) {
4595             ++expDiff;
4596         }
4597         else {
4598             aSig0 |= LIT64( 0x0001000000000000 );
4599         }
4600         shift128ExtraRightJamming(
4601             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4602         zExp = bExp;
4603     }
4604     else {
4605         if ( aExp == 0x7FFF ) {
4606             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4607                 return propagateFloat128NaN( a, b STATUS_VAR );
4608             }
4609             return a;
4610         }
4611         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4612         if ( aExp == 0 ) {
4613             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
4614             return packFloat128( zSign, 0, zSig0, zSig1 );
4615         }
4616         zSig2 = 0;
4617         zSig0 |= LIT64( 0x0002000000000000 );
4618         zExp = aExp;
4619         goto shiftRight1;
4620     }
4621     aSig0 |= LIT64( 0x0001000000000000 );
4622     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4623     --zExp;
4624     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4625     ++zExp;
4626  shiftRight1:
4627     shift128ExtraRightJamming(
4628         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4629  roundAndPack:
4630     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4631
4632 }
4633
4634 /*----------------------------------------------------------------------------
4635 | Returns the result of subtracting the absolute values of the quadruple-
4636 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
4637 | difference is negated before being returned.  `zSign' is ignored if the
4638 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4639 | Standard for Binary Floating-Point Arithmetic.
4640 *----------------------------------------------------------------------------*/
4641
4642 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4643 {
4644     int32 aExp, bExp, zExp;
4645     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4646     int32 expDiff;
4647     float128 z;
4648
4649     aSig1 = extractFloat128Frac1( a );
4650     aSig0 = extractFloat128Frac0( a );
4651     aExp = extractFloat128Exp( a );
4652     bSig1 = extractFloat128Frac1( b );
4653     bSig0 = extractFloat128Frac0( b );
4654     bExp = extractFloat128Exp( b );
4655     expDiff = aExp - bExp;
4656     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4657     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4658     if ( 0 < expDiff ) goto aExpBigger;
4659     if ( expDiff < 0 ) goto bExpBigger;
4660     if ( aExp == 0x7FFF ) {
4661         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4662             return propagateFloat128NaN( a, b STATUS_VAR );
4663         }
4664         float_raise( float_flag_invalid STATUS_VAR);
4665         z.low = float128_default_nan_low;
4666         z.high = float128_default_nan_high;
4667         return z;
4668     }
4669     if ( aExp == 0 ) {
4670         aExp = 1;
4671         bExp = 1;
4672     }
4673     if ( bSig0 < aSig0 ) goto aBigger;
4674     if ( aSig0 < bSig0 ) goto bBigger;
4675     if ( bSig1 < aSig1 ) goto aBigger;
4676     if ( aSig1 < bSig1 ) goto bBigger;
4677     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4678  bExpBigger:
4679     if ( bExp == 0x7FFF ) {
4680         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4681         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4682     }
4683     if ( aExp == 0 ) {
4684         ++expDiff;
4685     }
4686     else {
4687         aSig0 |= LIT64( 0x4000000000000000 );
4688     }
4689     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4690     bSig0 |= LIT64( 0x4000000000000000 );
4691  bBigger:
4692     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4693     zExp = bExp;
4694     zSign ^= 1;
4695     goto normalizeRoundAndPack;
4696  aExpBigger:
4697     if ( aExp == 0x7FFF ) {
4698         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4699         return a;
4700     }
4701     if ( bExp == 0 ) {
4702         --expDiff;
4703     }
4704     else {
4705         bSig0 |= LIT64( 0x4000000000000000 );
4706     }
4707     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4708     aSig0 |= LIT64( 0x4000000000000000 );
4709  aBigger:
4710     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4711     zExp = aExp;
4712  normalizeRoundAndPack:
4713     --zExp;
4714     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4715
4716 }
4717
4718 /*----------------------------------------------------------------------------
4719 | Returns the result of adding the quadruple-precision floating-point values
4720 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4721 | for Binary Floating-Point Arithmetic.
4722 *----------------------------------------------------------------------------*/
4723
4724 float128 float128_add( float128 a, float128 b STATUS_PARAM )
4725 {
4726     flag aSign, bSign;
4727
4728     aSign = extractFloat128Sign( a );
4729     bSign = extractFloat128Sign( b );
4730     if ( aSign == bSign ) {
4731         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4732     }
4733     else {
4734         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4735     }
4736
4737 }
4738
4739 /*----------------------------------------------------------------------------
4740 | Returns the result of subtracting the quadruple-precision floating-point
4741 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4742 | Standard for Binary Floating-Point Arithmetic.
4743 *----------------------------------------------------------------------------*/
4744
4745 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4746 {
4747     flag aSign, bSign;
4748
4749     aSign = extractFloat128Sign( a );
4750     bSign = extractFloat128Sign( b );
4751     if ( aSign == bSign ) {
4752         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4753     }
4754     else {
4755         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4756     }
4757
4758 }
4759
4760 /*----------------------------------------------------------------------------
4761 | Returns the result of multiplying the quadruple-precision floating-point
4762 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4763 | Standard for Binary Floating-Point Arithmetic.
4764 *----------------------------------------------------------------------------*/
4765
4766 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4767 {
4768     flag aSign, bSign, zSign;
4769     int32 aExp, bExp, zExp;
4770     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4771     float128 z;
4772
4773     aSig1 = extractFloat128Frac1( a );
4774     aSig0 = extractFloat128Frac0( a );
4775     aExp = extractFloat128Exp( a );
4776     aSign = extractFloat128Sign( a );
4777     bSig1 = extractFloat128Frac1( b );
4778     bSig0 = extractFloat128Frac0( b );
4779     bExp = extractFloat128Exp( b );
4780     bSign = extractFloat128Sign( b );
4781     zSign = aSign ^ bSign;
4782     if ( aExp == 0x7FFF ) {
4783         if (    ( aSig0 | aSig1 )
4784              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4785             return propagateFloat128NaN( a, b STATUS_VAR );
4786         }
4787         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4788         return packFloat128( zSign, 0x7FFF, 0, 0 );
4789     }
4790     if ( bExp == 0x7FFF ) {
4791         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4792         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4793  invalid:
4794             float_raise( float_flag_invalid STATUS_VAR);
4795             z.low = float128_default_nan_low;
4796             z.high = float128_default_nan_high;
4797             return z;
4798         }
4799         return packFloat128( zSign, 0x7FFF, 0, 0 );
4800     }
4801     if ( aExp == 0 ) {
4802         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4803         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4804     }
4805     if ( bExp == 0 ) {
4806         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4807         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4808     }
4809     zExp = aExp + bExp - 0x4000;
4810     aSig0 |= LIT64( 0x0001000000000000 );
4811     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4812     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4813     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4814     zSig2 |= ( zSig3 != 0 );
4815     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4816         shift128ExtraRightJamming(
4817             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4818         ++zExp;
4819     }
4820     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4821
4822 }
4823
4824 /*----------------------------------------------------------------------------
4825 | Returns the result of dividing the quadruple-precision floating-point value
4826 | `a' by the corresponding value `b'.  The operation is performed according to
4827 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4828 *----------------------------------------------------------------------------*/
4829
4830 float128 float128_div( float128 a, float128 b STATUS_PARAM )
4831 {
4832     flag aSign, bSign, zSign;
4833     int32 aExp, bExp, zExp;
4834     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4835     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4836     float128 z;
4837
4838     aSig1 = extractFloat128Frac1( a );
4839     aSig0 = extractFloat128Frac0( a );
4840     aExp = extractFloat128Exp( a );
4841     aSign = extractFloat128Sign( a );
4842     bSig1 = extractFloat128Frac1( b );
4843     bSig0 = extractFloat128Frac0( b );
4844     bExp = extractFloat128Exp( b );
4845     bSign = extractFloat128Sign( b );
4846     zSign = aSign ^ bSign;
4847     if ( aExp == 0x7FFF ) {
4848         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4849         if ( bExp == 0x7FFF ) {
4850             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4851             goto invalid;
4852         }
4853         return packFloat128( zSign, 0x7FFF, 0, 0 );
4854     }
4855     if ( bExp == 0x7FFF ) {
4856         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4857         return packFloat128( zSign, 0, 0, 0 );
4858     }
4859     if ( bExp == 0 ) {
4860         if ( ( bSig0 | bSig1 ) == 0 ) {
4861             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4862  invalid:
4863                 float_raise( float_flag_invalid STATUS_VAR);
4864                 z.low = float128_default_nan_low;
4865                 z.high = float128_default_nan_high;
4866                 return z;
4867             }
4868             float_raise( float_flag_divbyzero STATUS_VAR);
4869             return packFloat128( zSign, 0x7FFF, 0, 0 );
4870         }
4871         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4872     }
4873     if ( aExp == 0 ) {
4874         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4875         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4876     }
4877     zExp = aExp - bExp + 0x3FFD;
4878     shortShift128Left(
4879         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4880     shortShift128Left(
4881         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4882     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4883         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4884         ++zExp;
4885     }
4886     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4887     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4888     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4889     while ( (sbits64) rem0 < 0 ) {
4890         --zSig0;
4891         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4892     }
4893     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4894     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4895         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4896         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4897         while ( (sbits64) rem1 < 0 ) {
4898             --zSig1;
4899             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4900         }
4901         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4902     }
4903     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4904     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4905
4906 }
4907
4908 /*----------------------------------------------------------------------------
4909 | Returns the remainder of the quadruple-precision floating-point value `a'
4910 | with respect to the corresponding value `b'.  The operation is performed
4911 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4912 *----------------------------------------------------------------------------*/
4913
4914 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4915 {
4916     flag aSign, bSign, zSign;
4917     int32 aExp, bExp, expDiff;
4918     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4919     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4920     sbits64 sigMean0;
4921     float128 z;
4922
4923     aSig1 = extractFloat128Frac1( a );
4924     aSig0 = extractFloat128Frac0( a );
4925     aExp = extractFloat128Exp( a );
4926     aSign = extractFloat128Sign( a );
4927     bSig1 = extractFloat128Frac1( b );
4928     bSig0 = extractFloat128Frac0( b );
4929     bExp = extractFloat128Exp( b );
4930     bSign = extractFloat128Sign( b );
4931     if ( aExp == 0x7FFF ) {
4932         if (    ( aSig0 | aSig1 )
4933              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4934             return propagateFloat128NaN( a, b STATUS_VAR );
4935         }
4936         goto invalid;
4937     }
4938     if ( bExp == 0x7FFF ) {
4939         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4940         return a;
4941     }
4942     if ( bExp == 0 ) {
4943         if ( ( bSig0 | bSig1 ) == 0 ) {
4944  invalid:
4945             float_raise( float_flag_invalid STATUS_VAR);
4946             z.low = float128_default_nan_low;
4947             z.high = float128_default_nan_high;
4948             return z;
4949         }
4950         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4951     }
4952     if ( aExp == 0 ) {
4953         if ( ( aSig0 | aSig1 ) == 0 ) return a;
4954         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4955     }
4956     expDiff = aExp - bExp;
4957     if ( expDiff < -1 ) return a;
4958     shortShift128Left(
4959         aSig0 | LIT64( 0x0001000000000000 ),
4960         aSig1,
4961         15 - ( expDiff < 0 ),
4962         &aSig0,
4963         &aSig1
4964     );
4965     shortShift128Left(
4966         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4967     q = le128( bSig0, bSig1, aSig0, aSig1 );
4968     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4969     expDiff -= 64;
4970     while ( 0 < expDiff ) {
4971         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4972         q = ( 4 < q ) ? q - 4 : 0;
4973         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4974         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4975         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4976         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4977         expDiff -= 61;
4978     }
4979     if ( -64 < expDiff ) {
4980         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4981         q = ( 4 < q ) ? q - 4 : 0;
4982         q >>= - expDiff;
4983         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4984         expDiff += 52;
4985         if ( expDiff < 0 ) {
4986             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4987         }
4988         else {
4989             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4990         }
4991         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4992         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4993     }
4994     else {
4995         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4996         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4997     }
4998     do {
4999         alternateASig0 = aSig0;
5000         alternateASig1 = aSig1;
5001         ++q;
5002         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5003     } while ( 0 <= (sbits64) aSig0 );
5004     add128(
5005         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5006     if (    ( sigMean0 < 0 )
5007          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5008         aSig0 = alternateASig0;
5009         aSig1 = alternateASig1;
5010     }
5011     zSign = ( (sbits64) aSig0 < 0 );
5012     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5013     return
5014         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5015
5016 }
5017
5018 /*----------------------------------------------------------------------------
5019 | Returns the square root of the quadruple-precision floating-point value `a'.
5020 | The operation is performed according to the IEC/IEEE Standard for Binary
5021 | Floating-Point Arithmetic.
5022 *----------------------------------------------------------------------------*/
5023
5024 float128 float128_sqrt( float128 a STATUS_PARAM )
5025 {
5026     flag aSign;
5027     int32 aExp, zExp;
5028     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5029     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5030     float128 z;
5031
5032     aSig1 = extractFloat128Frac1( a );
5033     aSig0 = extractFloat128Frac0( a );
5034     aExp = extractFloat128Exp( a );
5035     aSign = extractFloat128Sign( a );
5036     if ( aExp == 0x7FFF ) {
5037         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5038         if ( ! aSign ) return a;
5039         goto invalid;
5040     }
5041     if ( aSign ) {
5042         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5043  invalid:
5044         float_raise( float_flag_invalid STATUS_VAR);
5045         z.low = float128_default_nan_low;
5046         z.high = float128_default_nan_high;
5047         return z;
5048     }
5049     if ( aExp == 0 ) {
5050         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5051         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5052     }
5053     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5054     aSig0 |= LIT64( 0x0001000000000000 );
5055     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5056     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5057     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5058     doubleZSig0 = zSig0<<1;
5059     mul64To128( zSig0, zSig0, &term0, &term1 );
5060     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5061     while ( (sbits64) rem0 < 0 ) {
5062         --zSig0;
5063         doubleZSig0 -= 2;
5064         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5065     }
5066     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5067     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5068         if ( zSig1 == 0 ) zSig1 = 1;
5069         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5070         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5071         mul64To128( zSig1, zSig1, &term2, &term3 );
5072         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5073         while ( (sbits64) rem1 < 0 ) {
5074             --zSig1;
5075             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5076             term3 |= 1;
5077             term2 |= doubleZSig0;
5078             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5079         }
5080         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5081     }
5082     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5083     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5084
5085 }
5086
5087 /*----------------------------------------------------------------------------
5088 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5089 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5090 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5091 *----------------------------------------------------------------------------*/
5092
5093 int float128_eq( float128 a, float128 b STATUS_PARAM )
5094 {
5095
5096     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5097               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5098          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5099               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5100        ) {
5101         if (    float128_is_signaling_nan( a )
5102              || float128_is_signaling_nan( b ) ) {
5103             float_raise( float_flag_invalid STATUS_VAR);
5104         }
5105         return 0;
5106     }
5107     return
5108            ( a.low == b.low )
5109         && (    ( a.high == b.high )
5110              || (    ( a.low == 0 )
5111                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5112            );
5113
5114 }
5115
5116 /*----------------------------------------------------------------------------
5117 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5118 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5119 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5120 | Arithmetic.
5121 *----------------------------------------------------------------------------*/
5122
5123 int float128_le( float128 a, float128 b STATUS_PARAM )
5124 {
5125     flag aSign, bSign;
5126
5127     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5128               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5129          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5130               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5131        ) {
5132         float_raise( float_flag_invalid STATUS_VAR);
5133         return 0;
5134     }
5135     aSign = extractFloat128Sign( a );
5136     bSign = extractFloat128Sign( b );
5137     if ( aSign != bSign ) {
5138         return
5139                aSign
5140             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5141                  == 0 );
5142     }
5143     return
5144           aSign ? le128( b.high, b.low, a.high, a.low )
5145         : le128( a.high, a.low, b.high, b.low );
5146
5147 }
5148
5149 /*----------------------------------------------------------------------------
5150 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5151 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5152 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5153 *----------------------------------------------------------------------------*/
5154
5155 int float128_lt( float128 a, float128 b STATUS_PARAM )
5156 {
5157     flag aSign, bSign;
5158
5159     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5160               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5161          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5162               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5163        ) {
5164         float_raise( float_flag_invalid STATUS_VAR);
5165         return 0;
5166     }
5167     aSign = extractFloat128Sign( a );
5168     bSign = extractFloat128Sign( b );
5169     if ( aSign != bSign ) {
5170         return
5171                aSign
5172             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5173                  != 0 );
5174     }
5175     return
5176           aSign ? lt128( b.high, b.low, a.high, a.low )
5177         : lt128( a.high, a.low, b.high, b.low );
5178
5179 }
5180
5181 /*----------------------------------------------------------------------------
5182 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5183 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5184 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5185 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5186 *----------------------------------------------------------------------------*/
5187
5188 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5189 {
5190
5191     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5192               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5193          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5194               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5195        ) {
5196         float_raise( float_flag_invalid STATUS_VAR);
5197         return 0;
5198     }
5199     return
5200            ( a.low == b.low )
5201         && (    ( a.high == b.high )
5202              || (    ( a.low == 0 )
5203                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5204            );
5205
5206 }
5207
5208 /*----------------------------------------------------------------------------
5209 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5210 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5211 | cause an exception.  Otherwise, the comparison is performed according to the
5212 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5213 *----------------------------------------------------------------------------*/
5214
5215 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5216 {
5217     flag aSign, bSign;
5218
5219     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5220               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5221          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5222               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5223        ) {
5224         if (    float128_is_signaling_nan( a )
5225              || float128_is_signaling_nan( b ) ) {
5226             float_raise( float_flag_invalid STATUS_VAR);
5227         }
5228         return 0;
5229     }
5230     aSign = extractFloat128Sign( a );
5231     bSign = extractFloat128Sign( b );
5232     if ( aSign != bSign ) {
5233         return
5234                aSign
5235             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5236                  == 0 );
5237     }
5238     return
5239           aSign ? le128( b.high, b.low, a.high, a.low )
5240         : le128( a.high, a.low, b.high, b.low );
5241
5242 }
5243
5244 /*----------------------------------------------------------------------------
5245 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5246 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5247 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5248 | Standard for Binary Floating-Point Arithmetic.
5249 *----------------------------------------------------------------------------*/
5250
5251 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5252 {
5253     flag aSign, bSign;
5254
5255     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5256               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5257          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5258               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5259        ) {
5260         if (    float128_is_signaling_nan( a )
5261              || float128_is_signaling_nan( b ) ) {
5262             float_raise( float_flag_invalid STATUS_VAR);
5263         }
5264         return 0;
5265     }
5266     aSign = extractFloat128Sign( a );
5267     bSign = extractFloat128Sign( b );
5268     if ( aSign != bSign ) {
5269         return
5270                aSign
5271             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5272                  != 0 );
5273     }
5274     return
5275           aSign ? lt128( b.high, b.low, a.high, a.low )
5276         : lt128( a.high, a.low, b.high, b.low );
5277
5278 }
5279
5280 #endif
5281
5282 /* misc functions */
5283 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5284 {
5285     return int64_to_float32(a STATUS_VAR);
5286 }
5287
5288 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5289 {
5290     return int64_to_float64(a STATUS_VAR);
5291 }
5292
5293 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5294 {
5295     int64_t v;
5296     unsigned int res;
5297
5298     v = float32_to_int64(a STATUS_VAR);
5299     if (v < 0) {
5300         res = 0;
5301         float_raise( float_flag_invalid STATUS_VAR);
5302     } else if (v > 0xffffffff) {
5303         res = 0xffffffff;
5304         float_raise( float_flag_invalid STATUS_VAR);
5305     } else {
5306         res = v;
5307     }
5308     return res;
5309 }
5310
5311 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5312 {
5313     int64_t v;
5314     unsigned int res;
5315
5316     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5317     if (v < 0) {
5318         res = 0;
5319         float_raise( float_flag_invalid STATUS_VAR);
5320     } else if (v > 0xffffffff) {
5321         res = 0xffffffff;
5322         float_raise( float_flag_invalid STATUS_VAR);
5323     } else {
5324         res = v;
5325     }
5326     return res;
5327 }
5328
5329 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5330 {
5331     int64_t v;
5332     unsigned int res;
5333
5334     v = float64_to_int64(a STATUS_VAR);
5335     if (v < 0) {
5336         res = 0;
5337         float_raise( float_flag_invalid STATUS_VAR);
5338     } else if (v > 0xffffffff) {
5339         res = 0xffffffff;
5340         float_raise( float_flag_invalid STATUS_VAR);
5341     } else {
5342         res = v;
5343     }
5344     return res;
5345 }
5346
5347 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5348 {
5349     int64_t v;
5350     unsigned int res;
5351
5352     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5353     if (v < 0) {
5354         res = 0;
5355         float_raise( float_flag_invalid STATUS_VAR);
5356     } else if (v > 0xffffffff) {
5357         res = 0xffffffff;
5358         float_raise( float_flag_invalid STATUS_VAR);
5359     } else {
5360         res = v;
5361     }
5362     return res;
5363 }
5364
5365 /* FIXME: This looks broken.  */
5366 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5367 {
5368     int64_t v;
5369
5370     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5371     v += float64_val(a);
5372     v = float64_to_int64(make_float64(v) STATUS_VAR);
5373
5374     return v - INT64_MIN;
5375 }
5376
5377 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5378 {
5379     int64_t v;
5380
5381     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5382     v += float64_val(a);
5383     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5384
5385     return v - INT64_MIN;
5386 }
5387
5388 #define COMPARE(s, nan_exp)                                                  \
5389 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5390                                       int is_quiet STATUS_PARAM )            \
5391 {                                                                            \
5392     flag aSign, bSign;                                                       \
5393     bits ## s av, bv;                                                        \
5394                                                                              \
5395     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5396          extractFloat ## s ## Frac( a ) ) ||                                 \
5397         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5398           extractFloat ## s ## Frac( b ) )) {                                \
5399         if (!is_quiet ||                                                     \
5400             float ## s ## _is_signaling_nan( a ) ||                          \
5401             float ## s ## _is_signaling_nan( b ) ) {                         \
5402             float_raise( float_flag_invalid STATUS_VAR);                     \
5403         }                                                                    \
5404         return float_relation_unordered;                                     \
5405     }                                                                        \
5406     aSign = extractFloat ## s ## Sign( a );                                  \
5407     bSign = extractFloat ## s ## Sign( b );                                  \
5408     av = float ## s ## _val(a);                                              \
5409     bv = float ## s ## _val(b);                                              \
5410     if ( aSign != bSign ) {                                                  \
5411         if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5412             /* zero case */                                                  \
5413             return float_relation_equal;                                     \
5414         } else {                                                             \
5415             return 1 - (2 * aSign);                                          \
5416         }                                                                    \
5417     } else {                                                                 \
5418         if (av == bv) {                                                      \
5419             return float_relation_equal;                                     \
5420         } else {                                                             \
5421             return 1 - 2 * (aSign ^ ( av < bv ));                            \
5422         }                                                                    \
5423     }                                                                        \
5424 }                                                                            \
5425                                                                              \
5426 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5427 {                                                                            \
5428     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5429 }                                                                            \
5430                                                                              \
5431 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5432 {                                                                            \
5433     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5434 }
5435
5436 COMPARE(32, 0xff)
5437 COMPARE(64, 0x7ff)
5438
5439 INLINE int float128_compare_internal( float128 a, float128 b,
5440                                       int is_quiet STATUS_PARAM )
5441 {
5442     flag aSign, bSign;
5443
5444     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5445           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5446         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5447           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5448         if (!is_quiet ||
5449             float128_is_signaling_nan( a ) ||
5450             float128_is_signaling_nan( b ) ) {
5451             float_raise( float_flag_invalid STATUS_VAR);
5452         }
5453         return float_relation_unordered;
5454     }
5455     aSign = extractFloat128Sign( a );
5456     bSign = extractFloat128Sign( b );
5457     if ( aSign != bSign ) {
5458         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5459             /* zero case */
5460             return float_relation_equal;
5461         } else {
5462             return 1 - (2 * aSign);
5463         }
5464     } else {
5465         if (a.low == b.low && a.high == b.high) {
5466             return float_relation_equal;
5467         } else {
5468             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5469         }
5470     }
5471 }
5472
5473 int float128_compare( float128 a, float128 b STATUS_PARAM )
5474 {
5475     return float128_compare_internal(a, b, 0 STATUS_VAR);
5476 }
5477
5478 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5479 {
5480     return float128_compare_internal(a, b, 1 STATUS_VAR);
5481 }
5482
5483 /* Multiply A by 2 raised to the power N.  */
5484 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5485 {
5486     flag aSign;
5487     int16 aExp;
5488     bits32 aSig;
5489
5490     aSig = extractFloat32Frac( a );
5491     aExp = extractFloat32Exp( a );
5492     aSign = extractFloat32Sign( a );
5493
5494     if ( aExp == 0xFF ) {
5495         return a;
5496     }
5497     if ( aExp != 0 )
5498         aSig |= 0x00800000;
5499     else if ( aSig == 0 )
5500         return a;
5501
5502     aExp += n - 1;
5503     aSig <<= 7;
5504     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5505 }
5506
5507 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5508 {
5509     flag aSign;
5510     int16 aExp;
5511     bits64 aSig;
5512
5513     aSig = extractFloat64Frac( a );
5514     aExp = extractFloat64Exp( a );
5515     aSign = extractFloat64Sign( a );
5516
5517     if ( aExp == 0x7FF ) {
5518         return a;
5519     }
5520     if ( aExp != 0 )
5521         aSig |= LIT64( 0x0010000000000000 );
5522     else if ( aSig == 0 )
5523         return a;
5524
5525     aExp += n - 1;
5526     aSig <<= 10;
5527     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5528 }
5529
5530 #ifdef FLOATX80
5531 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5532 {
5533     flag aSign;
5534     int16 aExp;
5535     bits64 aSig;
5536
5537     aSig = extractFloatx80Frac( a );
5538     aExp = extractFloatx80Exp( a );
5539     aSign = extractFloatx80Sign( a );
5540
5541     if ( aExp == 0x7FF ) {
5542         return a;
5543     }
5544     if (aExp == 0 && aSig == 0)
5545         return a;
5546
5547     aExp += n;
5548     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5549                                           aSign, aExp, aSig, 0 STATUS_VAR );
5550 }
5551 #endif
5552
5553 #ifdef FLOAT128
5554 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5555 {
5556     flag aSign;
5557     int32 aExp;
5558     bits64 aSig0, aSig1;
5559
5560     aSig1 = extractFloat128Frac1( a );
5561     aSig0 = extractFloat128Frac0( a );
5562     aExp = extractFloat128Exp( a );
5563     aSign = extractFloat128Sign( a );
5564     if ( aExp == 0x7FFF ) {
5565         return a;
5566     }
5567     if ( aExp != 0 )
5568         aSig0 |= LIT64( 0x0001000000000000 );
5569     else if ( aSig0 == 0 && aSig1 == 0 )
5570         return a;
5571
5572     aExp += n - 1;
5573     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
5574                                           STATUS_VAR );
5575
5576 }
5577 #endif