fpu/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 /* FIXME: Flush-To-Zero only effects results.  Denormal inputs should also
  34    be flushed to zero.  */
  35 #include "softfloat.h"
  36
  37 /*----------------------------------------------------------------------------
  38 | Primitive arithmetic functions, including multi-word arithmetic, and
  39 | division and square root approximations.  (Can be specialized to target if
  40 | desired.)
  41 *----------------------------------------------------------------------------*/
  42 #include "softfloat-macros.h"
  43
  44 /*----------------------------------------------------------------------------
  45 | Functions and definitions to determine:  (1) whether tininess for underflow
  46 | is detected before or after rounding by default, (2) what (if anything)
  47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  49 | are propagated from function inputs to output.  These details are target-
  50 | specific.
  51 *----------------------------------------------------------------------------*/
  52 #include "softfloat-specialize.h"
  53
  54 void set_float_rounding_mode(int val STATUS_PARAM)
  55 {
  56     STATUS(float_rounding_mode) = val;
  57 }
  58
  59 void set_float_exception_flags(int val STATUS_PARAM)
  60 {
  61     STATUS(float_exception_flags) = val;
  62 }
  63
  64 #ifdef FLOATX80
  65 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  66 {
  67     STATUS(floatx80_rounding_precision) = val;
  68 }
  69 #endif
  70
  71 /*----------------------------------------------------------------------------
  72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  73 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  74 | input.  If `zSign' is 1, the input is negated before being converted to an
  75 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  76 | is simply rounded to an integer, with the inexact exception raised if the
  77 | input cannot be represented exactly as an integer.  However, if the fixed-
  78 | point input is too large, the invalid exception is raised and the largest
  79 | positive or negative integer is returned.
  80 *----------------------------------------------------------------------------*/
  81
  82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
  83 {
  84     int8 roundingMode;
  85     flag roundNearestEven;
  86     int8 roundIncrement, roundBits;
  87     int32 z;
  88
  89     roundingMode = STATUS(float_rounding_mode);
  90     roundNearestEven = ( roundingMode == float_round_nearest_even );
  91     roundIncrement = 0x40;
  92     if ( ! roundNearestEven ) {
  93         if ( roundingMode == float_round_to_zero ) {
  94             roundIncrement = 0;
  95         }
  96         else {
  97             roundIncrement = 0x7F;
  98             if ( zSign ) {
  99                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 100             }
 101             else {
 102                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 103             }
 104         }
 105     }
 106     roundBits = absZ & 0x7F;
 107     absZ = ( absZ + roundIncrement )>>7;
 108     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 109     z = absZ;
 110     if ( zSign ) z = - z;
 111     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 112         float_raise( float_flag_invalid STATUS_VAR);
 113         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 114     }
 115     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 116     return z;
 117
 118 }
 119
 120 /*----------------------------------------------------------------------------
 121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 122 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 123 | and returns the properly rounded 64-bit integer corresponding to the input.
 124 | If `zSign' is 1, the input is negated before being converted to an integer.
 125 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 126 | the inexact exception raised if the input cannot be represented exactly as
 127 | an integer.  However, if the fixed-point input is too large, the invalid
 128 | exception is raised and the largest positive or negative integer is
 129 | returned.
 130 *----------------------------------------------------------------------------*/
 131
 132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 133 {
 134     int8 roundingMode;
 135     flag roundNearestEven, increment;
 136     int64 z;
 137
 138     roundingMode = STATUS(float_rounding_mode);
 139     roundNearestEven = ( roundingMode == float_round_nearest_even );
 140     increment = ( (sbits64) absZ1 < 0 );
 141     if ( ! roundNearestEven ) {
 142         if ( roundingMode == float_round_to_zero ) {
 143             increment = 0;
 144         }
 145         else {
 146             if ( zSign ) {
 147                 increment = ( roundingMode == float_round_down ) && absZ1;
 148             }
 149             else {
 150                 increment = ( roundingMode == float_round_up ) && absZ1;
 151             }
 152         }
 153     }
 154     if ( increment ) {
 155         ++absZ0;
 156         if ( absZ0 == 0 ) goto overflow;
 157         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 158     }
 159     z = absZ0;
 160     if ( zSign ) z = - z;
 161     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 162  overflow:
 163         float_raise( float_flag_invalid STATUS_VAR);
 164         return
 165               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 166             : LIT64( 0x7FFFFFFFFFFFFFFF );
 167     }
 168     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 169     return z;
 170
 171 }
 172
 173 /*----------------------------------------------------------------------------
 174 | Returns the fraction bits of the single-precision floating-point value `a'.
 175 *----------------------------------------------------------------------------*/
 176
 177 INLINE bits32 extractFloat32Frac( float32 a )
 178 {
 179
 180     return float32_val(a) & 0x007FFFFF;
 181
 182 }
 183
 184 /*----------------------------------------------------------------------------
 185 | Returns the exponent bits of the single-precision floating-point value `a'.
 186 *----------------------------------------------------------------------------*/
 187
 188 INLINE int16 extractFloat32Exp( float32 a )
 189 {
 190
 191     return ( float32_val(a)>>23 ) & 0xFF;
 192
 193 }
 194
 195 /*----------------------------------------------------------------------------
 196 | Returns the sign bit of the single-precision floating-point value `a'.
 197 *----------------------------------------------------------------------------*/
 198
 199 INLINE flag extractFloat32Sign( float32 a )
 200 {
 201
 202     return float32_val(a)>>31;
 203
 204 }
 205
 206 /*----------------------------------------------------------------------------
 207 | Normalizes the subnormal single-precision floating-point value represented
 208 | by the denormalized significand `aSig'.  The normalized exponent and
 209 | significand are stored at the locations pointed to by `zExpPtr' and
 210 | `zSigPtr', respectively.
 211 *----------------------------------------------------------------------------*/
 212
 213 static void
 214  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 215 {
 216     int8 shiftCount;
 217
 218     shiftCount = countLeadingZeros32( aSig ) - 8;
 219     *zSigPtr = aSig<<shiftCount;
 220     *zExpPtr = 1 - shiftCount;
 221
 222 }
 223
 224 /*----------------------------------------------------------------------------
 225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 226 | single-precision floating-point value, returning the result.  After being
 227 | shifted into the proper positions, the three fields are simply added
 228 | together to form the result.  This means that any integer portion of `zSig'
 229 | will be added into the exponent.  Since a properly normalized significand
 230 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 231 | than the desired result exponent whenever `zSig' is a complete, normalized
 232 | significand.
 233 *----------------------------------------------------------------------------*/
 234
 235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 236 {
 237
 238     return make_float32(
 239           ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
 240
 241 }
 242
 243 /*----------------------------------------------------------------------------
 244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 245 | and significand `zSig', and returns the proper single-precision floating-
 246 | point value corresponding to the abstract input.  Ordinarily, the abstract
 247 | value is simply rounded and packed into the single-precision format, with
 248 | the inexact exception raised if the abstract input cannot be represented
 249 | exactly.  However, if the abstract value is too large, the overflow and
 250 | inexact exceptions are raised and an infinity or maximal finite value is
 251 | returned.  If the abstract value is too small, the input value is rounded to
 252 | a subnormal number, and the underflow and inexact exceptions are raised if
 253 | the abstract input cannot be represented exactly as a subnormal single-
 254 | precision floating-point number.
 255 |     The input significand `zSig' has its binary point between bits 30
 256 | and 29, which is 7 bits to the left of the usual location.  This shifted
 257 | significand must be normalized or smaller.  If `zSig' is not normalized,
 258 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 259 | and it must not require rounding.  In the usual case that `zSig' is
 260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 261 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 262 | Binary Floating-Point Arithmetic.
 263 *----------------------------------------------------------------------------*/
 264
 265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 266 {
 267     int8 roundingMode;
 268     flag roundNearestEven;
 269     int8 roundIncrement, roundBits;
 270     flag isTiny;
 271
 272     roundingMode = STATUS(float_rounding_mode);
 273     roundNearestEven = ( roundingMode == float_round_nearest_even );
 274     roundIncrement = 0x40;
 275     if ( ! roundNearestEven ) {
 276         if ( roundingMode == float_round_to_zero ) {
 277             roundIncrement = 0;
 278         }
 279         else {
 280             roundIncrement = 0x7F;
 281             if ( zSign ) {
 282                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 283             }
 284             else {
 285                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 286             }
 287         }
 288     }
 289     roundBits = zSig & 0x7F;
 290     if ( 0xFD <= (bits16) zExp ) {
 291         if (    ( 0xFD < zExp )
 292              || (    ( zExp == 0xFD )
 293                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 294            ) {
 295             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 296             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 297         }
 298         if ( zExp < 0 ) {
 299             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 300             isTiny =
 301                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 302                 || ( zExp < -1 )
 303                 || ( zSig + roundIncrement < 0x80000000 );
 304             shift32RightJamming( zSig, - zExp, &zSig );
 305             zExp = 0;
 306             roundBits = zSig & 0x7F;
 307             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 308         }
 309     }
 310     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 311     zSig = ( zSig + roundIncrement )>>7;
 312     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 313     if ( zSig == 0 ) zExp = 0;
 314     return packFloat32( zSign, zExp, zSig );
 315
 316 }
 317
 318 /*----------------------------------------------------------------------------
 319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 320 | and significand `zSig', and returns the proper single-precision floating-
 321 | point value corresponding to the abstract input.  This routine is just like
 322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 324 | floating-point exponent.
 325 *----------------------------------------------------------------------------*/
 326
 327 static float32
 328  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 329 {
 330     int8 shiftCount;
 331
 332     shiftCount = countLeadingZeros32( zSig ) - 1;
 333     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 334
 335 }
 336
 337 /*----------------------------------------------------------------------------
 338 | Returns the fraction bits of the double-precision floating-point value `a'.
 339 *----------------------------------------------------------------------------*/
 340
 341 INLINE bits64 extractFloat64Frac( float64 a )
 342 {
 343
 344     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 345
 346 }
 347
 348 /*----------------------------------------------------------------------------
 349 | Returns the exponent bits of the double-precision floating-point value `a'.
 350 *----------------------------------------------------------------------------*/
 351
 352 INLINE int16 extractFloat64Exp( float64 a )
 353 {
 354
 355     return ( float64_val(a)>>52 ) & 0x7FF;
 356
 357 }
 358
 359 /*----------------------------------------------------------------------------
 360 | Returns the sign bit of the double-precision floating-point value `a'.
 361 *----------------------------------------------------------------------------*/
 362
 363 INLINE flag extractFloat64Sign( float64 a )
 364 {
 365
 366     return float64_val(a)>>63;
 367
 368 }
 369
 370 /*----------------------------------------------------------------------------
 371 | Normalizes the subnormal double-precision floating-point value represented
 372 | by the denormalized significand `aSig'.  The normalized exponent and
 373 | significand are stored at the locations pointed to by `zExpPtr' and
 374 | `zSigPtr', respectively.
 375 *----------------------------------------------------------------------------*/
 376
 377 static void
 378  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 379 {
 380     int8 shiftCount;
 381
 382     shiftCount = countLeadingZeros64( aSig ) - 11;
 383     *zSigPtr = aSig<<shiftCount;
 384     *zExpPtr = 1 - shiftCount;
 385
 386 }
 387
 388 /*----------------------------------------------------------------------------
 389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 390 | double-precision floating-point value, returning the result.  After being
 391 | shifted into the proper positions, the three fields are simply added
 392 | together to form the result.  This means that any integer portion of `zSig'
 393 | will be added into the exponent.  Since a properly normalized significand
 394 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 395 | than the desired result exponent whenever `zSig' is a complete, normalized
 396 | significand.
 397 *----------------------------------------------------------------------------*/
 398
 399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 400 {
 401
 402     return make_float64(
 403         ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
 404
 405 }
 406
 407 /*----------------------------------------------------------------------------
 408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 409 | and significand `zSig', and returns the proper double-precision floating-
 410 | point value corresponding to the abstract input.  Ordinarily, the abstract
 411 | value is simply rounded and packed into the double-precision format, with
 412 | the inexact exception raised if the abstract input cannot be represented
 413 | exactly.  However, if the abstract value is too large, the overflow and
 414 | inexact exceptions are raised and an infinity or maximal finite value is
 415 | returned.  If the abstract value is too small, the input value is rounded
 416 | to a subnormal number, and the underflow and inexact exceptions are raised
 417 | if the abstract input cannot be represented exactly as a subnormal double-
 418 | precision floating-point number.
 419 |     The input significand `zSig' has its binary point between bits 62
 420 | and 61, which is 10 bits to the left of the usual location.  This shifted
 421 | significand must be normalized or smaller.  If `zSig' is not normalized,
 422 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 423 | and it must not require rounding.  In the usual case that `zSig' is
 424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 425 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 426 | Binary Floating-Point Arithmetic.
 427 *----------------------------------------------------------------------------*/
 428
 429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 430 {
 431     int8 roundingMode;
 432     flag roundNearestEven;
 433     int16 roundIncrement, roundBits;
 434     flag isTiny;
 435
 436     roundingMode = STATUS(float_rounding_mode);
 437     roundNearestEven = ( roundingMode == float_round_nearest_even );
 438     roundIncrement = 0x200;
 439     if ( ! roundNearestEven ) {
 440         if ( roundingMode == float_round_to_zero ) {
 441             roundIncrement = 0;
 442         }
 443         else {
 444             roundIncrement = 0x3FF;
 445             if ( zSign ) {
 446                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 447             }
 448             else {
 449                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 450             }
 451         }
 452     }
 453     roundBits = zSig & 0x3FF;
 454     if ( 0x7FD <= (bits16) zExp ) {
 455         if (    ( 0x7FD < zExp )
 456              || (    ( zExp == 0x7FD )
 457                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 458            ) {
 459             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 460             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 461         }
 462         if ( zExp < 0 ) {
 463             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 464             isTiny =
 465                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 466                 || ( zExp < -1 )
 467                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 468             shift64RightJamming( zSig, - zExp, &zSig );
 469             zExp = 0;
 470             roundBits = zSig & 0x3FF;
 471             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 472         }
 473     }
 474     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 475     zSig = ( zSig + roundIncrement )>>10;
 476     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 477     if ( zSig == 0 ) zExp = 0;
 478     return packFloat64( zSign, zExp, zSig );
 479
 480 }
 481
 482 /*----------------------------------------------------------------------------
 483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 484 | and significand `zSig', and returns the proper double-precision floating-
 485 | point value corresponding to the abstract input.  This routine is just like
 486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 488 | floating-point exponent.
 489 *----------------------------------------------------------------------------*/
 490
 491 static float64
 492  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 493 {
 494     int8 shiftCount;
 495
 496     shiftCount = countLeadingZeros64( zSig ) - 1;
 497     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 498
 499 }
 500
 501 #ifdef FLOATX80
 502
 503 /*----------------------------------------------------------------------------
 504 | Returns the fraction bits of the extended double-precision floating-point
 505 | value `a'.
 506 *----------------------------------------------------------------------------*/
 507
 508 INLINE bits64 extractFloatx80Frac( floatx80 a )
 509 {
 510
 511     return a.low;
 512
 513 }
 514
 515 /*----------------------------------------------------------------------------
 516 | Returns the exponent bits of the extended double-precision floating-point
 517 | value `a'.
 518 *----------------------------------------------------------------------------*/
 519
 520 INLINE int32 extractFloatx80Exp( floatx80 a )
 521 {
 522
 523     return a.high & 0x7FFF;
 524
 525 }
 526
 527 /*----------------------------------------------------------------------------
 528 | Returns the sign bit of the extended double-precision floating-point value
 529 | `a'.
 530 *----------------------------------------------------------------------------*/
 531
 532 INLINE flag extractFloatx80Sign( floatx80 a )
 533 {
 534
 535     return a.high>>15;
 536
 537 }
 538
 539 /*----------------------------------------------------------------------------
 540 | Normalizes the subnormal extended double-precision floating-point value
 541 | represented by the denormalized significand `aSig'.  The normalized exponent
 542 | and significand are stored at the locations pointed to by `zExpPtr' and
 543 | `zSigPtr', respectively.
 544 *----------------------------------------------------------------------------*/
 545
 546 static void
 547  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 548 {
 549     int8 shiftCount;
 550
 551     shiftCount = countLeadingZeros64( aSig );
 552     *zSigPtr = aSig<<shiftCount;
 553     *zExpPtr = 1 - shiftCount;
 554
 555 }
 556
 557 /*----------------------------------------------------------------------------
 558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 559 | extended double-precision floating-point value, returning the result.
 560 *----------------------------------------------------------------------------*/
 561
 562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 563 {
 564     floatx80 z;
 565
 566     z.low = zSig;
 567     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 568     return z;
 569
 570 }
 571
 572 /*----------------------------------------------------------------------------
 573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 574 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 575 | and returns the proper extended double-precision floating-point value
 576 | corresponding to the abstract input.  Ordinarily, the abstract value is
 577 | rounded and packed into the extended double-precision format, with the
 578 | inexact exception raised if the abstract input cannot be represented
 579 | exactly.  However, if the abstract value is too large, the overflow and
 580 | inexact exceptions are raised and an infinity or maximal finite value is
 581 | returned.  If the abstract value is too small, the input value is rounded to
 582 | a subnormal number, and the underflow and inexact exceptions are raised if
 583 | the abstract input cannot be represented exactly as a subnormal extended
 584 | double-precision floating-point number.
 585 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 586 | number of bits as single or double precision, respectively.  Otherwise, the
 587 | result is rounded to the full precision of the extended double-precision
 588 | format.
 589 |     The input significand must be normalized or smaller.  If the input
 590 | significand is not normalized, `zExp' must be 0; in that case, the result
 591 | returned is a subnormal number, and it must not require rounding.  The
 592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 593 | Floating-Point Arithmetic.
 594 *----------------------------------------------------------------------------*/
 595
 596 static floatx80
 597  roundAndPackFloatx80(
 598      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 599  STATUS_PARAM)
 600 {
 601     int8 roundingMode;
 602     flag roundNearestEven, increment, isTiny;
 603     int64 roundIncrement, roundMask, roundBits;
 604
 605     roundingMode = STATUS(float_rounding_mode);
 606     roundNearestEven = ( roundingMode == float_round_nearest_even );
 607     if ( roundingPrecision == 80 ) goto precision80;
 608     if ( roundingPrecision == 64 ) {
 609         roundIncrement = LIT64( 0x0000000000000400 );
 610         roundMask = LIT64( 0x00000000000007FF );
 611     }
 612     else if ( roundingPrecision == 32 ) {
 613         roundIncrement = LIT64( 0x0000008000000000 );
 614         roundMask = LIT64( 0x000000FFFFFFFFFF );
 615     }
 616     else {
 617         goto precision80;
 618     }
 619     zSig0 |= ( zSig1 != 0 );
 620     if ( ! roundNearestEven ) {
 621         if ( roundingMode == float_round_to_zero ) {
 622             roundIncrement = 0;
 623         }
 624         else {
 625             roundIncrement = roundMask;
 626             if ( zSign ) {
 627                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 628             }
 629             else {
 630                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 631             }
 632         }
 633     }
 634     roundBits = zSig0 & roundMask;
 635     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 636         if (    ( 0x7FFE < zExp )
 637              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 638            ) {
 639             goto overflow;
 640         }
 641         if ( zExp <= 0 ) {
 642             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
 643             isTiny =
 644                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 645                 || ( zExp < 0 )
 646                 || ( zSig0 <= zSig0 + roundIncrement );
 647             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 648             zExp = 0;
 649             roundBits = zSig0 & roundMask;
 650             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 651             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 652             zSig0 += roundIncrement;
 653             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 654             roundIncrement = roundMask + 1;
 655             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 656                 roundMask |= roundIncrement;
 657             }
 658             zSig0 &= ~ roundMask;
 659             return packFloatx80( zSign, zExp, zSig0 );
 660         }
 661     }
 662     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 663     zSig0 += roundIncrement;
 664     if ( zSig0 < roundIncrement ) {
 665         ++zExp;
 666         zSig0 = LIT64( 0x8000000000000000 );
 667     }
 668     roundIncrement = roundMask + 1;
 669     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 670         roundMask |= roundIncrement;
 671     }
 672     zSig0 &= ~ roundMask;
 673     if ( zSig0 == 0 ) zExp = 0;
 674     return packFloatx80( zSign, zExp, zSig0 );
 675  precision80:
 676     increment = ( (sbits64) zSig1 < 0 );
 677     if ( ! roundNearestEven ) {
 678         if ( roundingMode == float_round_to_zero ) {
 679             increment = 0;
 680         }
 681         else {
 682             if ( zSign ) {
 683                 increment = ( roundingMode == float_round_down ) && zSig1;
 684             }
 685             else {
 686                 increment = ( roundingMode == float_round_up ) && zSig1;
 687             }
 688         }
 689     }
 690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 691         if (    ( 0x7FFE < zExp )
 692              || (    ( zExp == 0x7FFE )
 693                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 694                   && increment
 695                 )
 696            ) {
 697             roundMask = 0;
 698  overflow:
 699             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 700             if (    ( roundingMode == float_round_to_zero )
 701                  || ( zSign && ( roundingMode == float_round_up ) )
 702                  || ( ! zSign && ( roundingMode == float_round_down ) )
 703                ) {
 704                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 705             }
 706             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 707         }
 708         if ( zExp <= 0 ) {
 709             isTiny =
 710                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 711                 || ( zExp < 0 )
 712                 || ! increment
 713                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 714             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 715             zExp = 0;
 716             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 717             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 718             if ( roundNearestEven ) {
 719                 increment = ( (sbits64) zSig1 < 0 );
 720             }
 721             else {
 722                 if ( zSign ) {
 723                     increment = ( roundingMode == float_round_down ) && zSig1;
 724                 }
 725                 else {
 726                     increment = ( roundingMode == float_round_up ) && zSig1;
 727                 }
 728             }
 729             if ( increment ) {
 730                 ++zSig0;
 731                 zSig0 &=
 732                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 733                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 734             }
 735             return packFloatx80( zSign, zExp, zSig0 );
 736         }
 737     }
 738     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 739     if ( increment ) {
 740         ++zSig0;
 741         if ( zSig0 == 0 ) {
 742             ++zExp;
 743             zSig0 = LIT64( 0x8000000000000000 );
 744         }
 745         else {
 746             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 747         }
 748     }
 749     else {
 750         if ( zSig0 == 0 ) zExp = 0;
 751     }
 752     return packFloatx80( zSign, zExp, zSig0 );
 753
 754 }
 755
 756 /*----------------------------------------------------------------------------
 757 | Takes an abstract floating-point value having sign `zSign', exponent
 758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 759 | and returns the proper extended double-precision floating-point value
 760 | corresponding to the abstract input.  This routine is just like
 761 | `roundAndPackFloatx80' except that the input significand does not have to be
 762 | normalized.
 763 *----------------------------------------------------------------------------*/
 764
 765 static floatx80
 766  normalizeRoundAndPackFloatx80(
 767      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 768  STATUS_PARAM)
 769 {
 770     int8 shiftCount;
 771
 772     if ( zSig0 == 0 ) {
 773         zSig0 = zSig1;
 774         zSig1 = 0;
 775         zExp -= 64;
 776     }
 777     shiftCount = countLeadingZeros64( zSig0 );
 778     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 779     zExp -= shiftCount;
 780     return
 781         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 782
 783 }
 784
 785 #endif
 786
 787 #ifdef FLOAT128
 788
 789 /*----------------------------------------------------------------------------
 790 | Returns the least-significant 64 fraction bits of the quadruple-precision
 791 | floating-point value `a'.
 792 *----------------------------------------------------------------------------*/
 793
 794 INLINE bits64 extractFloat128Frac1( float128 a )
 795 {
 796
 797     return a.low;
 798
 799 }
 800
 801 /*----------------------------------------------------------------------------
 802 | Returns the most-significant 48 fraction bits of the quadruple-precision
 803 | floating-point value `a'.
 804 *----------------------------------------------------------------------------*/
 805
 806 INLINE bits64 extractFloat128Frac0( float128 a )
 807 {
 808
 809     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 810
 811 }
 812
 813 /*----------------------------------------------------------------------------
 814 | Returns the exponent bits of the quadruple-precision floating-point value
 815 | `a'.
 816 *----------------------------------------------------------------------------*/
 817
 818 INLINE int32 extractFloat128Exp( float128 a )
 819 {
 820
 821     return ( a.high>>48 ) & 0x7FFF;
 822
 823 }
 824
 825 /*----------------------------------------------------------------------------
 826 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 827 *----------------------------------------------------------------------------*/
 828
 829 INLINE flag extractFloat128Sign( float128 a )
 830 {
 831
 832     return a.high>>63;
 833
 834 }
 835
 836 /*----------------------------------------------------------------------------
 837 | Normalizes the subnormal quadruple-precision floating-point value
 838 | represented by the denormalized significand formed by the concatenation of
 839 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 840 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 841 | significand are stored at the location pointed to by `zSig0Ptr', and the
 842 | least significant 64 bits of the normalized significand are stored at the
 843 | location pointed to by `zSig1Ptr'.
 844 *----------------------------------------------------------------------------*/
 845
 846 static void
 847  normalizeFloat128Subnormal(
 848      bits64 aSig0,
 849      bits64 aSig1,
 850      int32 *zExpPtr,
 851      bits64 *zSig0Ptr,
 852      bits64 *zSig1Ptr
 853  )
 854 {
 855     int8 shiftCount;
 856
 857     if ( aSig0 == 0 ) {
 858         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 859         if ( shiftCount < 0 ) {
 860             *zSig0Ptr = aSig1>>( - shiftCount );
 861             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 862         }
 863         else {
 864             *zSig0Ptr = aSig1<<shiftCount;
 865             *zSig1Ptr = 0;
 866         }
 867         *zExpPtr = - shiftCount - 63;
 868     }
 869     else {
 870         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 871         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 872         *zExpPtr = 1 - shiftCount;
 873     }
 874
 875 }
 876
 877 /*----------------------------------------------------------------------------
 878 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 880 | floating-point value, returning the result.  After being shifted into the
 881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 882 | added together to form the most significant 32 bits of the result.  This
 883 | means that any integer portion of `zSig0' will be added into the exponent.
 884 | Since a properly normalized significand will have an integer portion equal
 885 | to 1, the `zExp' input should be 1 less than the desired result exponent
 886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 887 | significand.
 888 *----------------------------------------------------------------------------*/
 889
 890 INLINE float128
 891  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 892 {
 893     float128 z;
 894
 895     z.low = zSig1;
 896     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 897     return z;
 898
 899 }
 900
 901 /*----------------------------------------------------------------------------
 902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 903 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 904 | and `zSig2', and returns the proper quadruple-precision floating-point value
 905 | corresponding to the abstract input.  Ordinarily, the abstract value is
 906 | simply rounded and packed into the quadruple-precision format, with the
 907 | inexact exception raised if the abstract input cannot be represented
 908 | exactly.  However, if the abstract value is too large, the overflow and
 909 | inexact exceptions are raised and an infinity or maximal finite value is
 910 | returned.  If the abstract value is too small, the input value is rounded to
 911 | a subnormal number, and the underflow and inexact exceptions are raised if
 912 | the abstract input cannot be represented exactly as a subnormal quadruple-
 913 | precision floating-point number.
 914 |     The input significand must be normalized or smaller.  If the input
 915 | significand is not normalized, `zExp' must be 0; in that case, the result
 916 | returned is a subnormal number, and it must not require rounding.  In the
 917 | usual case that the input significand is normalized, `zExp' must be 1 less
 918 | than the ``true'' floating-point exponent.  The handling of underflow and
 919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 920 *----------------------------------------------------------------------------*/
 921
 922 static float128
 923  roundAndPackFloat128(
 924      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 925 {
 926     int8 roundingMode;
 927     flag roundNearestEven, increment, isTiny;
 928
 929     roundingMode = STATUS(float_rounding_mode);
 930     roundNearestEven = ( roundingMode == float_round_nearest_even );
 931     increment = ( (sbits64) zSig2 < 0 );
 932     if ( ! roundNearestEven ) {
 933         if ( roundingMode == float_round_to_zero ) {
 934             increment = 0;
 935         }
 936         else {
 937             if ( zSign ) {
 938                 increment = ( roundingMode == float_round_down ) && zSig2;
 939             }
 940             else {
 941                 increment = ( roundingMode == float_round_up ) && zSig2;
 942             }
 943         }
 944     }
 945     if ( 0x7FFD <= (bits32) zExp ) {
 946         if (    ( 0x7FFD < zExp )
 947              || (    ( zExp == 0x7FFD )
 948                   && eq128(
 949                          LIT64( 0x0001FFFFFFFFFFFF ),
 950                          LIT64( 0xFFFFFFFFFFFFFFFF ),
 951                          zSig0,
 952                          zSig1
 953                      )
 954                   && increment
 955                 )
 956            ) {
 957             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 958             if (    ( roundingMode == float_round_to_zero )
 959                  || ( zSign && ( roundingMode == float_round_up ) )
 960                  || ( ! zSign && ( roundingMode == float_round_down ) )
 961                ) {
 962                 return
 963                     packFloat128(
 964                         zSign,
 965                         0x7FFE,
 966                         LIT64( 0x0000FFFFFFFFFFFF ),
 967                         LIT64( 0xFFFFFFFFFFFFFFFF )
 968                     );
 969             }
 970             return packFloat128( zSign, 0x7FFF, 0, 0 );
 971         }
 972         if ( zExp < 0 ) {
 973             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
 974             isTiny =
 975                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 976                 || ( zExp < -1 )
 977                 || ! increment
 978                 || lt128(
 979                        zSig0,
 980                        zSig1,
 981                        LIT64( 0x0001FFFFFFFFFFFF ),
 982                        LIT64( 0xFFFFFFFFFFFFFFFF )
 983                    );
 984             shift128ExtraRightJamming(
 985                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 986             zExp = 0;
 987             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
 988             if ( roundNearestEven ) {
 989                 increment = ( (sbits64) zSig2 < 0 );
 990             }
 991             else {
 992                 if ( zSign ) {
 993                     increment = ( roundingMode == float_round_down ) && zSig2;
 994                 }
 995                 else {
 996                     increment = ( roundingMode == float_round_up ) && zSig2;
 997                 }
 998             }
 999         }
1000     }
1001     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1002     if ( increment ) {
1003         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1005     }
1006     else {
1007         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1008     }
1009     return packFloat128( zSign, zExp, zSig0, zSig1 );
1010
1011 }
1012
1013 /*----------------------------------------------------------------------------
1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1016 | returns the proper quadruple-precision floating-point value corresponding
1017 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1018 | except that the input significand has fewer bits and does not have to be
1019 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1020 | point exponent.
1021 *----------------------------------------------------------------------------*/
1022
1023 static float128
1024  normalizeRoundAndPackFloat128(
1025      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1026 {
1027     int8 shiftCount;
1028     bits64 zSig2;
1029
1030     if ( zSig0 == 0 ) {
1031         zSig0 = zSig1;
1032         zSig1 = 0;
1033         zExp -= 64;
1034     }
1035     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036     if ( 0 <= shiftCount ) {
1037         zSig2 = 0;
1038         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1039     }
1040     else {
1041         shift128ExtraRightJamming(
1042             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1043     }
1044     zExp -= shiftCount;
1045     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1046
1047 }
1048
1049 #endif
1050
1051 /*----------------------------------------------------------------------------
1052 | Returns the result of converting the 32-bit two's complement integer `a'
1053 | to the single-precision floating-point format.  The conversion is performed
1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055 *----------------------------------------------------------------------------*/
1056
1057 float32 int32_to_float32( int32 a STATUS_PARAM )
1058 {
1059     flag zSign;
1060
1061     if ( a == 0 ) return float32_zero;
1062     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063     zSign = ( a < 0 );
1064     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1065
1066 }
1067
1068 /*----------------------------------------------------------------------------
1069 | Returns the result of converting the 32-bit two's complement integer `a'
1070 | to the double-precision floating-point format.  The conversion is performed
1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072 *----------------------------------------------------------------------------*/
1073
1074 float64 int32_to_float64( int32 a STATUS_PARAM )
1075 {
1076     flag zSign;
1077     uint32 absA;
1078     int8 shiftCount;
1079     bits64 zSig;
1080
1081     if ( a == 0 ) return float64_zero;
1082     zSign = ( a < 0 );
1083     absA = zSign ? - a : a;
1084     shiftCount = countLeadingZeros32( absA ) + 21;
1085     zSig = absA;
1086     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1087
1088 }
1089
1090 #ifdef FLOATX80
1091
1092 /*----------------------------------------------------------------------------
1093 | Returns the result of converting the 32-bit two's complement integer `a'
1094 | to the extended double-precision floating-point format.  The conversion
1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096 | Arithmetic.
1097 *----------------------------------------------------------------------------*/
1098
1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1100 {
1101     flag zSign;
1102     uint32 absA;
1103     int8 shiftCount;
1104     bits64 zSig;
1105
1106     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107     zSign = ( a < 0 );
1108     absA = zSign ? - a : a;
1109     shiftCount = countLeadingZeros32( absA ) + 32;
1110     zSig = absA;
1111     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1112
1113 }
1114
1115 #endif
1116
1117 #ifdef FLOAT128
1118
1119 /*----------------------------------------------------------------------------
1120 | Returns the result of converting the 32-bit two's complement integer `a' to
1121 | the quadruple-precision floating-point format.  The conversion is performed
1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123 *----------------------------------------------------------------------------*/
1124
1125 float128 int32_to_float128( int32 a STATUS_PARAM )
1126 {
1127     flag zSign;
1128     uint32 absA;
1129     int8 shiftCount;
1130     bits64 zSig0;
1131
1132     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133     zSign = ( a < 0 );
1134     absA = zSign ? - a : a;
1135     shiftCount = countLeadingZeros32( absA ) + 17;
1136     zSig0 = absA;
1137     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1138
1139 }
1140
1141 #endif
1142
1143 /*----------------------------------------------------------------------------
1144 | Returns the result of converting the 64-bit two's complement integer `a'
1145 | to the single-precision floating-point format.  The conversion is performed
1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147 *----------------------------------------------------------------------------*/
1148
1149 float32 int64_to_float32( int64 a STATUS_PARAM )
1150 {
1151     flag zSign;
1152     uint64 absA;
1153     int8 shiftCount;
1154
1155     if ( a == 0 ) return float32_zero;
1156     zSign = ( a < 0 );
1157     absA = zSign ? - a : a;
1158     shiftCount = countLeadingZeros64( absA ) - 40;
1159     if ( 0 <= shiftCount ) {
1160         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1161     }
1162     else {
1163         shiftCount += 7;
1164         if ( shiftCount < 0 ) {
1165             shift64RightJamming( absA, - shiftCount, &absA );
1166         }
1167         else {
1168             absA <<= shiftCount;
1169         }
1170         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1171     }
1172
1173 }
1174
1175 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1176 {
1177     int8 shiftCount;
1178
1179     if ( a == 0 ) return float32_zero;
1180     shiftCount = countLeadingZeros64( a ) - 40;
1181     if ( 0 <= shiftCount ) {
1182         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1183     }
1184     else {
1185         shiftCount += 7;
1186         if ( shiftCount < 0 ) {
1187             shift64RightJamming( a, - shiftCount, &a );
1188         }
1189         else {
1190             a <<= shiftCount;
1191         }
1192         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1193     }
1194 }
1195
1196 /*----------------------------------------------------------------------------
1197 | Returns the result of converting the 64-bit two's complement integer `a'
1198 | to the double-precision floating-point format.  The conversion is performed
1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200 *----------------------------------------------------------------------------*/
1201
1202 float64 int64_to_float64( int64 a STATUS_PARAM )
1203 {
1204     flag zSign;
1205
1206     if ( a == 0 ) return float64_zero;
1207     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208         return packFloat64( 1, 0x43E, 0 );
1209     }
1210     zSign = ( a < 0 );
1211     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1212
1213 }
1214
1215 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1216 {
1217     if ( a == 0 ) return float64_zero;
1218     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1219
1220 }
1221
1222 #ifdef FLOATX80
1223
1224 /*----------------------------------------------------------------------------
1225 | Returns the result of converting the 64-bit two's complement integer `a'
1226 | to the extended double-precision floating-point format.  The conversion
1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228 | Arithmetic.
1229 *----------------------------------------------------------------------------*/
1230
1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1232 {
1233     flag zSign;
1234     uint64 absA;
1235     int8 shiftCount;
1236
1237     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238     zSign = ( a < 0 );
1239     absA = zSign ? - a : a;
1240     shiftCount = countLeadingZeros64( absA );
1241     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1242
1243 }
1244
1245 #endif
1246
1247 #ifdef FLOAT128
1248
1249 /*----------------------------------------------------------------------------
1250 | Returns the result of converting the 64-bit two's complement integer `a' to
1251 | the quadruple-precision floating-point format.  The conversion is performed
1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253 *----------------------------------------------------------------------------*/
1254
1255 float128 int64_to_float128( int64 a STATUS_PARAM )
1256 {
1257     flag zSign;
1258     uint64 absA;
1259     int8 shiftCount;
1260     int32 zExp;
1261     bits64 zSig0, zSig1;
1262
1263     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264     zSign = ( a < 0 );
1265     absA = zSign ? - a : a;
1266     shiftCount = countLeadingZeros64( absA ) + 49;
1267     zExp = 0x406E - shiftCount;
1268     if ( 64 <= shiftCount ) {
1269         zSig1 = 0;
1270         zSig0 = absA;
1271         shiftCount -= 64;
1272     }
1273     else {
1274         zSig1 = absA;
1275         zSig0 = 0;
1276     }
1277     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278     return packFloat128( zSign, zExp, zSig0, zSig1 );
1279
1280 }
1281
1282 #endif
1283
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the single-precision floating-point value
1286 | `a' to the 32-bit two's complement integer format.  The conversion is
1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic---which means in particular that the conversion is rounded
1289 | according to the current rounding mode.  If `a' is a NaN, the largest
1290 | positive integer is returned.  Otherwise, if the conversion overflows, the
1291 | largest integer with the same sign as `a' is returned.
1292 *----------------------------------------------------------------------------*/
1293
1294 int32 float32_to_int32( float32 a STATUS_PARAM )
1295 {
1296     flag aSign;
1297     int16 aExp, shiftCount;
1298     bits32 aSig;
1299     bits64 aSig64;
1300
1301     aSig = extractFloat32Frac( a );
1302     aExp = extractFloat32Exp( a );
1303     aSign = extractFloat32Sign( a );
1304     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305     if ( aExp ) aSig |= 0x00800000;
1306     shiftCount = 0xAF - aExp;
1307     aSig64 = aSig;
1308     aSig64 <<= 32;
1309     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1311
1312 }
1313
1314 /*----------------------------------------------------------------------------
1315 | Returns the result of converting the single-precision floating-point value
1316 | `a' to the 32-bit two's complement integer format.  The conversion is
1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 | Arithmetic, except that the conversion is always rounded toward zero.
1319 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1320 | the conversion overflows, the largest integer with the same sign as `a' is
1321 | returned.
1322 *----------------------------------------------------------------------------*/
1323
1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1325 {
1326     flag aSign;
1327     int16 aExp, shiftCount;
1328     bits32 aSig;
1329     int32 z;
1330
1331     aSig = extractFloat32Frac( a );
1332     aExp = extractFloat32Exp( a );
1333     aSign = extractFloat32Sign( a );
1334     shiftCount = aExp - 0x9E;
1335     if ( 0 <= shiftCount ) {
1336         if ( float32_val(a) != 0xCF000000 ) {
1337             float_raise( float_flag_invalid STATUS_VAR);
1338             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1339         }
1340         return (sbits32) 0x80000000;
1341     }
1342     else if ( aExp <= 0x7E ) {
1343         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1344         return 0;
1345     }
1346     aSig = ( aSig | 0x00800000 )<<8;
1347     z = aSig>>( - shiftCount );
1348     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349         STATUS(float_exception_flags) |= float_flag_inexact;
1350     }
1351     if ( aSign ) z = - z;
1352     return z;
1353
1354 }
1355
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 64-bit two's complement integer format.  The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic---which means in particular that the conversion is rounded
1361 | according to the current rounding mode.  If `a' is a NaN, the largest
1362 | positive integer is returned.  Otherwise, if the conversion overflows, the
1363 | largest integer with the same sign as `a' is returned.
1364 *----------------------------------------------------------------------------*/
1365
1366 int64 float32_to_int64( float32 a STATUS_PARAM )
1367 {
1368     flag aSign;
1369     int16 aExp, shiftCount;
1370     bits32 aSig;
1371     bits64 aSig64, aSigExtra;
1372
1373     aSig = extractFloat32Frac( a );
1374     aExp = extractFloat32Exp( a );
1375     aSign = extractFloat32Sign( a );
1376     shiftCount = 0xBE - aExp;
1377     if ( shiftCount < 0 ) {
1378         float_raise( float_flag_invalid STATUS_VAR);
1379         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1380             return LIT64( 0x7FFFFFFFFFFFFFFF );
1381         }
1382         return (sbits64) LIT64( 0x8000000000000000 );
1383     }
1384     if ( aExp ) aSig |= 0x00800000;
1385     aSig64 = aSig;
1386     aSig64 <<= 40;
1387     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1388     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1389
1390 }
1391
1392 /*----------------------------------------------------------------------------
1393 | Returns the result of converting the single-precision floating-point value
1394 | `a' to the 64-bit two's complement integer format.  The conversion is
1395 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1396 | Arithmetic, except that the conversion is always rounded toward zero.  If
1397 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1398 | conversion overflows, the largest integer with the same sign as `a' is
1399 | returned.
1400 *----------------------------------------------------------------------------*/
1401
1402 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1403 {
1404     flag aSign;
1405     int16 aExp, shiftCount;
1406     bits32 aSig;
1407     bits64 aSig64;
1408     int64 z;
1409
1410     aSig = extractFloat32Frac( a );
1411     aExp = extractFloat32Exp( a );
1412     aSign = extractFloat32Sign( a );
1413     shiftCount = aExp - 0xBE;
1414     if ( 0 <= shiftCount ) {
1415         if ( float32_val(a) != 0xDF000000 ) {
1416             float_raise( float_flag_invalid STATUS_VAR);
1417             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1418                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1419             }
1420         }
1421         return (sbits64) LIT64( 0x8000000000000000 );
1422     }
1423     else if ( aExp <= 0x7E ) {
1424         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1425         return 0;
1426     }
1427     aSig64 = aSig | 0x00800000;
1428     aSig64 <<= 40;
1429     z = aSig64>>( - shiftCount );
1430     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1431         STATUS(float_exception_flags) |= float_flag_inexact;
1432     }
1433     if ( aSign ) z = - z;
1434     return z;
1435
1436 }
1437
1438 /*----------------------------------------------------------------------------
1439 | Returns the result of converting the single-precision floating-point value
1440 | `a' to the double-precision floating-point format.  The conversion is
1441 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1442 | Arithmetic.
1443 *----------------------------------------------------------------------------*/
1444
1445 float64 float32_to_float64( float32 a STATUS_PARAM )
1446 {
1447     flag aSign;
1448     int16 aExp;
1449     bits32 aSig;
1450
1451     aSig = extractFloat32Frac( a );
1452     aExp = extractFloat32Exp( a );
1453     aSign = extractFloat32Sign( a );
1454     if ( aExp == 0xFF ) {
1455         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1456         return packFloat64( aSign, 0x7FF, 0 );
1457     }
1458     if ( aExp == 0 ) {
1459         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1460         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1461         --aExp;
1462     }
1463     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1464
1465 }
1466
1467 #ifdef FLOATX80
1468
1469 /*----------------------------------------------------------------------------
1470 | Returns the result of converting the single-precision floating-point value
1471 | `a' to the extended double-precision floating-point format.  The conversion
1472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1473 | Arithmetic.
1474 *----------------------------------------------------------------------------*/
1475
1476 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1477 {
1478     flag aSign;
1479     int16 aExp;
1480     bits32 aSig;
1481
1482     aSig = extractFloat32Frac( a );
1483     aExp = extractFloat32Exp( a );
1484     aSign = extractFloat32Sign( a );
1485     if ( aExp == 0xFF ) {
1486         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1487         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1488     }
1489     if ( aExp == 0 ) {
1490         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1491         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1492     }
1493     aSig |= 0x00800000;
1494     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1495
1496 }
1497
1498 #endif
1499
1500 #ifdef FLOAT128
1501
1502 /*----------------------------------------------------------------------------
1503 | Returns the result of converting the single-precision floating-point value
1504 | `a' to the double-precision floating-point format.  The conversion is
1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1506 | Arithmetic.
1507 *----------------------------------------------------------------------------*/
1508
1509 float128 float32_to_float128( float32 a STATUS_PARAM )
1510 {
1511     flag aSign;
1512     int16 aExp;
1513     bits32 aSig;
1514
1515     aSig = extractFloat32Frac( a );
1516     aExp = extractFloat32Exp( a );
1517     aSign = extractFloat32Sign( a );
1518     if ( aExp == 0xFF ) {
1519         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1520         return packFloat128( aSign, 0x7FFF, 0, 0 );
1521     }
1522     if ( aExp == 0 ) {
1523         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1524         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525         --aExp;
1526     }
1527     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1528
1529 }
1530
1531 #endif
1532
1533 /*----------------------------------------------------------------------------
1534 | Rounds the single-precision floating-point value `a' to an integer, and
1535 | returns the result as a single-precision floating-point value.  The
1536 | operation is performed according to the IEC/IEEE Standard for Binary
1537 | Floating-Point Arithmetic.
1538 *----------------------------------------------------------------------------*/
1539
1540 float32 float32_round_to_int( float32 a STATUS_PARAM)
1541 {
1542     flag aSign;
1543     int16 aExp;
1544     bits32 lastBitMask, roundBitsMask;
1545     int8 roundingMode;
1546     bits32 z;
1547
1548     aExp = extractFloat32Exp( a );
1549     if ( 0x96 <= aExp ) {
1550         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1551             return propagateFloat32NaN( a, a STATUS_VAR );
1552         }
1553         return a;
1554     }
1555     if ( aExp <= 0x7E ) {
1556         if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1557         STATUS(float_exception_flags) |= float_flag_inexact;
1558         aSign = extractFloat32Sign( a );
1559         switch ( STATUS(float_rounding_mode) ) {
1560          case float_round_nearest_even:
1561             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1562                 return packFloat32( aSign, 0x7F, 0 );
1563             }
1564             break;
1565          case float_round_down:
1566             return make_float32(aSign ? 0xBF800000 : 0);
1567          case float_round_up:
1568             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1569         }
1570         return packFloat32( aSign, 0, 0 );
1571     }
1572     lastBitMask = 1;
1573     lastBitMask <<= 0x96 - aExp;
1574     roundBitsMask = lastBitMask - 1;
1575     z = float32_val(a);
1576     roundingMode = STATUS(float_rounding_mode);
1577     if ( roundingMode == float_round_nearest_even ) {
1578         z += lastBitMask>>1;
1579         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1580     }
1581     else if ( roundingMode != float_round_to_zero ) {
1582         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1583             z += roundBitsMask;
1584         }
1585     }
1586     z &= ~ roundBitsMask;
1587     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1588     return make_float32(z);
1589
1590 }
1591
1592 /*----------------------------------------------------------------------------
1593 | Returns the result of adding the absolute values of the single-precision
1594 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1595 | before being returned.  `zSign' is ignored if the result is a NaN.
1596 | The addition is performed according to the IEC/IEEE Standard for Binary
1597 | Floating-Point Arithmetic.
1598 *----------------------------------------------------------------------------*/
1599
1600 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1601 {
1602     int16 aExp, bExp, zExp;
1603     bits32 aSig, bSig, zSig;
1604     int16 expDiff;
1605
1606     aSig = extractFloat32Frac( a );
1607     aExp = extractFloat32Exp( a );
1608     bSig = extractFloat32Frac( b );
1609     bExp = extractFloat32Exp( b );
1610     expDiff = aExp - bExp;
1611     aSig <<= 6;
1612     bSig <<= 6;
1613     if ( 0 < expDiff ) {
1614         if ( aExp == 0xFF ) {
1615             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1616             return a;
1617         }
1618         if ( bExp == 0 ) {
1619             --expDiff;
1620         }
1621         else {
1622             bSig |= 0x20000000;
1623         }
1624         shift32RightJamming( bSig, expDiff, &bSig );
1625         zExp = aExp;
1626     }
1627     else if ( expDiff < 0 ) {
1628         if ( bExp == 0xFF ) {
1629             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1630             return packFloat32( zSign, 0xFF, 0 );
1631         }
1632         if ( aExp == 0 ) {
1633             ++expDiff;
1634         }
1635         else {
1636             aSig |= 0x20000000;
1637         }
1638         shift32RightJamming( aSig, - expDiff, &aSig );
1639         zExp = bExp;
1640     }
1641     else {
1642         if ( aExp == 0xFF ) {
1643             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1644             return a;
1645         }
1646         if ( aExp == 0 ) {
1647             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1648             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1649         }
1650         zSig = 0x40000000 + aSig + bSig;
1651         zExp = aExp;
1652         goto roundAndPack;
1653     }
1654     aSig |= 0x20000000;
1655     zSig = ( aSig + bSig )<<1;
1656     --zExp;
1657     if ( (sbits32) zSig < 0 ) {
1658         zSig = aSig + bSig;
1659         ++zExp;
1660     }
1661  roundAndPack:
1662     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1663
1664 }
1665
1666 /*----------------------------------------------------------------------------
1667 | Returns the result of subtracting the absolute values of the single-
1668 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1669 | difference is negated before being returned.  `zSign' is ignored if the
1670 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1671 | Standard for Binary Floating-Point Arithmetic.
1672 *----------------------------------------------------------------------------*/
1673
1674 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1675 {
1676     int16 aExp, bExp, zExp;
1677     bits32 aSig, bSig, zSig;
1678     int16 expDiff;
1679
1680     aSig = extractFloat32Frac( a );
1681     aExp = extractFloat32Exp( a );
1682     bSig = extractFloat32Frac( b );
1683     bExp = extractFloat32Exp( b );
1684     expDiff = aExp - bExp;
1685     aSig <<= 7;
1686     bSig <<= 7;
1687     if ( 0 < expDiff ) goto aExpBigger;
1688     if ( expDiff < 0 ) goto bExpBigger;
1689     if ( aExp == 0xFF ) {
1690         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1691         float_raise( float_flag_invalid STATUS_VAR);
1692         return float32_default_nan;
1693     }
1694     if ( aExp == 0 ) {
1695         aExp = 1;
1696         bExp = 1;
1697     }
1698     if ( bSig < aSig ) goto aBigger;
1699     if ( aSig < bSig ) goto bBigger;
1700     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1701  bExpBigger:
1702     if ( bExp == 0xFF ) {
1703         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1704         return packFloat32( zSign ^ 1, 0xFF, 0 );
1705     }
1706     if ( aExp == 0 ) {
1707         ++expDiff;
1708     }
1709     else {
1710         aSig |= 0x40000000;
1711     }
1712     shift32RightJamming( aSig, - expDiff, &aSig );
1713     bSig |= 0x40000000;
1714  bBigger:
1715     zSig = bSig - aSig;
1716     zExp = bExp;
1717     zSign ^= 1;
1718     goto normalizeRoundAndPack;
1719  aExpBigger:
1720     if ( aExp == 0xFF ) {
1721         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1722         return a;
1723     }
1724     if ( bExp == 0 ) {
1725         --expDiff;
1726     }
1727     else {
1728         bSig |= 0x40000000;
1729     }
1730     shift32RightJamming( bSig, expDiff, &bSig );
1731     aSig |= 0x40000000;
1732  aBigger:
1733     zSig = aSig - bSig;
1734     zExp = aExp;
1735  normalizeRoundAndPack:
1736     --zExp;
1737     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1738
1739 }
1740
1741 /*----------------------------------------------------------------------------
1742 | Returns the result of adding the single-precision floating-point values `a'
1743 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1744 | Binary Floating-Point Arithmetic.
1745 *----------------------------------------------------------------------------*/
1746
1747 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1748 {
1749     flag aSign, bSign;
1750
1751     aSign = extractFloat32Sign( a );
1752     bSign = extractFloat32Sign( b );
1753     if ( aSign == bSign ) {
1754         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1755     }
1756     else {
1757         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1758     }
1759
1760 }
1761
1762 /*----------------------------------------------------------------------------
1763 | Returns the result of subtracting the single-precision floating-point values
1764 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1765 | for Binary Floating-Point Arithmetic.
1766 *----------------------------------------------------------------------------*/
1767
1768 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1769 {
1770     flag aSign, bSign;
1771
1772     aSign = extractFloat32Sign( a );
1773     bSign = extractFloat32Sign( b );
1774     if ( aSign == bSign ) {
1775         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1776     }
1777     else {
1778         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1779     }
1780
1781 }
1782
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of multiplying the single-precision floating-point values
1785 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1786 | for Binary Floating-Point Arithmetic.
1787 *----------------------------------------------------------------------------*/
1788
1789 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1790 {
1791     flag aSign, bSign, zSign;
1792     int16 aExp, bExp, zExp;
1793     bits32 aSig, bSig;
1794     bits64 zSig64;
1795     bits32 zSig;
1796
1797     aSig = extractFloat32Frac( a );
1798     aExp = extractFloat32Exp( a );
1799     aSign = extractFloat32Sign( a );
1800     bSig = extractFloat32Frac( b );
1801     bExp = extractFloat32Exp( b );
1802     bSign = extractFloat32Sign( b );
1803     zSign = aSign ^ bSign;
1804     if ( aExp == 0xFF ) {
1805         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1806             return propagateFloat32NaN( a, b STATUS_VAR );
1807         }
1808         if ( ( bExp | bSig ) == 0 ) {
1809             float_raise( float_flag_invalid STATUS_VAR);
1810             return float32_default_nan;
1811         }
1812         return packFloat32( zSign, 0xFF, 0 );
1813     }
1814     if ( bExp == 0xFF ) {
1815         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816         if ( ( aExp | aSig ) == 0 ) {
1817             float_raise( float_flag_invalid STATUS_VAR);
1818             return float32_default_nan;
1819         }
1820         return packFloat32( zSign, 0xFF, 0 );
1821     }
1822     if ( aExp == 0 ) {
1823         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1824         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1825     }
1826     if ( bExp == 0 ) {
1827         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1828         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1829     }
1830     zExp = aExp + bExp - 0x7F;
1831     aSig = ( aSig | 0x00800000 )<<7;
1832     bSig = ( bSig | 0x00800000 )<<8;
1833     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1834     zSig = zSig64;
1835     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1836         zSig <<= 1;
1837         --zExp;
1838     }
1839     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1840
1841 }
1842
1843 /*----------------------------------------------------------------------------
1844 | Returns the result of dividing the single-precision floating-point value `a'
1845 | by the corresponding value `b'.  The operation is performed according to the
1846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847 *----------------------------------------------------------------------------*/
1848
1849 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1850 {
1851     flag aSign, bSign, zSign;
1852     int16 aExp, bExp, zExp;
1853     bits32 aSig, bSig, zSig;
1854
1855     aSig = extractFloat32Frac( a );
1856     aExp = extractFloat32Exp( a );
1857     aSign = extractFloat32Sign( a );
1858     bSig = extractFloat32Frac( b );
1859     bExp = extractFloat32Exp( b );
1860     bSign = extractFloat32Sign( b );
1861     zSign = aSign ^ bSign;
1862     if ( aExp == 0xFF ) {
1863         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864         if ( bExp == 0xFF ) {
1865             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1866             float_raise( float_flag_invalid STATUS_VAR);
1867             return float32_default_nan;
1868         }
1869         return packFloat32( zSign, 0xFF, 0 );
1870     }
1871     if ( bExp == 0xFF ) {
1872         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1873         return packFloat32( zSign, 0, 0 );
1874     }
1875     if ( bExp == 0 ) {
1876         if ( bSig == 0 ) {
1877             if ( ( aExp | aSig ) == 0 ) {
1878                 float_raise( float_flag_invalid STATUS_VAR);
1879                 return float32_default_nan;
1880             }
1881             float_raise( float_flag_divbyzero STATUS_VAR);
1882             return packFloat32( zSign, 0xFF, 0 );
1883         }
1884         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1885     }
1886     if ( aExp == 0 ) {
1887         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1889     }
1890     zExp = aExp - bExp + 0x7D;
1891     aSig = ( aSig | 0x00800000 )<<7;
1892     bSig = ( bSig | 0x00800000 )<<8;
1893     if ( bSig <= ( aSig + aSig ) ) {
1894         aSig >>= 1;
1895         ++zExp;
1896     }
1897     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1898     if ( ( zSig & 0x3F ) == 0 ) {
1899         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1900     }
1901     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1902
1903 }
1904
1905 /*----------------------------------------------------------------------------
1906 | Returns the remainder of the single-precision floating-point value `a'
1907 | with respect to the corresponding value `b'.  The operation is performed
1908 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909 *----------------------------------------------------------------------------*/
1910
1911 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1912 {
1913     flag aSign, bSign, zSign;
1914     int16 aExp, bExp, expDiff;
1915     bits32 aSig, bSig;
1916     bits32 q;
1917     bits64 aSig64, bSig64, q64;
1918     bits32 alternateASig;
1919     sbits32 sigMean;
1920
1921     aSig = extractFloat32Frac( a );
1922     aExp = extractFloat32Exp( a );
1923     aSign = extractFloat32Sign( a );
1924     bSig = extractFloat32Frac( b );
1925     bExp = extractFloat32Exp( b );
1926     bSign = extractFloat32Sign( b );
1927     if ( aExp == 0xFF ) {
1928         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1929             return propagateFloat32NaN( a, b STATUS_VAR );
1930         }
1931         float_raise( float_flag_invalid STATUS_VAR);
1932         return float32_default_nan;
1933     }
1934     if ( bExp == 0xFF ) {
1935         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1936         return a;
1937     }
1938     if ( bExp == 0 ) {
1939         if ( bSig == 0 ) {
1940             float_raise( float_flag_invalid STATUS_VAR);
1941             return float32_default_nan;
1942         }
1943         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1944     }
1945     if ( aExp == 0 ) {
1946         if ( aSig == 0 ) return a;
1947         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1948     }
1949     expDiff = aExp - bExp;
1950     aSig |= 0x00800000;
1951     bSig |= 0x00800000;
1952     if ( expDiff < 32 ) {
1953         aSig <<= 8;
1954         bSig <<= 8;
1955         if ( expDiff < 0 ) {
1956             if ( expDiff < -1 ) return a;
1957             aSig >>= 1;
1958         }
1959         q = ( bSig <= aSig );
1960         if ( q ) aSig -= bSig;
1961         if ( 0 < expDiff ) {
1962             q = ( ( (bits64) aSig )<<32 ) / bSig;
1963             q >>= 32 - expDiff;
1964             bSig >>= 2;
1965             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1966         }
1967         else {
1968             aSig >>= 2;
1969             bSig >>= 2;
1970         }
1971     }
1972     else {
1973         if ( bSig <= aSig ) aSig -= bSig;
1974         aSig64 = ( (bits64) aSig )<<40;
1975         bSig64 = ( (bits64) bSig )<<40;
1976         expDiff -= 64;
1977         while ( 0 < expDiff ) {
1978             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1979             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1980             aSig64 = - ( ( bSig * q64 )<<38 );
1981             expDiff -= 62;
1982         }
1983         expDiff += 64;
1984         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1985         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1986         q = q64>>( 64 - expDiff );
1987         bSig <<= 6;
1988         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1989     }
1990     do {
1991         alternateASig = aSig;
1992         ++q;
1993         aSig -= bSig;
1994     } while ( 0 <= (sbits32) aSig );
1995     sigMean = aSig + alternateASig;
1996     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1997         aSig = alternateASig;
1998     }
1999     zSign = ( (sbits32) aSig < 0 );
2000     if ( zSign ) aSig = - aSig;
2001     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2002
2003 }
2004
2005 /*----------------------------------------------------------------------------
2006 | Returns the square root of the single-precision floating-point value `a'.
2007 | The operation is performed according to the IEC/IEEE Standard for Binary
2008 | Floating-Point Arithmetic.
2009 *----------------------------------------------------------------------------*/
2010
2011 float32 float32_sqrt( float32 a STATUS_PARAM )
2012 {
2013     flag aSign;
2014     int16 aExp, zExp;
2015     bits32 aSig, zSig;
2016     bits64 rem, term;
2017
2018     aSig = extractFloat32Frac( a );
2019     aExp = extractFloat32Exp( a );
2020     aSign = extractFloat32Sign( a );
2021     if ( aExp == 0xFF ) {
2022         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2023         if ( ! aSign ) return a;
2024         float_raise( float_flag_invalid STATUS_VAR);
2025         return float32_default_nan;
2026     }
2027     if ( aSign ) {
2028         if ( ( aExp | aSig ) == 0 ) return a;
2029         float_raise( float_flag_invalid STATUS_VAR);
2030         return float32_default_nan;
2031     }
2032     if ( aExp == 0 ) {
2033         if ( aSig == 0 ) return float32_zero;
2034         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2035     }
2036     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2037     aSig = ( aSig | 0x00800000 )<<8;
2038     zSig = estimateSqrt32( aExp, aSig ) + 2;
2039     if ( ( zSig & 0x7F ) <= 5 ) {
2040         if ( zSig < 2 ) {
2041             zSig = 0x7FFFFFFF;
2042             goto roundAndPack;
2043         }
2044         aSig >>= aExp & 1;
2045         term = ( (bits64) zSig ) * zSig;
2046         rem = ( ( (bits64) aSig )<<32 ) - term;
2047         while ( (sbits64) rem < 0 ) {
2048             --zSig;
2049             rem += ( ( (bits64) zSig )<<1 ) | 1;
2050         }
2051         zSig |= ( rem != 0 );
2052     }
2053     shift32RightJamming( zSig, 1, &zSig );
2054  roundAndPack:
2055     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2056
2057 }
2058
2059 /*----------------------------------------------------------------------------
2060 | Returns the binary log of the single-precision floating-point value `a'.
2061 | The operation is performed according to the IEC/IEEE Standard for Binary
2062 | Floating-Point Arithmetic.
2063 *----------------------------------------------------------------------------*/
2064 float32 float32_log2( float32 a STATUS_PARAM )
2065 {
2066     flag aSign, zSign;
2067     int16 aExp;
2068     bits32 aSig, zSig, i;
2069
2070     aSig = extractFloat32Frac( a );
2071     aExp = extractFloat32Exp( a );
2072     aSign = extractFloat32Sign( a );
2073
2074     if ( aExp == 0 ) {
2075         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2076         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2077     }
2078     if ( aSign ) {
2079         float_raise( float_flag_invalid STATUS_VAR);
2080         return float32_default_nan;
2081     }
2082     if ( aExp == 0xFF ) {
2083         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2084         return a;
2085     }
2086
2087     aExp -= 0x7F;
2088     aSig |= 0x00800000;
2089     zSign = aExp < 0;
2090     zSig = aExp << 23;
2091
2092     for (i = 1 << 22; i > 0; i >>= 1) {
2093         aSig = ( (bits64)aSig * aSig ) >> 23;
2094         if ( aSig & 0x01000000 ) {
2095             aSig >>= 1;
2096             zSig |= i;
2097         }
2098     }
2099
2100     if ( zSign )
2101         zSig = -zSig;
2102
2103     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2104 }
2105
2106 /*----------------------------------------------------------------------------
2107 | Returns 1 if the single-precision floating-point value `a' is equal to
2108 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2110 *----------------------------------------------------------------------------*/
2111
2112 int float32_eq( float32 a, float32 b STATUS_PARAM )
2113 {
2114
2115     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2116          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2117        ) {
2118         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2119             float_raise( float_flag_invalid STATUS_VAR);
2120         }
2121         return 0;
2122     }
2123     return ( float32_val(a) == float32_val(b) ) ||
2124             ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2125
2126 }
2127
2128 /*----------------------------------------------------------------------------
2129 | Returns 1 if the single-precision floating-point value `a' is less than
2130 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2131 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2132 | Arithmetic.
2133 *----------------------------------------------------------------------------*/
2134
2135 int float32_le( float32 a, float32 b STATUS_PARAM )
2136 {
2137     flag aSign, bSign;
2138     bits32 av, bv;
2139
2140     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2141          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2142        ) {
2143         float_raise( float_flag_invalid STATUS_VAR);
2144         return 0;
2145     }
2146     aSign = extractFloat32Sign( a );
2147     bSign = extractFloat32Sign( b );
2148     av = float32_val(a);
2149     bv = float32_val(b);
2150     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2151     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2152
2153 }
2154
2155 /*----------------------------------------------------------------------------
2156 | Returns 1 if the single-precision floating-point value `a' is less than
2157 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2158 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2159 *----------------------------------------------------------------------------*/
2160
2161 int float32_lt( float32 a, float32 b STATUS_PARAM )
2162 {
2163     flag aSign, bSign;
2164     bits32 av, bv;
2165
2166     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2167          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2168        ) {
2169         float_raise( float_flag_invalid STATUS_VAR);
2170         return 0;
2171     }
2172     aSign = extractFloat32Sign( a );
2173     bSign = extractFloat32Sign( b );
2174     av = float32_val(a);
2175     bv = float32_val(b);
2176     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2177     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2178
2179 }
2180
2181 /*----------------------------------------------------------------------------
2182 | Returns 1 if the single-precision floating-point value `a' is equal to
2183 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2184 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2185 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2186 *----------------------------------------------------------------------------*/
2187
2188 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2189 {
2190     bits32 av, bv;
2191
2192     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2193          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2194        ) {
2195         float_raise( float_flag_invalid STATUS_VAR);
2196         return 0;
2197     }
2198     av = float32_val(a);
2199     bv = float32_val(b);
2200     return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2201
2202 }
2203
2204 /*----------------------------------------------------------------------------
2205 | Returns 1 if the single-precision floating-point value `a' is less than or
2206 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2207 | cause an exception.  Otherwise, the comparison is performed according to the
2208 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2209 *----------------------------------------------------------------------------*/
2210
2211 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2212 {
2213     flag aSign, bSign;
2214     bits32 av, bv;
2215
2216     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2217          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2218        ) {
2219         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2220             float_raise( float_flag_invalid STATUS_VAR);
2221         }
2222         return 0;
2223     }
2224     aSign = extractFloat32Sign( a );
2225     bSign = extractFloat32Sign( b );
2226     av = float32_val(a);
2227     bv = float32_val(b);
2228     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2229     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2230
2231 }
2232
2233 /*----------------------------------------------------------------------------
2234 | Returns 1 if the single-precision floating-point value `a' is less than
2235 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2236 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2237 | Standard for Binary Floating-Point Arithmetic.
2238 *----------------------------------------------------------------------------*/
2239
2240 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2241 {
2242     flag aSign, bSign;
2243     bits32 av, bv;
2244
2245     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2246          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2247        ) {
2248         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2249             float_raise( float_flag_invalid STATUS_VAR);
2250         }
2251         return 0;
2252     }
2253     aSign = extractFloat32Sign( a );
2254     bSign = extractFloat32Sign( b );
2255     av = float32_val(a);
2256     bv = float32_val(b);
2257     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2258     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2259
2260 }
2261
2262 /*----------------------------------------------------------------------------
2263 | Returns the result of converting the double-precision floating-point value
2264 | `a' to the 32-bit two's complement integer format.  The conversion is
2265 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2266 | Arithmetic---which means in particular that the conversion is rounded
2267 | according to the current rounding mode.  If `a' is a NaN, the largest
2268 | positive integer is returned.  Otherwise, if the conversion overflows, the
2269 | largest integer with the same sign as `a' is returned.
2270 *----------------------------------------------------------------------------*/
2271
2272 int32 float64_to_int32( float64 a STATUS_PARAM )
2273 {
2274     flag aSign;
2275     int16 aExp, shiftCount;
2276     bits64 aSig;
2277
2278     aSig = extractFloat64Frac( a );
2279     aExp = extractFloat64Exp( a );
2280     aSign = extractFloat64Sign( a );
2281     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2282     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2283     shiftCount = 0x42C - aExp;
2284     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2285     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2286
2287 }
2288
2289 /*----------------------------------------------------------------------------
2290 | Returns the result of converting the double-precision floating-point value
2291 | `a' to the 32-bit two's complement integer format.  The conversion is
2292 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2293 | Arithmetic, except that the conversion is always rounded toward zero.
2294 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2295 | the conversion overflows, the largest integer with the same sign as `a' is
2296 | returned.
2297 *----------------------------------------------------------------------------*/
2298
2299 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2300 {
2301     flag aSign;
2302     int16 aExp, shiftCount;
2303     bits64 aSig, savedASig;
2304     int32 z;
2305
2306     aSig = extractFloat64Frac( a );
2307     aExp = extractFloat64Exp( a );
2308     aSign = extractFloat64Sign( a );
2309     if ( 0x41E < aExp ) {
2310         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2311         goto invalid;
2312     }
2313     else if ( aExp < 0x3FF ) {
2314         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2315         return 0;
2316     }
2317     aSig |= LIT64( 0x0010000000000000 );
2318     shiftCount = 0x433 - aExp;
2319     savedASig = aSig;
2320     aSig >>= shiftCount;
2321     z = aSig;
2322     if ( aSign ) z = - z;
2323     if ( ( z < 0 ) ^ aSign ) {
2324  invalid:
2325         float_raise( float_flag_invalid STATUS_VAR);
2326         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2327     }
2328     if ( ( aSig<<shiftCount ) != savedASig ) {
2329         STATUS(float_exception_flags) |= float_flag_inexact;
2330     }
2331     return z;
2332
2333 }
2334
2335 /*----------------------------------------------------------------------------
2336 | Returns the result of converting the double-precision floating-point value
2337 | `a' to the 64-bit two's complement integer format.  The conversion is
2338 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2339 | Arithmetic---which means in particular that the conversion is rounded
2340 | according to the current rounding mode.  If `a' is a NaN, the largest
2341 | positive integer is returned.  Otherwise, if the conversion overflows, the
2342 | largest integer with the same sign as `a' is returned.
2343 *----------------------------------------------------------------------------*/
2344
2345 int64 float64_to_int64( float64 a STATUS_PARAM )
2346 {
2347     flag aSign;
2348     int16 aExp, shiftCount;
2349     bits64 aSig, aSigExtra;
2350
2351     aSig = extractFloat64Frac( a );
2352     aExp = extractFloat64Exp( a );
2353     aSign = extractFloat64Sign( a );
2354     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2355     shiftCount = 0x433 - aExp;
2356     if ( shiftCount <= 0 ) {
2357         if ( 0x43E < aExp ) {
2358             float_raise( float_flag_invalid STATUS_VAR);
2359             if (    ! aSign
2360                  || (    ( aExp == 0x7FF )
2361                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2362                ) {
2363                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2364             }
2365             return (sbits64) LIT64( 0x8000000000000000 );
2366         }
2367         aSigExtra = 0;
2368         aSig <<= - shiftCount;
2369     }
2370     else {
2371         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2372     }
2373     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2374
2375 }
2376
2377 /*----------------------------------------------------------------------------
2378 | Returns the result of converting the double-precision floating-point value
2379 | `a' to the 64-bit two's complement integer format.  The conversion is
2380 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2381 | Arithmetic, except that the conversion is always rounded toward zero.
2382 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2383 | the conversion overflows, the largest integer with the same sign as `a' is
2384 | returned.
2385 *----------------------------------------------------------------------------*/
2386
2387 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2388 {
2389     flag aSign;
2390     int16 aExp, shiftCount;
2391     bits64 aSig;
2392     int64 z;
2393
2394     aSig = extractFloat64Frac( a );
2395     aExp = extractFloat64Exp( a );
2396     aSign = extractFloat64Sign( a );
2397     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2398     shiftCount = aExp - 0x433;
2399     if ( 0 <= shiftCount ) {
2400         if ( 0x43E <= aExp ) {
2401             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2402                 float_raise( float_flag_invalid STATUS_VAR);
2403                 if (    ! aSign
2404                      || (    ( aExp == 0x7FF )
2405                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2406                    ) {
2407                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2408                 }
2409             }
2410             return (sbits64) LIT64( 0x8000000000000000 );
2411         }
2412         z = aSig<<shiftCount;
2413     }
2414     else {
2415         if ( aExp < 0x3FE ) {
2416             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2417             return 0;
2418         }
2419         z = aSig>>( - shiftCount );
2420         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2421             STATUS(float_exception_flags) |= float_flag_inexact;
2422         }
2423     }
2424     if ( aSign ) z = - z;
2425     return z;
2426
2427 }
2428
2429 /*----------------------------------------------------------------------------
2430 | Returns the result of converting the double-precision floating-point value
2431 | `a' to the single-precision floating-point format.  The conversion is
2432 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2433 | Arithmetic.
2434 *----------------------------------------------------------------------------*/
2435
2436 float32 float64_to_float32( float64 a STATUS_PARAM )
2437 {
2438     flag aSign;
2439     int16 aExp;
2440     bits64 aSig;
2441     bits32 zSig;
2442
2443     aSig = extractFloat64Frac( a );
2444     aExp = extractFloat64Exp( a );
2445     aSign = extractFloat64Sign( a );
2446     if ( aExp == 0x7FF ) {
2447         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2448         return packFloat32( aSign, 0xFF, 0 );
2449     }
2450     shift64RightJamming( aSig, 22, &aSig );
2451     zSig = aSig;
2452     if ( aExp || zSig ) {
2453         zSig |= 0x40000000;
2454         aExp -= 0x381;
2455     }
2456     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2457
2458 }
2459
2460
2461 /*----------------------------------------------------------------------------
2462 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2463 | half-precision floating-point value, returning the result.  After being
2464 | shifted into the proper positions, the three fields are simply added
2465 | together to form the result.  This means that any integer portion of `zSig'
2466 | will be added into the exponent.  Since a properly normalized significand
2467 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2468 | than the desired result exponent whenever `zSig' is a complete, normalized
2469 | significand.
2470 *----------------------------------------------------------------------------*/
2471 static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2472 {
2473     return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2474 }
2475
2476 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2477    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2478
2479 float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2480 {
2481     flag aSign;
2482     int16 aExp;
2483     bits32 aSig;
2484
2485     aSign = a >> 15;
2486     aExp = (a >> 10) & 0x1f;
2487     aSig = a & 0x3ff;
2488
2489     if (aExp == 0x1f && ieee) {
2490         if (aSig) {
2491             /* Make sure correct exceptions are raised.  */
2492             float32ToCommonNaN(a STATUS_VAR);
2493             aSig |= 0x200;
2494         }
2495         return packFloat32(aSign, 0xff, aSig << 13);
2496     }
2497     if (aExp == 0) {
2498         int8 shiftCount;
2499
2500         if (aSig == 0) {
2501             return packFloat32(aSign, 0, 0);
2502         }
2503
2504         shiftCount = countLeadingZeros32( aSig ) - 21;
2505         aSig = aSig << shiftCount;
2506         aExp = -shiftCount;
2507     }
2508     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2509 }
2510
2511 bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
2512 {
2513     flag aSign;
2514     int16 aExp;
2515     bits32 aSig;
2516     bits32 mask;
2517     bits32 increment;
2518     int8 roundingMode;
2519
2520     aSig = extractFloat32Frac( a );
2521     aExp = extractFloat32Exp( a );
2522     aSign = extractFloat32Sign( a );
2523     if ( aExp == 0xFF ) {
2524         if (aSig) {
2525             /* Make sure correct exceptions are raised.  */
2526             float32ToCommonNaN(a STATUS_VAR);
2527             aSig |= 0x00400000;
2528         }
2529         return packFloat16(aSign, 0x1f, aSig >> 13);
2530     }
2531     if (aExp == 0 && aSign == 0) {
2532         return packFloat16(aSign, 0, 0);
2533     }
2534     /* Decimal point between bits 22 and 23.  */
2535     aSig |= 0x00800000;
2536     aExp -= 0x7f;
2537     if (aExp < -14) {
2538         mask = 0x007fffff;
2539         if (aExp < -24) {
2540             aExp = -25;
2541         } else {
2542             mask >>= 24 + aExp;
2543         }
2544     } else {
2545         mask = 0x00001fff;
2546     }
2547     if (aSig & mask) {
2548         float_raise( float_flag_underflow STATUS_VAR );
2549         roundingMode = STATUS(float_rounding_mode);
2550         switch (roundingMode) {
2551         case float_round_nearest_even:
2552             increment = (mask + 1) >> 1;
2553             if ((aSig & mask) == increment) {
2554                 increment = aSig & (increment << 1);
2555             }
2556             break;
2557         case float_round_up:
2558             increment = aSign ? 0 : mask;
2559             break;
2560         case float_round_down:
2561             increment = aSign ? mask : 0;
2562             break;
2563         default: /* round_to_zero */
2564             increment = 0;
2565             break;
2566         }
2567         aSig += increment;
2568         if (aSig >= 0x01000000) {
2569             aSig >>= 1;
2570             aExp++;
2571         }
2572     } else if (aExp < -14
2573           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2574         float_raise( float_flag_underflow STATUS_VAR);
2575     }
2576
2577     if (ieee) {
2578         if (aExp > 15) {
2579             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2580             return packFloat16(aSign, 0x1f, 0);
2581         }
2582     } else {
2583         if (aExp > 16) {
2584             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2585             return packFloat16(aSign, 0x1f, 0x3ff);
2586         }
2587     }
2588     if (aExp < -24) {
2589         return packFloat16(aSign, 0, 0);
2590     }
2591     if (aExp < -14) {
2592         aSig >>= -14 - aExp;
2593         aExp = -14;
2594     }
2595     return packFloat16(aSign, aExp + 14, aSig >> 13);
2596 }
2597
2598 #ifdef FLOATX80
2599
2600 /*----------------------------------------------------------------------------
2601 | Returns the result of converting the double-precision floating-point value
2602 | `a' to the extended double-precision floating-point format.  The conversion
2603 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2604 | Arithmetic.
2605 *----------------------------------------------------------------------------*/
2606
2607 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2608 {
2609     flag aSign;
2610     int16 aExp;
2611     bits64 aSig;
2612
2613     aSig = extractFloat64Frac( a );
2614     aExp = extractFloat64Exp( a );
2615     aSign = extractFloat64Sign( a );
2616     if ( aExp == 0x7FF ) {
2617         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2618         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2619     }
2620     if ( aExp == 0 ) {
2621         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2622         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2623     }
2624     return
2625         packFloatx80(
2626             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2627
2628 }
2629
2630 #endif
2631
2632 #ifdef FLOAT128
2633
2634 /*----------------------------------------------------------------------------
2635 | Returns the result of converting the double-precision floating-point value
2636 | `a' to the quadruple-precision floating-point format.  The conversion is
2637 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2638 | Arithmetic.
2639 *----------------------------------------------------------------------------*/
2640
2641 float128 float64_to_float128( float64 a STATUS_PARAM )
2642 {
2643     flag aSign;
2644     int16 aExp;
2645     bits64 aSig, zSig0, zSig1;
2646
2647     aSig = extractFloat64Frac( a );
2648     aExp = extractFloat64Exp( a );
2649     aSign = extractFloat64Sign( a );
2650     if ( aExp == 0x7FF ) {
2651         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2652         return packFloat128( aSign, 0x7FFF, 0, 0 );
2653     }
2654     if ( aExp == 0 ) {
2655         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2656         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2657         --aExp;
2658     }
2659     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2660     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2661
2662 }
2663
2664 #endif
2665
2666 /*----------------------------------------------------------------------------
2667 | Rounds the double-precision floating-point value `a' to an integer, and
2668 | returns the result as a double-precision floating-point value.  The
2669 | operation is performed according to the IEC/IEEE Standard for Binary
2670 | Floating-Point Arithmetic.
2671 *----------------------------------------------------------------------------*/
2672
2673 float64 float64_round_to_int( float64 a STATUS_PARAM )
2674 {
2675     flag aSign;
2676     int16 aExp;
2677     bits64 lastBitMask, roundBitsMask;
2678     int8 roundingMode;
2679     bits64 z;
2680
2681     aExp = extractFloat64Exp( a );
2682     if ( 0x433 <= aExp ) {
2683         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2684             return propagateFloat64NaN( a, a STATUS_VAR );
2685         }
2686         return a;
2687     }
2688     if ( aExp < 0x3FF ) {
2689         if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2690         STATUS(float_exception_flags) |= float_flag_inexact;
2691         aSign = extractFloat64Sign( a );
2692         switch ( STATUS(float_rounding_mode) ) {
2693          case float_round_nearest_even:
2694             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2695                 return packFloat64( aSign, 0x3FF, 0 );
2696             }
2697             break;
2698          case float_round_down:
2699             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2700          case float_round_up:
2701             return make_float64(
2702             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2703         }
2704         return packFloat64( aSign, 0, 0 );
2705     }
2706     lastBitMask = 1;
2707     lastBitMask <<= 0x433 - aExp;
2708     roundBitsMask = lastBitMask - 1;
2709     z = float64_val(a);
2710     roundingMode = STATUS(float_rounding_mode);
2711     if ( roundingMode == float_round_nearest_even ) {
2712         z += lastBitMask>>1;
2713         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2714     }
2715     else if ( roundingMode != float_round_to_zero ) {
2716         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2717             z += roundBitsMask;
2718         }
2719     }
2720     z &= ~ roundBitsMask;
2721     if ( z != float64_val(a) )
2722         STATUS(float_exception_flags) |= float_flag_inexact;
2723     return make_float64(z);
2724
2725 }
2726
2727 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2728 {
2729     int oldmode;
2730     float64 res;
2731     oldmode = STATUS(float_rounding_mode);
2732     STATUS(float_rounding_mode) = float_round_to_zero;
2733     res = float64_round_to_int(a STATUS_VAR);
2734     STATUS(float_rounding_mode) = oldmode;
2735     return res;
2736 }
2737
2738 /*----------------------------------------------------------------------------
2739 | Returns the result of adding the absolute values of the double-precision
2740 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2741 | before being returned.  `zSign' is ignored if the result is a NaN.
2742 | The addition is performed according to the IEC/IEEE Standard for Binary
2743 | Floating-Point Arithmetic.
2744 *----------------------------------------------------------------------------*/
2745
2746 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2747 {
2748     int16 aExp, bExp, zExp;
2749     bits64 aSig, bSig, zSig;
2750     int16 expDiff;
2751
2752     aSig = extractFloat64Frac( a );
2753     aExp = extractFloat64Exp( a );
2754     bSig = extractFloat64Frac( b );
2755     bExp = extractFloat64Exp( b );
2756     expDiff = aExp - bExp;
2757     aSig <<= 9;
2758     bSig <<= 9;
2759     if ( 0 < expDiff ) {
2760         if ( aExp == 0x7FF ) {
2761             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2762             return a;
2763         }
2764         if ( bExp == 0 ) {
2765             --expDiff;
2766         }
2767         else {
2768             bSig |= LIT64( 0x2000000000000000 );
2769         }
2770         shift64RightJamming( bSig, expDiff, &bSig );
2771         zExp = aExp;
2772     }
2773     else if ( expDiff < 0 ) {
2774         if ( bExp == 0x7FF ) {
2775             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2776             return packFloat64( zSign, 0x7FF, 0 );
2777         }
2778         if ( aExp == 0 ) {
2779             ++expDiff;
2780         }
2781         else {
2782             aSig |= LIT64( 0x2000000000000000 );
2783         }
2784         shift64RightJamming( aSig, - expDiff, &aSig );
2785         zExp = bExp;
2786     }
2787     else {
2788         if ( aExp == 0x7FF ) {
2789             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2790             return a;
2791         }
2792         if ( aExp == 0 ) {
2793             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2794             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2795         }
2796         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2797         zExp = aExp;
2798         goto roundAndPack;
2799     }
2800     aSig |= LIT64( 0x2000000000000000 );
2801     zSig = ( aSig + bSig )<<1;
2802     --zExp;
2803     if ( (sbits64) zSig < 0 ) {
2804         zSig = aSig + bSig;
2805         ++zExp;
2806     }
2807  roundAndPack:
2808     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2809
2810 }
2811
2812 /*----------------------------------------------------------------------------
2813 | Returns the result of subtracting the absolute values of the double-
2814 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2815 | difference is negated before being returned.  `zSign' is ignored if the
2816 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2817 | Standard for Binary Floating-Point Arithmetic.
2818 *----------------------------------------------------------------------------*/
2819
2820 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2821 {
2822     int16 aExp, bExp, zExp;
2823     bits64 aSig, bSig, zSig;
2824     int16 expDiff;
2825
2826     aSig = extractFloat64Frac( a );
2827     aExp = extractFloat64Exp( a );
2828     bSig = extractFloat64Frac( b );
2829     bExp = extractFloat64Exp( b );
2830     expDiff = aExp - bExp;
2831     aSig <<= 10;
2832     bSig <<= 10;
2833     if ( 0 < expDiff ) goto aExpBigger;
2834     if ( expDiff < 0 ) goto bExpBigger;
2835     if ( aExp == 0x7FF ) {
2836         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2837         float_raise( float_flag_invalid STATUS_VAR);
2838         return float64_default_nan;
2839     }
2840     if ( aExp == 0 ) {
2841         aExp = 1;
2842         bExp = 1;
2843     }
2844     if ( bSig < aSig ) goto aBigger;
2845     if ( aSig < bSig ) goto bBigger;
2846     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2847  bExpBigger:
2848     if ( bExp == 0x7FF ) {
2849         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2850         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2851     }
2852     if ( aExp == 0 ) {
2853         ++expDiff;
2854     }
2855     else {
2856         aSig |= LIT64( 0x4000000000000000 );
2857     }
2858     shift64RightJamming( aSig, - expDiff, &aSig );
2859     bSig |= LIT64( 0x4000000000000000 );
2860  bBigger:
2861     zSig = bSig - aSig;
2862     zExp = bExp;
2863     zSign ^= 1;
2864     goto normalizeRoundAndPack;
2865  aExpBigger:
2866     if ( aExp == 0x7FF ) {
2867         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2868         return a;
2869     }
2870     if ( bExp == 0 ) {
2871         --expDiff;
2872     }
2873     else {
2874         bSig |= LIT64( 0x4000000000000000 );
2875     }
2876     shift64RightJamming( bSig, expDiff, &bSig );
2877     aSig |= LIT64( 0x4000000000000000 );
2878  aBigger:
2879     zSig = aSig - bSig;
2880     zExp = aExp;
2881  normalizeRoundAndPack:
2882     --zExp;
2883     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2884
2885 }
2886
2887 /*----------------------------------------------------------------------------
2888 | Returns the result of adding the double-precision floating-point values `a'
2889 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2890 | Binary Floating-Point Arithmetic.
2891 *----------------------------------------------------------------------------*/
2892
2893 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2894 {
2895     flag aSign, bSign;
2896
2897     aSign = extractFloat64Sign( a );
2898     bSign = extractFloat64Sign( b );
2899     if ( aSign == bSign ) {
2900         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2901     }
2902     else {
2903         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2904     }
2905
2906 }
2907
2908 /*----------------------------------------------------------------------------
2909 | Returns the result of subtracting the double-precision floating-point values
2910 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2911 | for Binary Floating-Point Arithmetic.
2912 *----------------------------------------------------------------------------*/
2913
2914 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2915 {
2916     flag aSign, bSign;
2917
2918     aSign = extractFloat64Sign( a );
2919     bSign = extractFloat64Sign( b );
2920     if ( aSign == bSign ) {
2921         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2922     }
2923     else {
2924         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2925     }
2926
2927 }
2928
2929 /*----------------------------------------------------------------------------
2930 | Returns the result of multiplying the double-precision floating-point values
2931 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2932 | for Binary Floating-Point Arithmetic.
2933 *----------------------------------------------------------------------------*/
2934
2935 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2936 {
2937     flag aSign, bSign, zSign;
2938     int16 aExp, bExp, zExp;
2939     bits64 aSig, bSig, zSig0, zSig1;
2940
2941     aSig = extractFloat64Frac( a );
2942     aExp = extractFloat64Exp( a );
2943     aSign = extractFloat64Sign( a );
2944     bSig = extractFloat64Frac( b );
2945     bExp = extractFloat64Exp( b );
2946     bSign = extractFloat64Sign( b );
2947     zSign = aSign ^ bSign;
2948     if ( aExp == 0x7FF ) {
2949         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2950             return propagateFloat64NaN( a, b STATUS_VAR );
2951         }
2952         if ( ( bExp | bSig ) == 0 ) {
2953             float_raise( float_flag_invalid STATUS_VAR);
2954             return float64_default_nan;
2955         }
2956         return packFloat64( zSign, 0x7FF, 0 );
2957     }
2958     if ( bExp == 0x7FF ) {
2959         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2960         if ( ( aExp | aSig ) == 0 ) {
2961             float_raise( float_flag_invalid STATUS_VAR);
2962             return float64_default_nan;
2963         }
2964         return packFloat64( zSign, 0x7FF, 0 );
2965     }
2966     if ( aExp == 0 ) {
2967         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2968         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2969     }
2970     if ( bExp == 0 ) {
2971         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2972         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2973     }
2974     zExp = aExp + bExp - 0x3FF;
2975     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2976     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2977     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2978     zSig0 |= ( zSig1 != 0 );
2979     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2980         zSig0 <<= 1;
2981         --zExp;
2982     }
2983     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2984
2985 }
2986
2987 /*----------------------------------------------------------------------------
2988 | Returns the result of dividing the double-precision floating-point value `a'
2989 | by the corresponding value `b'.  The operation is performed according to
2990 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2991 *----------------------------------------------------------------------------*/
2992
2993 float64 float64_div( float64 a, float64 b STATUS_PARAM )
2994 {
2995     flag aSign, bSign, zSign;
2996     int16 aExp, bExp, zExp;
2997     bits64 aSig, bSig, zSig;
2998     bits64 rem0, rem1;
2999     bits64 term0, term1;
3000
3001     aSig = extractFloat64Frac( a );
3002     aExp = extractFloat64Exp( a );
3003     aSign = extractFloat64Sign( a );
3004     bSig = extractFloat64Frac( b );
3005     bExp = extractFloat64Exp( b );
3006     bSign = extractFloat64Sign( b );
3007     zSign = aSign ^ bSign;
3008     if ( aExp == 0x7FF ) {
3009         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3010         if ( bExp == 0x7FF ) {
3011             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3012             float_raise( float_flag_invalid STATUS_VAR);
3013             return float64_default_nan;
3014         }
3015         return packFloat64( zSign, 0x7FF, 0 );
3016     }
3017     if ( bExp == 0x7FF ) {
3018         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3019         return packFloat64( zSign, 0, 0 );
3020     }
3021     if ( bExp == 0 ) {
3022         if ( bSig == 0 ) {
3023             if ( ( aExp | aSig ) == 0 ) {
3024                 float_raise( float_flag_invalid STATUS_VAR);
3025                 return float64_default_nan;
3026             }
3027             float_raise( float_flag_divbyzero STATUS_VAR);
3028             return packFloat64( zSign, 0x7FF, 0 );
3029         }
3030         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3031     }
3032     if ( aExp == 0 ) {
3033         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3034         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3035     }
3036     zExp = aExp - bExp + 0x3FD;
3037     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3038     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3039     if ( bSig <= ( aSig + aSig ) ) {
3040         aSig >>= 1;
3041         ++zExp;
3042     }
3043     zSig = estimateDiv128To64( aSig, 0, bSig );
3044     if ( ( zSig & 0x1FF ) <= 2 ) {
3045         mul64To128( bSig, zSig, &term0, &term1 );
3046         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3047         while ( (sbits64) rem0 < 0 ) {
3048             --zSig;
3049             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3050         }
3051         zSig |= ( rem1 != 0 );
3052     }
3053     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3054
3055 }
3056
3057 /*----------------------------------------------------------------------------
3058 | Returns the remainder of the double-precision floating-point value `a'
3059 | with respect to the corresponding value `b'.  The operation is performed
3060 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3061 *----------------------------------------------------------------------------*/
3062
3063 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3064 {
3065     flag aSign, bSign, zSign;
3066     int16 aExp, bExp, expDiff;
3067     bits64 aSig, bSig;
3068     bits64 q, alternateASig;
3069     sbits64 sigMean;
3070
3071     aSig = extractFloat64Frac( a );
3072     aExp = extractFloat64Exp( a );
3073     aSign = extractFloat64Sign( a );
3074     bSig = extractFloat64Frac( b );
3075     bExp = extractFloat64Exp( b );
3076     bSign = extractFloat64Sign( b );
3077     if ( aExp == 0x7FF ) {
3078         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3079             return propagateFloat64NaN( a, b STATUS_VAR );
3080         }
3081         float_raise( float_flag_invalid STATUS_VAR);
3082         return float64_default_nan;
3083     }
3084     if ( bExp == 0x7FF ) {
3085         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3086         return a;
3087     }
3088     if ( bExp == 0 ) {
3089         if ( bSig == 0 ) {
3090             float_raise( float_flag_invalid STATUS_VAR);
3091             return float64_default_nan;
3092         }
3093         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3094     }
3095     if ( aExp == 0 ) {
3096         if ( aSig == 0 ) return a;
3097         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3098     }
3099     expDiff = aExp - bExp;
3100     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3101     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3102     if ( expDiff < 0 ) {
3103         if ( expDiff < -1 ) return a;
3104         aSig >>= 1;
3105     }
3106     q = ( bSig <= aSig );
3107     if ( q ) aSig -= bSig;
3108     expDiff -= 64;
3109     while ( 0 < expDiff ) {
3110         q = estimateDiv128To64( aSig, 0, bSig );
3111         q = ( 2 < q ) ? q - 2 : 0;
3112         aSig = - ( ( bSig>>2 ) * q );
3113         expDiff -= 62;
3114     }
3115     expDiff += 64;
3116     if ( 0 < expDiff ) {
3117         q = estimateDiv128To64( aSig, 0, bSig );
3118         q = ( 2 < q ) ? q - 2 : 0;
3119         q >>= 64 - expDiff;
3120         bSig >>= 2;
3121         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3122     }
3123     else {
3124         aSig >>= 2;
3125         bSig >>= 2;
3126     }
3127     do {
3128         alternateASig = aSig;
3129         ++q;
3130         aSig -= bSig;
3131     } while ( 0 <= (sbits64) aSig );
3132     sigMean = aSig + alternateASig;
3133     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3134         aSig = alternateASig;
3135     }
3136     zSign = ( (sbits64) aSig < 0 );
3137     if ( zSign ) aSig = - aSig;
3138     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3139
3140 }
3141
3142 /*----------------------------------------------------------------------------
3143 | Returns the square root of the double-precision floating-point value `a'.
3144 | The operation is performed according to the IEC/IEEE Standard for Binary
3145 | Floating-Point Arithmetic.
3146 *----------------------------------------------------------------------------*/
3147
3148 float64 float64_sqrt( float64 a STATUS_PARAM )
3149 {
3150     flag aSign;
3151     int16 aExp, zExp;
3152     bits64 aSig, zSig, doubleZSig;
3153     bits64 rem0, rem1, term0, term1;
3154
3155     aSig = extractFloat64Frac( a );
3156     aExp = extractFloat64Exp( a );
3157     aSign = extractFloat64Sign( a );
3158     if ( aExp == 0x7FF ) {
3159         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3160         if ( ! aSign ) return a;
3161         float_raise( float_flag_invalid STATUS_VAR);
3162         return float64_default_nan;
3163     }
3164     if ( aSign ) {
3165         if ( ( aExp | aSig ) == 0 ) return a;
3166         float_raise( float_flag_invalid STATUS_VAR);
3167         return float64_default_nan;
3168     }
3169     if ( aExp == 0 ) {
3170         if ( aSig == 0 ) return float64_zero;
3171         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3172     }
3173     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3174     aSig |= LIT64( 0x0010000000000000 );
3175     zSig = estimateSqrt32( aExp, aSig>>21 );
3176     aSig <<= 9 - ( aExp & 1 );
3177     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3178     if ( ( zSig & 0x1FF ) <= 5 ) {
3179         doubleZSig = zSig<<1;
3180         mul64To128( zSig, zSig, &term0, &term1 );
3181         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3182         while ( (sbits64) rem0 < 0 ) {
3183             --zSig;
3184             doubleZSig -= 2;
3185             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3186         }
3187         zSig |= ( ( rem0 | rem1 ) != 0 );
3188     }
3189     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3190
3191 }
3192
3193 /*----------------------------------------------------------------------------
3194 | Returns the binary log of the double-precision floating-point value `a'.
3195 | The operation is performed according to the IEC/IEEE Standard for Binary
3196 | Floating-Point Arithmetic.
3197 *----------------------------------------------------------------------------*/
3198 float64 float64_log2( float64 a STATUS_PARAM )
3199 {
3200     flag aSign, zSign;
3201     int16 aExp;
3202     bits64 aSig, aSig0, aSig1, zSig, i;
3203
3204     aSig = extractFloat64Frac( a );
3205     aExp = extractFloat64Exp( a );
3206     aSign = extractFloat64Sign( a );
3207
3208     if ( aExp == 0 ) {
3209         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3210         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3211     }
3212     if ( aSign ) {
3213         float_raise( float_flag_invalid STATUS_VAR);
3214         return float64_default_nan;
3215     }
3216     if ( aExp == 0x7FF ) {
3217         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3218         return a;
3219     }
3220
3221     aExp -= 0x3FF;
3222     aSig |= LIT64( 0x0010000000000000 );
3223     zSign = aExp < 0;
3224     zSig = (bits64)aExp << 52;
3225     for (i = 1LL << 51; i > 0; i >>= 1) {
3226         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3227         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3228         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3229             aSig >>= 1;
3230             zSig |= i;
3231         }
3232     }
3233
3234     if ( zSign )
3235         zSig = -zSig;
3236     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3237 }
3238
3239 /*----------------------------------------------------------------------------
3240 | Returns 1 if the double-precision floating-point value `a' is equal to the
3241 | corresponding value `b', and 0 otherwise.  The comparison is performed
3242 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3243 *----------------------------------------------------------------------------*/
3244
3245 int float64_eq( float64 a, float64 b STATUS_PARAM )
3246 {
3247     bits64 av, bv;
3248
3249     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3250          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3251        ) {
3252         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3253             float_raise( float_flag_invalid STATUS_VAR);
3254         }
3255         return 0;
3256     }
3257     av = float64_val(a);
3258     bv = float64_val(b);
3259     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3260
3261 }
3262
3263 /*----------------------------------------------------------------------------
3264 | Returns 1 if the double-precision floating-point value `a' is less than or
3265 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3266 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3267 | Arithmetic.
3268 *----------------------------------------------------------------------------*/
3269
3270 int float64_le( float64 a, float64 b STATUS_PARAM )
3271 {
3272     flag aSign, bSign;
3273     bits64 av, bv;
3274
3275     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3276          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3277        ) {
3278         float_raise( float_flag_invalid STATUS_VAR);
3279         return 0;
3280     }
3281     aSign = extractFloat64Sign( a );
3282     bSign = extractFloat64Sign( b );
3283     av = float64_val(a);
3284     bv = float64_val(b);
3285     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3286     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3287
3288 }
3289
3290 /*----------------------------------------------------------------------------
3291 | Returns 1 if the double-precision floating-point value `a' is less than
3292 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3293 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3294 *----------------------------------------------------------------------------*/
3295
3296 int float64_lt( float64 a, float64 b STATUS_PARAM )
3297 {
3298     flag aSign, bSign;
3299     bits64 av, bv;
3300
3301     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3302          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3303        ) {
3304         float_raise( float_flag_invalid STATUS_VAR);
3305         return 0;
3306     }
3307     aSign = extractFloat64Sign( a );
3308     bSign = extractFloat64Sign( b );
3309     av = float64_val(a);
3310     bv = float64_val(b);
3311     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3312     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3313
3314 }
3315
3316 /*----------------------------------------------------------------------------
3317 | Returns 1 if the double-precision floating-point value `a' is equal to the
3318 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3319 | if either operand is a NaN.  Otherwise, the comparison is performed
3320 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3321 *----------------------------------------------------------------------------*/
3322
3323 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3324 {
3325     bits64 av, bv;
3326
3327     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3328          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3329        ) {
3330         float_raise( float_flag_invalid STATUS_VAR);
3331         return 0;
3332     }
3333     av = float64_val(a);
3334     bv = float64_val(b);
3335     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3336
3337 }
3338
3339 /*----------------------------------------------------------------------------
3340 | Returns 1 if the double-precision floating-point value `a' is less than or
3341 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3342 | cause an exception.  Otherwise, the comparison is performed according to the
3343 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3344 *----------------------------------------------------------------------------*/
3345
3346 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3347 {
3348     flag aSign, bSign;
3349     bits64 av, bv;
3350
3351     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3352          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3353        ) {
3354         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3355             float_raise( float_flag_invalid STATUS_VAR);
3356         }
3357         return 0;
3358     }
3359     aSign = extractFloat64Sign( a );
3360     bSign = extractFloat64Sign( b );
3361     av = float64_val(a);
3362     bv = float64_val(b);
3363     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3364     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3365
3366 }
3367
3368 /*----------------------------------------------------------------------------
3369 | Returns 1 if the double-precision floating-point value `a' is less than
3370 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3371 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3372 | Standard for Binary Floating-Point Arithmetic.
3373 *----------------------------------------------------------------------------*/
3374
3375 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3376 {
3377     flag aSign, bSign;
3378     bits64 av, bv;
3379
3380     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3381          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3382        ) {
3383         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3384             float_raise( float_flag_invalid STATUS_VAR);
3385         }
3386         return 0;
3387     }
3388     aSign = extractFloat64Sign( a );
3389     bSign = extractFloat64Sign( b );
3390     av = float64_val(a);
3391     bv = float64_val(b);
3392     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3393     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3394
3395 }
3396
3397 #ifdef FLOATX80
3398
3399 /*----------------------------------------------------------------------------
3400 | Returns the result of converting the extended double-precision floating-
3401 | point value `a' to the 32-bit two's complement integer format.  The
3402 | conversion is performed according to the IEC/IEEE Standard for Binary
3403 | Floating-Point Arithmetic---which means in particular that the conversion
3404 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3405 | largest positive integer is returned.  Otherwise, if the conversion
3406 | overflows, the largest integer with the same sign as `a' is returned.
3407 *----------------------------------------------------------------------------*/
3408
3409 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3410 {
3411     flag aSign;
3412     int32 aExp, shiftCount;
3413     bits64 aSig;
3414
3415     aSig = extractFloatx80Frac( a );
3416     aExp = extractFloatx80Exp( a );
3417     aSign = extractFloatx80Sign( a );
3418     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3419     shiftCount = 0x4037 - aExp;
3420     if ( shiftCount <= 0 ) shiftCount = 1;
3421     shift64RightJamming( aSig, shiftCount, &aSig );
3422     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3423
3424 }
3425
3426 /*----------------------------------------------------------------------------
3427 | Returns the result of converting the extended double-precision floating-
3428 | point value `a' to the 32-bit two's complement integer format.  The
3429 | conversion is performed according to the IEC/IEEE Standard for Binary
3430 | Floating-Point Arithmetic, except that the conversion is always rounded
3431 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3432 | Otherwise, if the conversion overflows, the largest integer with the same
3433 | sign as `a' is returned.
3434 *----------------------------------------------------------------------------*/
3435
3436 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3437 {
3438     flag aSign;
3439     int32 aExp, shiftCount;
3440     bits64 aSig, savedASig;
3441     int32 z;
3442
3443     aSig = extractFloatx80Frac( a );
3444     aExp = extractFloatx80Exp( a );
3445     aSign = extractFloatx80Sign( a );
3446     if ( 0x401E < aExp ) {
3447         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3448         goto invalid;
3449     }
3450     else if ( aExp < 0x3FFF ) {
3451         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3452         return 0;
3453     }
3454     shiftCount = 0x403E - aExp;
3455     savedASig = aSig;
3456     aSig >>= shiftCount;
3457     z = aSig;
3458     if ( aSign ) z = - z;
3459     if ( ( z < 0 ) ^ aSign ) {
3460  invalid:
3461         float_raise( float_flag_invalid STATUS_VAR);
3462         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3463     }
3464     if ( ( aSig<<shiftCount ) != savedASig ) {
3465         STATUS(float_exception_flags) |= float_flag_inexact;
3466     }
3467     return z;
3468
3469 }
3470
3471 /*----------------------------------------------------------------------------
3472 | Returns the result of converting the extended double-precision floating-
3473 | point value `a' to the 64-bit two's complement integer format.  The
3474 | conversion is performed according to the IEC/IEEE Standard for Binary
3475 | Floating-Point Arithmetic---which means in particular that the conversion
3476 | is rounded according to the current rounding mode.  If `a' is a NaN,
3477 | the largest positive integer is returned.  Otherwise, if the conversion
3478 | overflows, the largest integer with the same sign as `a' is returned.
3479 *----------------------------------------------------------------------------*/
3480
3481 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3482 {
3483     flag aSign;
3484     int32 aExp, shiftCount;
3485     bits64 aSig, aSigExtra;
3486
3487     aSig = extractFloatx80Frac( a );
3488     aExp = extractFloatx80Exp( a );
3489     aSign = extractFloatx80Sign( a );
3490     shiftCount = 0x403E - aExp;
3491     if ( shiftCount <= 0 ) {
3492         if ( shiftCount ) {
3493             float_raise( float_flag_invalid STATUS_VAR);
3494             if (    ! aSign
3495                  || (    ( aExp == 0x7FFF )
3496                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3497                ) {
3498                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3499             }
3500             return (sbits64) LIT64( 0x8000000000000000 );
3501         }
3502         aSigExtra = 0;
3503     }
3504     else {
3505         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3506     }
3507     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3508
3509 }
3510
3511 /*----------------------------------------------------------------------------
3512 | Returns the result of converting the extended double-precision floating-
3513 | point value `a' to the 64-bit two's complement integer format.  The
3514 | conversion is performed according to the IEC/IEEE Standard for Binary
3515 | Floating-Point Arithmetic, except that the conversion is always rounded
3516 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3517 | Otherwise, if the conversion overflows, the largest integer with the same
3518 | sign as `a' is returned.
3519 *----------------------------------------------------------------------------*/
3520
3521 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3522 {
3523     flag aSign;
3524     int32 aExp, shiftCount;
3525     bits64 aSig;
3526     int64 z;
3527
3528     aSig = extractFloatx80Frac( a );
3529     aExp = extractFloatx80Exp( a );
3530     aSign = extractFloatx80Sign( a );
3531     shiftCount = aExp - 0x403E;
3532     if ( 0 <= shiftCount ) {
3533         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3534         if ( ( a.high != 0xC03E ) || aSig ) {
3535             float_raise( float_flag_invalid STATUS_VAR);
3536             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3537                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3538             }
3539         }
3540         return (sbits64) LIT64( 0x8000000000000000 );
3541     }
3542     else if ( aExp < 0x3FFF ) {
3543         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3544         return 0;
3545     }
3546     z = aSig>>( - shiftCount );
3547     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3548         STATUS(float_exception_flags) |= float_flag_inexact;
3549     }
3550     if ( aSign ) z = - z;
3551     return z;
3552
3553 }
3554
3555 /*----------------------------------------------------------------------------
3556 | Returns the result of converting the extended double-precision floating-
3557 | point value `a' to the single-precision floating-point format.  The
3558 | conversion is performed according to the IEC/IEEE Standard for Binary
3559 | Floating-Point Arithmetic.
3560 *----------------------------------------------------------------------------*/
3561
3562 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3563 {
3564     flag aSign;
3565     int32 aExp;
3566     bits64 aSig;
3567
3568     aSig = extractFloatx80Frac( a );
3569     aExp = extractFloatx80Exp( a );
3570     aSign = extractFloatx80Sign( a );
3571     if ( aExp == 0x7FFF ) {
3572         if ( (bits64) ( aSig<<1 ) ) {
3573             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3574         }
3575         return packFloat32( aSign, 0xFF, 0 );
3576     }
3577     shift64RightJamming( aSig, 33, &aSig );
3578     if ( aExp || aSig ) aExp -= 0x3F81;
3579     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3580
3581 }
3582
3583 /*----------------------------------------------------------------------------
3584 | Returns the result of converting the extended double-precision floating-
3585 | point value `a' to the double-precision floating-point format.  The
3586 | conversion is performed according to the IEC/IEEE Standard for Binary
3587 | Floating-Point Arithmetic.
3588 *----------------------------------------------------------------------------*/
3589
3590 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3591 {
3592     flag aSign;
3593     int32 aExp;
3594     bits64 aSig, zSig;
3595
3596     aSig = extractFloatx80Frac( a );
3597     aExp = extractFloatx80Exp( a );
3598     aSign = extractFloatx80Sign( a );
3599     if ( aExp == 0x7FFF ) {
3600         if ( (bits64) ( aSig<<1 ) ) {
3601             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3602         }
3603         return packFloat64( aSign, 0x7FF, 0 );
3604     }
3605     shift64RightJamming( aSig, 1, &zSig );
3606     if ( aExp || aSig ) aExp -= 0x3C01;
3607     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3608
3609 }
3610
3611 #ifdef FLOAT128
3612
3613 /*----------------------------------------------------------------------------
3614 | Returns the result of converting the extended double-precision floating-
3615 | point value `a' to the quadruple-precision floating-point format.  The
3616 | conversion is performed according to the IEC/IEEE Standard for Binary
3617 | Floating-Point Arithmetic.
3618 *----------------------------------------------------------------------------*/
3619
3620 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3621 {
3622     flag aSign;
3623     int16 aExp;
3624     bits64 aSig, zSig0, zSig1;
3625
3626     aSig = extractFloatx80Frac( a );
3627     aExp = extractFloatx80Exp( a );
3628     aSign = extractFloatx80Sign( a );
3629     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3630         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3631     }
3632     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3633     return packFloat128( aSign, aExp, zSig0, zSig1 );
3634
3635 }
3636
3637 #endif
3638
3639 /*----------------------------------------------------------------------------
3640 | Rounds the extended double-precision floating-point value `a' to an integer,
3641 | and returns the result as an extended quadruple-precision floating-point
3642 | value.  The operation is performed according to the IEC/IEEE Standard for
3643 | Binary Floating-Point Arithmetic.
3644 *----------------------------------------------------------------------------*/
3645
3646 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3647 {
3648     flag aSign;
3649     int32 aExp;
3650     bits64 lastBitMask, roundBitsMask;
3651     int8 roundingMode;
3652     floatx80 z;
3653
3654     aExp = extractFloatx80Exp( a );
3655     if ( 0x403E <= aExp ) {
3656         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3657             return propagateFloatx80NaN( a, a STATUS_VAR );
3658         }
3659         return a;
3660     }
3661     if ( aExp < 0x3FFF ) {
3662         if (    ( aExp == 0 )
3663              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3664             return a;
3665         }
3666         STATUS(float_exception_flags) |= float_flag_inexact;
3667         aSign = extractFloatx80Sign( a );
3668         switch ( STATUS(float_rounding_mode) ) {
3669          case float_round_nearest_even:
3670             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3671                ) {
3672                 return
3673                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3674             }
3675             break;
3676          case float_round_down:
3677             return
3678                   aSign ?
3679                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3680                 : packFloatx80( 0, 0, 0 );
3681          case float_round_up:
3682             return
3683                   aSign ? packFloatx80( 1, 0, 0 )
3684                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3685         }
3686         return packFloatx80( aSign, 0, 0 );
3687     }
3688     lastBitMask = 1;
3689     lastBitMask <<= 0x403E - aExp;
3690     roundBitsMask = lastBitMask - 1;
3691     z = a;
3692     roundingMode = STATUS(float_rounding_mode);
3693     if ( roundingMode == float_round_nearest_even ) {
3694         z.low += lastBitMask>>1;
3695         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3696     }
3697     else if ( roundingMode != float_round_to_zero ) {
3698         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3699             z.low += roundBitsMask;
3700         }
3701     }
3702     z.low &= ~ roundBitsMask;
3703     if ( z.low == 0 ) {
3704         ++z.high;
3705         z.low = LIT64( 0x8000000000000000 );
3706     }
3707     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3708     return z;
3709
3710 }
3711
3712 /*----------------------------------------------------------------------------
3713 | Returns the result of adding the absolute values of the extended double-
3714 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3715 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3716 | The addition is performed according to the IEC/IEEE Standard for Binary
3717 | Floating-Point Arithmetic.
3718 *----------------------------------------------------------------------------*/
3719
3720 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3721 {
3722     int32 aExp, bExp, zExp;
3723     bits64 aSig, bSig, zSig0, zSig1;
3724     int32 expDiff;
3725
3726     aSig = extractFloatx80Frac( a );
3727     aExp = extractFloatx80Exp( a );
3728     bSig = extractFloatx80Frac( b );
3729     bExp = extractFloatx80Exp( b );
3730     expDiff = aExp - bExp;
3731     if ( 0 < expDiff ) {
3732         if ( aExp == 0x7FFF ) {
3733             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3734             return a;
3735         }
3736         if ( bExp == 0 ) --expDiff;
3737         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3738         zExp = aExp;
3739     }
3740     else if ( expDiff < 0 ) {
3741         if ( bExp == 0x7FFF ) {
3742             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3743             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3744         }
3745         if ( aExp == 0 ) ++expDiff;
3746         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3747         zExp = bExp;
3748     }
3749     else {
3750         if ( aExp == 0x7FFF ) {
3751             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3752                 return propagateFloatx80NaN( a, b STATUS_VAR );
3753             }
3754             return a;
3755         }
3756         zSig1 = 0;
3757         zSig0 = aSig + bSig;
3758         if ( aExp == 0 ) {
3759             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3760             goto roundAndPack;
3761         }
3762         zExp = aExp;
3763         goto shiftRight1;
3764     }
3765     zSig0 = aSig + bSig;
3766     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3767  shiftRight1:
3768     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3769     zSig0 |= LIT64( 0x8000000000000000 );
3770     ++zExp;
3771  roundAndPack:
3772     return
3773         roundAndPackFloatx80(
3774             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3775
3776 }
3777
3778 /*----------------------------------------------------------------------------
3779 | Returns the result of subtracting the absolute values of the extended
3780 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3781 | difference is negated before being returned.  `zSign' is ignored if the
3782 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3783 | Standard for Binary Floating-Point Arithmetic.
3784 *----------------------------------------------------------------------------*/
3785
3786 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3787 {
3788     int32 aExp, bExp, zExp;
3789     bits64 aSig, bSig, zSig0, zSig1;
3790     int32 expDiff;
3791     floatx80 z;
3792
3793     aSig = extractFloatx80Frac( a );
3794     aExp = extractFloatx80Exp( a );
3795     bSig = extractFloatx80Frac( b );
3796     bExp = extractFloatx80Exp( b );
3797     expDiff = aExp - bExp;
3798     if ( 0 < expDiff ) goto aExpBigger;
3799     if ( expDiff < 0 ) goto bExpBigger;
3800     if ( aExp == 0x7FFF ) {
3801         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3802             return propagateFloatx80NaN( a, b STATUS_VAR );
3803         }
3804         float_raise( float_flag_invalid STATUS_VAR);
3805         z.low = floatx80_default_nan_low;
3806         z.high = floatx80_default_nan_high;
3807         return z;
3808     }
3809     if ( aExp == 0 ) {
3810         aExp = 1;
3811         bExp = 1;
3812     }
3813     zSig1 = 0;
3814     if ( bSig < aSig ) goto aBigger;
3815     if ( aSig < bSig ) goto bBigger;
3816     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3817  bExpBigger:
3818     if ( bExp == 0x7FFF ) {
3819         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3820         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3821     }
3822     if ( aExp == 0 ) ++expDiff;
3823     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3824  bBigger:
3825     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3826     zExp = bExp;
3827     zSign ^= 1;
3828     goto normalizeRoundAndPack;
3829  aExpBigger:
3830     if ( aExp == 0x7FFF ) {
3831         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3832         return a;
3833     }
3834     if ( bExp == 0 ) --expDiff;
3835     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3836  aBigger:
3837     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3838     zExp = aExp;
3839  normalizeRoundAndPack:
3840     return
3841         normalizeRoundAndPackFloatx80(
3842             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3843
3844 }
3845
3846 /*----------------------------------------------------------------------------
3847 | Returns the result of adding the extended double-precision floating-point
3848 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3849 | Standard for Binary Floating-Point Arithmetic.
3850 *----------------------------------------------------------------------------*/
3851
3852 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3853 {
3854     flag aSign, bSign;
3855
3856     aSign = extractFloatx80Sign( a );
3857     bSign = extractFloatx80Sign( b );
3858     if ( aSign == bSign ) {
3859         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3860     }
3861     else {
3862         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3863     }
3864
3865 }
3866
3867 /*----------------------------------------------------------------------------
3868 | Returns the result of subtracting the extended double-precision floating-
3869 | point values `a' and `b'.  The operation is performed according to the
3870 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3871 *----------------------------------------------------------------------------*/
3872
3873 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3874 {
3875     flag aSign, bSign;
3876
3877     aSign = extractFloatx80Sign( a );
3878     bSign = extractFloatx80Sign( b );
3879     if ( aSign == bSign ) {
3880         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3881     }
3882     else {
3883         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3884     }
3885
3886 }
3887
3888 /*----------------------------------------------------------------------------
3889 | Returns the result of multiplying the extended double-precision floating-
3890 | point values `a' and `b'.  The operation is performed according to the
3891 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3892 *----------------------------------------------------------------------------*/
3893
3894 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3895 {
3896     flag aSign, bSign, zSign;
3897     int32 aExp, bExp, zExp;
3898     bits64 aSig, bSig, zSig0, zSig1;
3899     floatx80 z;
3900
3901     aSig = extractFloatx80Frac( a );
3902     aExp = extractFloatx80Exp( a );
3903     aSign = extractFloatx80Sign( a );
3904     bSig = extractFloatx80Frac( b );
3905     bExp = extractFloatx80Exp( b );
3906     bSign = extractFloatx80Sign( b );
3907     zSign = aSign ^ bSign;
3908     if ( aExp == 0x7FFF ) {
3909         if (    (bits64) ( aSig<<1 )
3910              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3911             return propagateFloatx80NaN( a, b STATUS_VAR );
3912         }
3913         if ( ( bExp | bSig ) == 0 ) goto invalid;
3914         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3915     }
3916     if ( bExp == 0x7FFF ) {
3917         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3918         if ( ( aExp | aSig ) == 0 ) {
3919  invalid:
3920             float_raise( float_flag_invalid STATUS_VAR);
3921             z.low = floatx80_default_nan_low;
3922             z.high = floatx80_default_nan_high;
3923             return z;
3924         }
3925         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3926     }
3927     if ( aExp == 0 ) {
3928         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3929         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3930     }
3931     if ( bExp == 0 ) {
3932         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3933         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3934     }
3935     zExp = aExp + bExp - 0x3FFE;
3936     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3937     if ( 0 < (sbits64) zSig0 ) {
3938         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3939         --zExp;
3940     }
3941     return
3942         roundAndPackFloatx80(
3943             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3944
3945 }
3946
3947 /*----------------------------------------------------------------------------
3948 | Returns the result of dividing the extended double-precision floating-point
3949 | value `a' by the corresponding value `b'.  The operation is performed
3950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3951 *----------------------------------------------------------------------------*/
3952
3953 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3954 {
3955     flag aSign, bSign, zSign;
3956     int32 aExp, bExp, zExp;
3957     bits64 aSig, bSig, zSig0, zSig1;
3958     bits64 rem0, rem1, rem2, term0, term1, term2;
3959     floatx80 z;
3960
3961     aSig = extractFloatx80Frac( a );
3962     aExp = extractFloatx80Exp( a );
3963     aSign = extractFloatx80Sign( a );
3964     bSig = extractFloatx80Frac( b );
3965     bExp = extractFloatx80Exp( b );
3966     bSign = extractFloatx80Sign( b );
3967     zSign = aSign ^ bSign;
3968     if ( aExp == 0x7FFF ) {
3969         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3970         if ( bExp == 0x7FFF ) {
3971             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3972             goto invalid;
3973         }
3974         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3975     }
3976     if ( bExp == 0x7FFF ) {
3977         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3978         return packFloatx80( zSign, 0, 0 );
3979     }
3980     if ( bExp == 0 ) {
3981         if ( bSig == 0 ) {
3982             if ( ( aExp | aSig ) == 0 ) {
3983  invalid:
3984                 float_raise( float_flag_invalid STATUS_VAR);
3985                 z.low = floatx80_default_nan_low;
3986                 z.high = floatx80_default_nan_high;
3987                 return z;
3988             }
3989             float_raise( float_flag_divbyzero STATUS_VAR);
3990             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3991         }
3992         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3993     }
3994     if ( aExp == 0 ) {
3995         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3996         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3997     }
3998     zExp = aExp - bExp + 0x3FFE;
3999     rem1 = 0;
4000     if ( bSig <= aSig ) {
4001         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4002         ++zExp;
4003     }
4004     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4005     mul64To128( bSig, zSig0, &term0, &term1 );
4006     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4007     while ( (sbits64) rem0 < 0 ) {
4008         --zSig0;
4009         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4010     }
4011     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4012     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4013         mul64To128( bSig, zSig1, &term1, &term2 );
4014         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4015         while ( (sbits64) rem1 < 0 ) {
4016             --zSig1;
4017             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4018         }
4019         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4020     }
4021     return
4022         roundAndPackFloatx80(
4023             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4024
4025 }
4026
4027 /*----------------------------------------------------------------------------
4028 | Returns the remainder of the extended double-precision floating-point value
4029 | `a' with respect to the corresponding value `b'.  The operation is performed
4030 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4031 *----------------------------------------------------------------------------*/
4032
4033 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4034 {
4035     flag aSign, bSign, zSign;
4036     int32 aExp, bExp, expDiff;
4037     bits64 aSig0, aSig1, bSig;
4038     bits64 q, term0, term1, alternateASig0, alternateASig1;
4039     floatx80 z;
4040
4041     aSig0 = extractFloatx80Frac( a );
4042     aExp = extractFloatx80Exp( a );
4043     aSign = extractFloatx80Sign( a );
4044     bSig = extractFloatx80Frac( b );
4045     bExp = extractFloatx80Exp( b );
4046     bSign = extractFloatx80Sign( b );
4047     if ( aExp == 0x7FFF ) {
4048         if (    (bits64) ( aSig0<<1 )
4049              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4050             return propagateFloatx80NaN( a, b STATUS_VAR );
4051         }
4052         goto invalid;
4053     }
4054     if ( bExp == 0x7FFF ) {
4055         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4056         return a;
4057     }
4058     if ( bExp == 0 ) {
4059         if ( bSig == 0 ) {
4060  invalid:
4061             float_raise( float_flag_invalid STATUS_VAR);
4062             z.low = floatx80_default_nan_low;
4063             z.high = floatx80_default_nan_high;
4064             return z;
4065         }
4066         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4067     }
4068     if ( aExp == 0 ) {
4069         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4070         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4071     }
4072     bSig |= LIT64( 0x8000000000000000 );
4073     zSign = aSign;
4074     expDiff = aExp - bExp;
4075     aSig1 = 0;
4076     if ( expDiff < 0 ) {
4077         if ( expDiff < -1 ) return a;
4078         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4079         expDiff = 0;
4080     }
4081     q = ( bSig <= aSig0 );
4082     if ( q ) aSig0 -= bSig;
4083     expDiff -= 64;
4084     while ( 0 < expDiff ) {
4085         q = estimateDiv128To64( aSig0, aSig1, bSig );
4086         q = ( 2 < q ) ? q - 2 : 0;
4087         mul64To128( bSig, q, &term0, &term1 );
4088         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4089         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4090         expDiff -= 62;
4091     }
4092     expDiff += 64;
4093     if ( 0 < expDiff ) {
4094         q = estimateDiv128To64( aSig0, aSig1, bSig );
4095         q = ( 2 < q ) ? q - 2 : 0;
4096         q >>= 64 - expDiff;
4097         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4098         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4099         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4100         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4101             ++q;
4102             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4103         }
4104     }
4105     else {
4106         term1 = 0;
4107         term0 = bSig;
4108     }
4109     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4110     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4111          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4112               && ( q & 1 ) )
4113        ) {
4114         aSig0 = alternateASig0;
4115         aSig1 = alternateASig1;
4116         zSign = ! zSign;
4117     }
4118     return
4119         normalizeRoundAndPackFloatx80(
4120             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4121
4122 }
4123
4124 /*----------------------------------------------------------------------------
4125 | Returns the square root of the extended double-precision floating-point
4126 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4127 | for Binary Floating-Point Arithmetic.
4128 *----------------------------------------------------------------------------*/
4129
4130 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4131 {
4132     flag aSign;
4133     int32 aExp, zExp;
4134     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4135     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4136     floatx80 z;
4137
4138     aSig0 = extractFloatx80Frac( a );
4139     aExp = extractFloatx80Exp( a );
4140     aSign = extractFloatx80Sign( a );
4141     if ( aExp == 0x7FFF ) {
4142         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4143         if ( ! aSign ) return a;
4144         goto invalid;
4145     }
4146     if ( aSign ) {
4147         if ( ( aExp | aSig0 ) == 0 ) return a;
4148  invalid:
4149         float_raise( float_flag_invalid STATUS_VAR);
4150         z.low = floatx80_default_nan_low;
4151         z.high = floatx80_default_nan_high;
4152         return z;
4153     }
4154     if ( aExp == 0 ) {
4155         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4156         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4157     }
4158     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4159     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4160     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4161     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4162     doubleZSig0 = zSig0<<1;
4163     mul64To128( zSig0, zSig0, &term0, &term1 );
4164     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4165     while ( (sbits64) rem0 < 0 ) {
4166         --zSig0;
4167         doubleZSig0 -= 2;
4168         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4169     }
4170     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4171     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4172         if ( zSig1 == 0 ) zSig1 = 1;
4173         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4174         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4175         mul64To128( zSig1, zSig1, &term2, &term3 );
4176         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4177         while ( (sbits64) rem1 < 0 ) {
4178             --zSig1;
4179             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4180             term3 |= 1;
4181             term2 |= doubleZSig0;
4182             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4183         }
4184         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4185     }
4186     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4187     zSig0 |= doubleZSig0;
4188     return
4189         roundAndPackFloatx80(
4190             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4191
4192 }
4193
4194 /*----------------------------------------------------------------------------
4195 | Returns 1 if the extended double-precision floating-point value `a' is
4196 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
4197 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4198 | Arithmetic.
4199 *----------------------------------------------------------------------------*/
4200
4201 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4202 {
4203
4204     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4205               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4206          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4207               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4208        ) {
4209         if (    floatx80_is_signaling_nan( a )
4210              || floatx80_is_signaling_nan( b ) ) {
4211             float_raise( float_flag_invalid STATUS_VAR);
4212         }
4213         return 0;
4214     }
4215     return
4216            ( a.low == b.low )
4217         && (    ( a.high == b.high )
4218              || (    ( a.low == 0 )
4219                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4220            );
4221
4222 }
4223
4224 /*----------------------------------------------------------------------------
4225 | Returns 1 if the extended double-precision floating-point value `a' is
4226 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4227 | comparison is performed according to the IEC/IEEE Standard for Binary
4228 | Floating-Point Arithmetic.
4229 *----------------------------------------------------------------------------*/
4230
4231 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4232 {
4233     flag aSign, bSign;
4234
4235     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4236               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4237          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4238               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4239        ) {
4240         float_raise( float_flag_invalid STATUS_VAR);
4241         return 0;
4242     }
4243     aSign = extractFloatx80Sign( a );
4244     bSign = extractFloatx80Sign( b );
4245     if ( aSign != bSign ) {
4246         return
4247                aSign
4248             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4249                  == 0 );
4250     }
4251     return
4252           aSign ? le128( b.high, b.low, a.high, a.low )
4253         : le128( a.high, a.low, b.high, b.low );
4254
4255 }
4256
4257 /*----------------------------------------------------------------------------
4258 | Returns 1 if the extended double-precision floating-point value `a' is
4259 | less than the corresponding value `b', and 0 otherwise.  The comparison
4260 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4261 | Arithmetic.
4262 *----------------------------------------------------------------------------*/
4263
4264 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4265 {
4266     flag aSign, bSign;
4267
4268     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4269               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4270          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4271               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4272        ) {
4273         float_raise( float_flag_invalid STATUS_VAR);
4274         return 0;
4275     }
4276     aSign = extractFloatx80Sign( a );
4277     bSign = extractFloatx80Sign( b );
4278     if ( aSign != bSign ) {
4279         return
4280                aSign
4281             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4282                  != 0 );
4283     }
4284     return
4285           aSign ? lt128( b.high, b.low, a.high, a.low )
4286         : lt128( a.high, a.low, b.high, b.low );
4287
4288 }
4289
4290 /*----------------------------------------------------------------------------
4291 | Returns 1 if the extended double-precision floating-point value `a' is equal
4292 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4293 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4294 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4295 *----------------------------------------------------------------------------*/
4296
4297 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4298 {
4299
4300     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4301               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4302          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4303               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4304        ) {
4305         float_raise( float_flag_invalid STATUS_VAR);
4306         return 0;
4307     }
4308     return
4309            ( a.low == b.low )
4310         && (    ( a.high == b.high )
4311              || (    ( a.low == 0 )
4312                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4313            );
4314
4315 }
4316
4317 /*----------------------------------------------------------------------------
4318 | Returns 1 if the extended double-precision floating-point value `a' is less
4319 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4320 | do not cause an exception.  Otherwise, the comparison is performed according
4321 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4322 *----------------------------------------------------------------------------*/
4323
4324 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4325 {
4326     flag aSign, bSign;
4327
4328     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4329               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4330          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4331               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4332        ) {
4333         if (    floatx80_is_signaling_nan( a )
4334              || floatx80_is_signaling_nan( b ) ) {
4335             float_raise( float_flag_invalid STATUS_VAR);
4336         }
4337         return 0;
4338     }
4339     aSign = extractFloatx80Sign( a );
4340     bSign = extractFloatx80Sign( b );
4341     if ( aSign != bSign ) {
4342         return
4343                aSign
4344             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4345                  == 0 );
4346     }
4347     return
4348           aSign ? le128( b.high, b.low, a.high, a.low )
4349         : le128( a.high, a.low, b.high, b.low );
4350
4351 }
4352
4353 /*----------------------------------------------------------------------------
4354 | Returns 1 if the extended double-precision floating-point value `a' is less
4355 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4356 | an exception.  Otherwise, the comparison is performed according to the
4357 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4358 *----------------------------------------------------------------------------*/
4359
4360 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4361 {
4362     flag aSign, bSign;
4363
4364     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4365               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4366          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4367               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4368        ) {
4369         if (    floatx80_is_signaling_nan( a )
4370              || floatx80_is_signaling_nan( b ) ) {
4371             float_raise( float_flag_invalid STATUS_VAR);
4372         }
4373         return 0;
4374     }
4375     aSign = extractFloatx80Sign( a );
4376     bSign = extractFloatx80Sign( b );
4377     if ( aSign != bSign ) {
4378         return
4379                aSign
4380             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4381                  != 0 );
4382     }
4383     return
4384           aSign ? lt128( b.high, b.low, a.high, a.low )
4385         : lt128( a.high, a.low, b.high, b.low );
4386
4387 }
4388
4389 #endif
4390
4391 #ifdef FLOAT128
4392
4393 /*----------------------------------------------------------------------------
4394 | Returns the result of converting the quadruple-precision floating-point
4395 | value `a' to the 32-bit two's complement integer format.  The conversion
4396 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4397 | Arithmetic---which means in particular that the conversion is rounded
4398 | according to the current rounding mode.  If `a' is a NaN, the largest
4399 | positive integer is returned.  Otherwise, if the conversion overflows, the
4400 | largest integer with the same sign as `a' is returned.
4401 *----------------------------------------------------------------------------*/
4402
4403 int32 float128_to_int32( float128 a STATUS_PARAM )
4404 {
4405     flag aSign;
4406     int32 aExp, shiftCount;
4407     bits64 aSig0, aSig1;
4408
4409     aSig1 = extractFloat128Frac1( a );
4410     aSig0 = extractFloat128Frac0( a );
4411     aExp = extractFloat128Exp( a );
4412     aSign = extractFloat128Sign( a );
4413     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4414     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4415     aSig0 |= ( aSig1 != 0 );
4416     shiftCount = 0x4028 - aExp;
4417     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4418     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4419
4420 }
4421
4422 /*----------------------------------------------------------------------------
4423 | Returns the result of converting the quadruple-precision floating-point
4424 | value `a' to the 32-bit two's complement integer format.  The conversion
4425 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4426 | Arithmetic, except that the conversion is always rounded toward zero.  If
4427 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4428 | conversion overflows, the largest integer with the same sign as `a' is
4429 | returned.
4430 *----------------------------------------------------------------------------*/
4431
4432 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4433 {
4434     flag aSign;
4435     int32 aExp, shiftCount;
4436     bits64 aSig0, aSig1, savedASig;
4437     int32 z;
4438
4439     aSig1 = extractFloat128Frac1( a );
4440     aSig0 = extractFloat128Frac0( a );
4441     aExp = extractFloat128Exp( a );
4442     aSign = extractFloat128Sign( a );
4443     aSig0 |= ( aSig1 != 0 );
4444     if ( 0x401E < aExp ) {
4445         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4446         goto invalid;
4447     }
4448     else if ( aExp < 0x3FFF ) {
4449         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4450         return 0;
4451     }
4452     aSig0 |= LIT64( 0x0001000000000000 );
4453     shiftCount = 0x402F - aExp;
4454     savedASig = aSig0;
4455     aSig0 >>= shiftCount;
4456     z = aSig0;
4457     if ( aSign ) z = - z;
4458     if ( ( z < 0 ) ^ aSign ) {
4459  invalid:
4460         float_raise( float_flag_invalid STATUS_VAR);
4461         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4462     }
4463     if ( ( aSig0<<shiftCount ) != savedASig ) {
4464         STATUS(float_exception_flags) |= float_flag_inexact;
4465     }
4466     return z;
4467
4468 }
4469
4470 /*----------------------------------------------------------------------------
4471 | Returns the result of converting the quadruple-precision floating-point
4472 | value `a' to the 64-bit two's complement integer format.  The conversion
4473 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4474 | Arithmetic---which means in particular that the conversion is rounded
4475 | according to the current rounding mode.  If `a' is a NaN, the largest
4476 | positive integer is returned.  Otherwise, if the conversion overflows, the
4477 | largest integer with the same sign as `a' is returned.
4478 *----------------------------------------------------------------------------*/
4479
4480 int64 float128_to_int64( float128 a STATUS_PARAM )
4481 {
4482     flag aSign;
4483     int32 aExp, shiftCount;
4484     bits64 aSig0, aSig1;
4485
4486     aSig1 = extractFloat128Frac1( a );
4487     aSig0 = extractFloat128Frac0( a );
4488     aExp = extractFloat128Exp( a );
4489     aSign = extractFloat128Sign( a );
4490     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4491     shiftCount = 0x402F - aExp;
4492     if ( shiftCount <= 0 ) {
4493         if ( 0x403E < aExp ) {
4494             float_raise( float_flag_invalid STATUS_VAR);
4495             if (    ! aSign
4496                  || (    ( aExp == 0x7FFF )
4497                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4498                     )
4499                ) {
4500                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4501             }
4502             return (sbits64) LIT64( 0x8000000000000000 );
4503         }
4504         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4505     }
4506     else {
4507         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4508     }
4509     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4510
4511 }
4512
4513 /*----------------------------------------------------------------------------
4514 | Returns the result of converting the quadruple-precision floating-point
4515 | value `a' to the 64-bit two's complement integer format.  The conversion
4516 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4517 | Arithmetic, except that the conversion is always rounded toward zero.
4518 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4519 | the conversion overflows, the largest integer with the same sign as `a' is
4520 | returned.
4521 *----------------------------------------------------------------------------*/
4522
4523 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4524 {
4525     flag aSign;
4526     int32 aExp, shiftCount;
4527     bits64 aSig0, aSig1;
4528     int64 z;
4529
4530     aSig1 = extractFloat128Frac1( a );
4531     aSig0 = extractFloat128Frac0( a );
4532     aExp = extractFloat128Exp( a );
4533     aSign = extractFloat128Sign( a );
4534     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4535     shiftCount = aExp - 0x402F;
4536     if ( 0 < shiftCount ) {
4537         if ( 0x403E <= aExp ) {
4538             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4539             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4540                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4541                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4542             }
4543             else {
4544                 float_raise( float_flag_invalid STATUS_VAR);
4545                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4546                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4547                 }
4548             }
4549             return (sbits64) LIT64( 0x8000000000000000 );
4550         }
4551         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4552         if ( (bits64) ( aSig1<<shiftCount ) ) {
4553             STATUS(float_exception_flags) |= float_flag_inexact;
4554         }
4555     }
4556     else {
4557         if ( aExp < 0x3FFF ) {
4558             if ( aExp | aSig0 | aSig1 ) {
4559                 STATUS(float_exception_flags) |= float_flag_inexact;
4560             }
4561             return 0;
4562         }
4563         z = aSig0>>( - shiftCount );
4564         if (    aSig1
4565              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4566             STATUS(float_exception_flags) |= float_flag_inexact;
4567         }
4568     }
4569     if ( aSign ) z = - z;
4570     return z;
4571
4572 }
4573
4574 /*----------------------------------------------------------------------------
4575 | Returns the result of converting the quadruple-precision floating-point
4576 | value `a' to the single-precision floating-point format.  The conversion
4577 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4578 | Arithmetic.
4579 *----------------------------------------------------------------------------*/
4580
4581 float32 float128_to_float32( float128 a STATUS_PARAM )
4582 {
4583     flag aSign;
4584     int32 aExp;
4585     bits64 aSig0, aSig1;
4586     bits32 zSig;
4587
4588     aSig1 = extractFloat128Frac1( a );
4589     aSig0 = extractFloat128Frac0( a );
4590     aExp = extractFloat128Exp( a );
4591     aSign = extractFloat128Sign( a );
4592     if ( aExp == 0x7FFF ) {
4593         if ( aSig0 | aSig1 ) {
4594             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4595         }
4596         return packFloat32( aSign, 0xFF, 0 );
4597     }
4598     aSig0 |= ( aSig1 != 0 );
4599     shift64RightJamming( aSig0, 18, &aSig0 );
4600     zSig = aSig0;
4601     if ( aExp || zSig ) {
4602         zSig |= 0x40000000;
4603         aExp -= 0x3F81;
4604     }
4605     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4606
4607 }
4608
4609 /*----------------------------------------------------------------------------
4610 | Returns the result of converting the quadruple-precision floating-point
4611 | value `a' to the double-precision floating-point format.  The conversion
4612 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4613 | Arithmetic.
4614 *----------------------------------------------------------------------------*/
4615
4616 float64 float128_to_float64( float128 a STATUS_PARAM )
4617 {
4618     flag aSign;
4619     int32 aExp;
4620     bits64 aSig0, aSig1;
4621
4622     aSig1 = extractFloat128Frac1( a );
4623     aSig0 = extractFloat128Frac0( a );
4624     aExp = extractFloat128Exp( a );
4625     aSign = extractFloat128Sign( a );
4626     if ( aExp == 0x7FFF ) {
4627         if ( aSig0 | aSig1 ) {
4628             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4629         }
4630         return packFloat64( aSign, 0x7FF, 0 );
4631     }
4632     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4633     aSig0 |= ( aSig1 != 0 );
4634     if ( aExp || aSig0 ) {
4635         aSig0 |= LIT64( 0x4000000000000000 );
4636         aExp -= 0x3C01;
4637     }
4638     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4639
4640 }
4641
4642 #ifdef FLOATX80
4643
4644 /*----------------------------------------------------------------------------
4645 | Returns the result of converting the quadruple-precision floating-point
4646 | value `a' to the extended double-precision floating-point format.  The
4647 | conversion is performed according to the IEC/IEEE Standard for Binary
4648 | Floating-Point Arithmetic.
4649 *----------------------------------------------------------------------------*/
4650
4651 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4652 {
4653     flag aSign;
4654     int32 aExp;
4655     bits64 aSig0, aSig1;
4656
4657     aSig1 = extractFloat128Frac1( a );
4658     aSig0 = extractFloat128Frac0( a );
4659     aExp = extractFloat128Exp( a );
4660     aSign = extractFloat128Sign( a );
4661     if ( aExp == 0x7FFF ) {
4662         if ( aSig0 | aSig1 ) {
4663             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4664         }
4665         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4666     }
4667     if ( aExp == 0 ) {
4668         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4669         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4670     }
4671     else {
4672         aSig0 |= LIT64( 0x0001000000000000 );
4673     }
4674     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4675     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4676
4677 }
4678
4679 #endif
4680
4681 /*----------------------------------------------------------------------------
4682 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4683 | returns the result as a quadruple-precision floating-point value.  The
4684 | operation is performed according to the IEC/IEEE Standard for Binary
4685 | Floating-Point Arithmetic.
4686 *----------------------------------------------------------------------------*/
4687
4688 float128 float128_round_to_int( float128 a STATUS_PARAM )
4689 {
4690     flag aSign;
4691     int32 aExp;
4692     bits64 lastBitMask, roundBitsMask;
4693     int8 roundingMode;
4694     float128 z;
4695
4696     aExp = extractFloat128Exp( a );
4697     if ( 0x402F <= aExp ) {
4698         if ( 0x406F <= aExp ) {
4699             if (    ( aExp == 0x7FFF )
4700                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4701                ) {
4702                 return propagateFloat128NaN( a, a STATUS_VAR );
4703             }
4704             return a;
4705         }
4706         lastBitMask = 1;
4707         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4708         roundBitsMask = lastBitMask - 1;
4709         z = a;
4710         roundingMode = STATUS(float_rounding_mode);
4711         if ( roundingMode == float_round_nearest_even ) {
4712             if ( lastBitMask ) {
4713                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4714                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4715             }
4716             else {
4717                 if ( (sbits64) z.low < 0 ) {
4718                     ++z.high;
4719                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4720                 }
4721             }
4722         }
4723         else if ( roundingMode != float_round_to_zero ) {
4724             if (   extractFloat128Sign( z )
4725                  ^ ( roundingMode == float_round_up ) ) {
4726                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4727             }
4728         }
4729         z.low &= ~ roundBitsMask;
4730     }
4731     else {
4732         if ( aExp < 0x3FFF ) {
4733             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4734             STATUS(float_exception_flags) |= float_flag_inexact;
4735             aSign = extractFloat128Sign( a );
4736             switch ( STATUS(float_rounding_mode) ) {
4737              case float_round_nearest_even:
4738                 if (    ( aExp == 0x3FFE )
4739                      && (   extractFloat128Frac0( a )
4740                           | extractFloat128Frac1( a ) )
4741                    ) {
4742                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4743                 }
4744                 break;
4745              case float_round_down:
4746                 return
4747                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4748                     : packFloat128( 0, 0, 0, 0 );
4749              case float_round_up:
4750                 return
4751                       aSign ? packFloat128( 1, 0, 0, 0 )
4752                     : packFloat128( 0, 0x3FFF, 0, 0 );
4753             }
4754             return packFloat128( aSign, 0, 0, 0 );
4755         }
4756         lastBitMask = 1;
4757         lastBitMask <<= 0x402F - aExp;
4758         roundBitsMask = lastBitMask - 1;
4759         z.low = 0;
4760         z.high = a.high;
4761         roundingMode = STATUS(float_rounding_mode);
4762         if ( roundingMode == float_round_nearest_even ) {
4763             z.high += lastBitMask>>1;
4764             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4765                 z.high &= ~ lastBitMask;
4766             }
4767         }
4768         else if ( roundingMode != float_round_to_zero ) {
4769             if (   extractFloat128Sign( z )
4770                  ^ ( roundingMode == float_round_up ) ) {
4771                 z.high |= ( a.low != 0 );
4772                 z.high += roundBitsMask;
4773             }
4774         }
4775         z.high &= ~ roundBitsMask;
4776     }
4777     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4778         STATUS(float_exception_flags) |= float_flag_inexact;
4779     }
4780     return z;
4781
4782 }
4783
4784 /*----------------------------------------------------------------------------
4785 | Returns the result of adding the absolute values of the quadruple-precision
4786 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4787 | before being returned.  `zSign' is ignored if the result is a NaN.
4788 | The addition is performed according to the IEC/IEEE Standard for Binary
4789 | Floating-Point Arithmetic.
4790 *----------------------------------------------------------------------------*/
4791
4792 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4793 {
4794     int32 aExp, bExp, zExp;
4795     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4796     int32 expDiff;
4797
4798     aSig1 = extractFloat128Frac1( a );
4799     aSig0 = extractFloat128Frac0( a );
4800     aExp = extractFloat128Exp( a );
4801     bSig1 = extractFloat128Frac1( b );
4802     bSig0 = extractFloat128Frac0( b );
4803     bExp = extractFloat128Exp( b );
4804     expDiff = aExp - bExp;
4805     if ( 0 < expDiff ) {
4806         if ( aExp == 0x7FFF ) {
4807             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4808             return a;
4809         }
4810         if ( bExp == 0 ) {
4811             --expDiff;
4812         }
4813         else {
4814             bSig0 |= LIT64( 0x0001000000000000 );
4815         }
4816         shift128ExtraRightJamming(
4817             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4818         zExp = aExp;
4819     }
4820     else if ( expDiff < 0 ) {
4821         if ( bExp == 0x7FFF ) {
4822             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4823             return packFloat128( zSign, 0x7FFF, 0, 0 );
4824         }
4825         if ( aExp == 0 ) {
4826             ++expDiff;
4827         }
4828         else {
4829             aSig0 |= LIT64( 0x0001000000000000 );
4830         }
4831         shift128ExtraRightJamming(
4832             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4833         zExp = bExp;
4834     }
4835     else {
4836         if ( aExp == 0x7FFF ) {
4837             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4838                 return propagateFloat128NaN( a, b STATUS_VAR );
4839             }
4840             return a;
4841         }
4842         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4843         if ( aExp == 0 ) {
4844             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
4845             return packFloat128( zSign, 0, zSig0, zSig1 );
4846         }
4847         zSig2 = 0;
4848         zSig0 |= LIT64( 0x0002000000000000 );
4849         zExp = aExp;
4850         goto shiftRight1;
4851     }
4852     aSig0 |= LIT64( 0x0001000000000000 );
4853     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4854     --zExp;
4855     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4856     ++zExp;
4857  shiftRight1:
4858     shift128ExtraRightJamming(
4859         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4860  roundAndPack:
4861     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4862
4863 }
4864
4865 /*----------------------------------------------------------------------------
4866 | Returns the result of subtracting the absolute values of the quadruple-
4867 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
4868 | difference is negated before being returned.  `zSign' is ignored if the
4869 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4870 | Standard for Binary Floating-Point Arithmetic.
4871 *----------------------------------------------------------------------------*/
4872
4873 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4874 {
4875     int32 aExp, bExp, zExp;
4876     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4877     int32 expDiff;
4878     float128 z;
4879
4880     aSig1 = extractFloat128Frac1( a );
4881     aSig0 = extractFloat128Frac0( a );
4882     aExp = extractFloat128Exp( a );
4883     bSig1 = extractFloat128Frac1( b );
4884     bSig0 = extractFloat128Frac0( b );
4885     bExp = extractFloat128Exp( b );
4886     expDiff = aExp - bExp;
4887     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4888     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4889     if ( 0 < expDiff ) goto aExpBigger;
4890     if ( expDiff < 0 ) goto bExpBigger;
4891     if ( aExp == 0x7FFF ) {
4892         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4893             return propagateFloat128NaN( a, b STATUS_VAR );
4894         }
4895         float_raise( float_flag_invalid STATUS_VAR);
4896         z.low = float128_default_nan_low;
4897         z.high = float128_default_nan_high;
4898         return z;
4899     }
4900     if ( aExp == 0 ) {
4901         aExp = 1;
4902         bExp = 1;
4903     }
4904     if ( bSig0 < aSig0 ) goto aBigger;
4905     if ( aSig0 < bSig0 ) goto bBigger;
4906     if ( bSig1 < aSig1 ) goto aBigger;
4907     if ( aSig1 < bSig1 ) goto bBigger;
4908     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4909  bExpBigger:
4910     if ( bExp == 0x7FFF ) {
4911         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4912         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4913     }
4914     if ( aExp == 0 ) {
4915         ++expDiff;
4916     }
4917     else {
4918         aSig0 |= LIT64( 0x4000000000000000 );
4919     }
4920     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4921     bSig0 |= LIT64( 0x4000000000000000 );
4922  bBigger:
4923     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4924     zExp = bExp;
4925     zSign ^= 1;
4926     goto normalizeRoundAndPack;
4927  aExpBigger:
4928     if ( aExp == 0x7FFF ) {
4929         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4930         return a;
4931     }
4932     if ( bExp == 0 ) {
4933         --expDiff;
4934     }
4935     else {
4936         bSig0 |= LIT64( 0x4000000000000000 );
4937     }
4938     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4939     aSig0 |= LIT64( 0x4000000000000000 );
4940  aBigger:
4941     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4942     zExp = aExp;
4943  normalizeRoundAndPack:
4944     --zExp;
4945     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4946
4947 }
4948
4949 /*----------------------------------------------------------------------------
4950 | Returns the result of adding the quadruple-precision floating-point values
4951 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4952 | for Binary Floating-Point Arithmetic.
4953 *----------------------------------------------------------------------------*/
4954
4955 float128 float128_add( float128 a, float128 b STATUS_PARAM )
4956 {
4957     flag aSign, bSign;
4958
4959     aSign = extractFloat128Sign( a );
4960     bSign = extractFloat128Sign( b );
4961     if ( aSign == bSign ) {
4962         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4963     }
4964     else {
4965         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4966     }
4967
4968 }
4969
4970 /*----------------------------------------------------------------------------
4971 | Returns the result of subtracting the quadruple-precision floating-point
4972 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4973 | Standard for Binary Floating-Point Arithmetic.
4974 *----------------------------------------------------------------------------*/
4975
4976 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4977 {
4978     flag aSign, bSign;
4979
4980     aSign = extractFloat128Sign( a );
4981     bSign = extractFloat128Sign( b );
4982     if ( aSign == bSign ) {
4983         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4984     }
4985     else {
4986         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4987     }
4988
4989 }
4990
4991 /*----------------------------------------------------------------------------
4992 | Returns the result of multiplying the quadruple-precision floating-point
4993 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4994 | Standard for Binary Floating-Point Arithmetic.
4995 *----------------------------------------------------------------------------*/
4996
4997 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4998 {
4999     flag aSign, bSign, zSign;
5000     int32 aExp, bExp, zExp;
5001     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5002     float128 z;
5003
5004     aSig1 = extractFloat128Frac1( a );
5005     aSig0 = extractFloat128Frac0( a );
5006     aExp = extractFloat128Exp( a );
5007     aSign = extractFloat128Sign( a );
5008     bSig1 = extractFloat128Frac1( b );
5009     bSig0 = extractFloat128Frac0( b );
5010     bExp = extractFloat128Exp( b );
5011     bSign = extractFloat128Sign( b );
5012     zSign = aSign ^ bSign;
5013     if ( aExp == 0x7FFF ) {
5014         if (    ( aSig0 | aSig1 )
5015              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5016             return propagateFloat128NaN( a, b STATUS_VAR );
5017         }
5018         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5019         return packFloat128( zSign, 0x7FFF, 0, 0 );
5020     }
5021     if ( bExp == 0x7FFF ) {
5022         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5023         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5024  invalid:
5025             float_raise( float_flag_invalid STATUS_VAR);
5026             z.low = float128_default_nan_low;
5027             z.high = float128_default_nan_high;
5028             return z;
5029         }
5030         return packFloat128( zSign, 0x7FFF, 0, 0 );
5031     }
5032     if ( aExp == 0 ) {
5033         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5034         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5035     }
5036     if ( bExp == 0 ) {
5037         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5038         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5039     }
5040     zExp = aExp + bExp - 0x4000;
5041     aSig0 |= LIT64( 0x0001000000000000 );
5042     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5043     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5044     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5045     zSig2 |= ( zSig3 != 0 );
5046     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5047         shift128ExtraRightJamming(
5048             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5049         ++zExp;
5050     }
5051     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5052
5053 }
5054
5055 /*----------------------------------------------------------------------------
5056 | Returns the result of dividing the quadruple-precision floating-point value
5057 | `a' by the corresponding value `b'.  The operation is performed according to
5058 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5059 *----------------------------------------------------------------------------*/
5060
5061 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5062 {
5063     flag aSign, bSign, zSign;
5064     int32 aExp, bExp, zExp;
5065     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5066     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5067     float128 z;
5068
5069     aSig1 = extractFloat128Frac1( a );
5070     aSig0 = extractFloat128Frac0( a );
5071     aExp = extractFloat128Exp( a );
5072     aSign = extractFloat128Sign( a );
5073     bSig1 = extractFloat128Frac1( b );
5074     bSig0 = extractFloat128Frac0( b );
5075     bExp = extractFloat128Exp( b );
5076     bSign = extractFloat128Sign( b );
5077     zSign = aSign ^ bSign;
5078     if ( aExp == 0x7FFF ) {
5079         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5080         if ( bExp == 0x7FFF ) {
5081             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5082             goto invalid;
5083         }
5084         return packFloat128( zSign, 0x7FFF, 0, 0 );
5085     }
5086     if ( bExp == 0x7FFF ) {
5087         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5088         return packFloat128( zSign, 0, 0, 0 );
5089     }
5090     if ( bExp == 0 ) {
5091         if ( ( bSig0 | bSig1 ) == 0 ) {
5092             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5093  invalid:
5094                 float_raise( float_flag_invalid STATUS_VAR);
5095                 z.low = float128_default_nan_low;
5096                 z.high = float128_default_nan_high;
5097                 return z;
5098             }
5099             float_raise( float_flag_divbyzero STATUS_VAR);
5100             return packFloat128( zSign, 0x7FFF, 0, 0 );
5101         }
5102         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5103     }
5104     if ( aExp == 0 ) {
5105         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5106         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5107     }
5108     zExp = aExp - bExp + 0x3FFD;
5109     shortShift128Left(
5110         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5111     shortShift128Left(
5112         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5113     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5114         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5115         ++zExp;
5116     }
5117     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5118     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5119     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5120     while ( (sbits64) rem0 < 0 ) {
5121         --zSig0;
5122         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5123     }
5124     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5125     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5126         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5127         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5128         while ( (sbits64) rem1 < 0 ) {
5129             --zSig1;
5130             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5131         }
5132         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5133     }
5134     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5135     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5136
5137 }
5138
5139 /*----------------------------------------------------------------------------
5140 | Returns the remainder of the quadruple-precision floating-point value `a'
5141 | with respect to the corresponding value `b'.  The operation is performed
5142 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5143 *----------------------------------------------------------------------------*/
5144
5145 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5146 {
5147     flag aSign, bSign, zSign;
5148     int32 aExp, bExp, expDiff;
5149     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5150     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5151     sbits64 sigMean0;
5152     float128 z;
5153
5154     aSig1 = extractFloat128Frac1( a );
5155     aSig0 = extractFloat128Frac0( a );
5156     aExp = extractFloat128Exp( a );
5157     aSign = extractFloat128Sign( a );
5158     bSig1 = extractFloat128Frac1( b );
5159     bSig0 = extractFloat128Frac0( b );
5160     bExp = extractFloat128Exp( b );
5161     bSign = extractFloat128Sign( b );
5162     if ( aExp == 0x7FFF ) {
5163         if (    ( aSig0 | aSig1 )
5164              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5165             return propagateFloat128NaN( a, b STATUS_VAR );
5166         }
5167         goto invalid;
5168     }
5169     if ( bExp == 0x7FFF ) {
5170         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5171         return a;
5172     }
5173     if ( bExp == 0 ) {
5174         if ( ( bSig0 | bSig1 ) == 0 ) {
5175  invalid:
5176             float_raise( float_flag_invalid STATUS_VAR);
5177             z.low = float128_default_nan_low;
5178             z.high = float128_default_nan_high;
5179             return z;
5180         }
5181         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5182     }
5183     if ( aExp == 0 ) {
5184         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5185         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5186     }
5187     expDiff = aExp - bExp;
5188     if ( expDiff < -1 ) return a;
5189     shortShift128Left(
5190         aSig0 | LIT64( 0x0001000000000000 ),
5191         aSig1,
5192         15 - ( expDiff < 0 ),
5193         &aSig0,
5194         &aSig1
5195     );
5196     shortShift128Left(
5197         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5198     q = le128( bSig0, bSig1, aSig0, aSig1 );
5199     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5200     expDiff -= 64;
5201     while ( 0 < expDiff ) {
5202         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5203         q = ( 4 < q ) ? q - 4 : 0;
5204         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5205         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5206         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5207         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5208         expDiff -= 61;
5209     }
5210     if ( -64 < expDiff ) {
5211         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5212         q = ( 4 < q ) ? q - 4 : 0;
5213         q >>= - expDiff;
5214         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5215         expDiff += 52;
5216         if ( expDiff < 0 ) {
5217             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5218         }
5219         else {
5220             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5221         }
5222         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5223         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5224     }
5225     else {
5226         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5227         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5228     }
5229     do {
5230         alternateASig0 = aSig0;
5231         alternateASig1 = aSig1;
5232         ++q;
5233         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5234     } while ( 0 <= (sbits64) aSig0 );
5235     add128(
5236         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5237     if (    ( sigMean0 < 0 )
5238          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5239         aSig0 = alternateASig0;
5240         aSig1 = alternateASig1;
5241     }
5242     zSign = ( (sbits64) aSig0 < 0 );
5243     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5244     return
5245         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5246
5247 }
5248
5249 /*----------------------------------------------------------------------------
5250 | Returns the square root of the quadruple-precision floating-point value `a'.
5251 | The operation is performed according to the IEC/IEEE Standard for Binary
5252 | Floating-Point Arithmetic.
5253 *----------------------------------------------------------------------------*/
5254
5255 float128 float128_sqrt( float128 a STATUS_PARAM )
5256 {
5257     flag aSign;
5258     int32 aExp, zExp;
5259     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5260     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5261     float128 z;
5262
5263     aSig1 = extractFloat128Frac1( a );
5264     aSig0 = extractFloat128Frac0( a );
5265     aExp = extractFloat128Exp( a );
5266     aSign = extractFloat128Sign( a );
5267     if ( aExp == 0x7FFF ) {
5268         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5269         if ( ! aSign ) return a;
5270         goto invalid;
5271     }
5272     if ( aSign ) {
5273         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5274  invalid:
5275         float_raise( float_flag_invalid STATUS_VAR);
5276         z.low = float128_default_nan_low;
5277         z.high = float128_default_nan_high;
5278         return z;
5279     }
5280     if ( aExp == 0 ) {
5281         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5282         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5283     }
5284     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5285     aSig0 |= LIT64( 0x0001000000000000 );
5286     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5287     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5288     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5289     doubleZSig0 = zSig0<<1;
5290     mul64To128( zSig0, zSig0, &term0, &term1 );
5291     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5292     while ( (sbits64) rem0 < 0 ) {
5293         --zSig0;
5294         doubleZSig0 -= 2;
5295         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5296     }
5297     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5298     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5299         if ( zSig1 == 0 ) zSig1 = 1;
5300         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5301         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5302         mul64To128( zSig1, zSig1, &term2, &term3 );
5303         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5304         while ( (sbits64) rem1 < 0 ) {
5305             --zSig1;
5306             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5307             term3 |= 1;
5308             term2 |= doubleZSig0;
5309             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5310         }
5311         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5312     }
5313     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5314     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5315
5316 }
5317
5318 /*----------------------------------------------------------------------------
5319 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5320 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5321 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5322 *----------------------------------------------------------------------------*/
5323
5324 int float128_eq( float128 a, float128 b STATUS_PARAM )
5325 {
5326
5327     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5328               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5329          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5330               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5331        ) {
5332         if (    float128_is_signaling_nan( a )
5333              || float128_is_signaling_nan( b ) ) {
5334             float_raise( float_flag_invalid STATUS_VAR);
5335         }
5336         return 0;
5337     }
5338     return
5339            ( a.low == b.low )
5340         && (    ( a.high == b.high )
5341              || (    ( a.low == 0 )
5342                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5343            );
5344
5345 }
5346
5347 /*----------------------------------------------------------------------------
5348 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5349 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5350 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5351 | Arithmetic.
5352 *----------------------------------------------------------------------------*/
5353
5354 int float128_le( float128 a, float128 b STATUS_PARAM )
5355 {
5356     flag aSign, bSign;
5357
5358     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5359               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5360          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5361               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5362        ) {
5363         float_raise( float_flag_invalid STATUS_VAR);
5364         return 0;
5365     }
5366     aSign = extractFloat128Sign( a );
5367     bSign = extractFloat128Sign( b );
5368     if ( aSign != bSign ) {
5369         return
5370                aSign
5371             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5372                  == 0 );
5373     }
5374     return
5375           aSign ? le128( b.high, b.low, a.high, a.low )
5376         : le128( a.high, a.low, b.high, b.low );
5377
5378 }
5379
5380 /*----------------------------------------------------------------------------
5381 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5382 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5383 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5384 *----------------------------------------------------------------------------*/
5385
5386 int float128_lt( float128 a, float128 b STATUS_PARAM )
5387 {
5388     flag aSign, bSign;
5389
5390     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5391               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5392          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5393               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5394        ) {
5395         float_raise( float_flag_invalid STATUS_VAR);
5396         return 0;
5397     }
5398     aSign = extractFloat128Sign( a );
5399     bSign = extractFloat128Sign( b );
5400     if ( aSign != bSign ) {
5401         return
5402                aSign
5403             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5404                  != 0 );
5405     }
5406     return
5407           aSign ? lt128( b.high, b.low, a.high, a.low )
5408         : lt128( a.high, a.low, b.high, b.low );
5409
5410 }
5411
5412 /*----------------------------------------------------------------------------
5413 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5414 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5415 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5416 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5417 *----------------------------------------------------------------------------*/
5418
5419 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5420 {
5421
5422     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5423               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5424          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5425               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5426        ) {
5427         float_raise( float_flag_invalid STATUS_VAR);
5428         return 0;
5429     }
5430     return
5431            ( a.low == b.low )
5432         && (    ( a.high == b.high )
5433              || (    ( a.low == 0 )
5434                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5435            );
5436
5437 }
5438
5439 /*----------------------------------------------------------------------------
5440 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5441 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5442 | cause an exception.  Otherwise, the comparison is performed according to the
5443 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5444 *----------------------------------------------------------------------------*/
5445
5446 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5447 {
5448     flag aSign, bSign;
5449
5450     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5451               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5452          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5453               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5454        ) {
5455         if (    float128_is_signaling_nan( a )
5456              || float128_is_signaling_nan( b ) ) {
5457             float_raise( float_flag_invalid STATUS_VAR);
5458         }
5459         return 0;
5460     }
5461     aSign = extractFloat128Sign( a );
5462     bSign = extractFloat128Sign( b );
5463     if ( aSign != bSign ) {
5464         return
5465                aSign
5466             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5467                  == 0 );
5468     }
5469     return
5470           aSign ? le128( b.high, b.low, a.high, a.low )
5471         : le128( a.high, a.low, b.high, b.low );
5472
5473 }
5474
5475 /*----------------------------------------------------------------------------
5476 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5477 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5478 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5479 | Standard for Binary Floating-Point Arithmetic.
5480 *----------------------------------------------------------------------------*/
5481
5482 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5483 {
5484     flag aSign, bSign;
5485
5486     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5487               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5488          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5489               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5490        ) {
5491         if (    float128_is_signaling_nan( a )
5492              || float128_is_signaling_nan( b ) ) {
5493             float_raise( float_flag_invalid STATUS_VAR);
5494         }
5495         return 0;
5496     }
5497     aSign = extractFloat128Sign( a );
5498     bSign = extractFloat128Sign( b );
5499     if ( aSign != bSign ) {
5500         return
5501                aSign
5502             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5503                  != 0 );
5504     }
5505     return
5506           aSign ? lt128( b.high, b.low, a.high, a.low )
5507         : lt128( a.high, a.low, b.high, b.low );
5508
5509 }
5510
5511 #endif
5512
5513 /* misc functions */
5514 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5515 {
5516     return int64_to_float32(a STATUS_VAR);
5517 }
5518
5519 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5520 {
5521     return int64_to_float64(a STATUS_VAR);
5522 }
5523
5524 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5525 {
5526     int64_t v;
5527     unsigned int res;
5528
5529     v = float32_to_int64(a STATUS_VAR);
5530     if (v < 0) {
5531         res = 0;
5532         float_raise( float_flag_invalid STATUS_VAR);
5533     } else if (v > 0xffffffff) {
5534         res = 0xffffffff;
5535         float_raise( float_flag_invalid STATUS_VAR);
5536     } else {
5537         res = v;
5538     }
5539     return res;
5540 }
5541
5542 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5543 {
5544     int64_t v;
5545     unsigned int res;
5546
5547     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5548     if (v < 0) {
5549         res = 0;
5550         float_raise( float_flag_invalid STATUS_VAR);
5551     } else if (v > 0xffffffff) {
5552         res = 0xffffffff;
5553         float_raise( float_flag_invalid STATUS_VAR);
5554     } else {
5555         res = v;
5556     }
5557     return res;
5558 }
5559
5560 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5561 {
5562     int64_t v;
5563     unsigned int res;
5564
5565     v = float64_to_int64(a STATUS_VAR);
5566     if (v < 0) {
5567         res = 0;
5568         float_raise( float_flag_invalid STATUS_VAR);
5569     } else if (v > 0xffffffff) {
5570         res = 0xffffffff;
5571         float_raise( float_flag_invalid STATUS_VAR);
5572     } else {
5573         res = v;
5574     }
5575     return res;
5576 }
5577
5578 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5579 {
5580     int64_t v;
5581     unsigned int res;
5582
5583     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5584     if (v < 0) {
5585         res = 0;
5586         float_raise( float_flag_invalid STATUS_VAR);
5587     } else if (v > 0xffffffff) {
5588         res = 0xffffffff;
5589         float_raise( float_flag_invalid STATUS_VAR);
5590     } else {
5591         res = v;
5592     }
5593     return res;
5594 }
5595
5596 /* FIXME: This looks broken.  */
5597 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5598 {
5599     int64_t v;
5600
5601     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5602     v += float64_val(a);
5603     v = float64_to_int64(make_float64(v) STATUS_VAR);
5604
5605     return v - INT64_MIN;
5606 }
5607
5608 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5609 {
5610     int64_t v;
5611
5612     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5613     v += float64_val(a);
5614     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5615
5616     return v - INT64_MIN;
5617 }
5618
5619 #define COMPARE(s, nan_exp)                                                  \
5620 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5621                                       int is_quiet STATUS_PARAM )            \
5622 {                                                                            \
5623     flag aSign, bSign;                                                       \
5624     bits ## s av, bv;                                                        \
5625                                                                              \
5626     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5627          extractFloat ## s ## Frac( a ) ) ||                                 \
5628         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5629           extractFloat ## s ## Frac( b ) )) {                                \
5630         if (!is_quiet ||                                                     \
5631             float ## s ## _is_signaling_nan( a ) ||                          \
5632             float ## s ## _is_signaling_nan( b ) ) {                         \
5633             float_raise( float_flag_invalid STATUS_VAR);                     \
5634         }                                                                    \
5635         return float_relation_unordered;                                     \
5636     }                                                                        \
5637     aSign = extractFloat ## s ## Sign( a );                                  \
5638     bSign = extractFloat ## s ## Sign( b );                                  \
5639     av = float ## s ## _val(a);                                              \
5640     bv = float ## s ## _val(b);                                              \
5641     if ( aSign != bSign ) {                                                  \
5642         if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5643             /* zero case */                                                  \
5644             return float_relation_equal;                                     \
5645         } else {                                                             \
5646             return 1 - (2 * aSign);                                          \
5647         }                                                                    \
5648     } else {                                                                 \
5649         if (av == bv) {                                                      \
5650             return float_relation_equal;                                     \
5651         } else {                                                             \
5652             return 1 - 2 * (aSign ^ ( av < bv ));                            \
5653         }                                                                    \
5654     }                                                                        \
5655 }                                                                            \
5656                                                                              \
5657 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5658 {                                                                            \
5659     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5660 }                                                                            \
5661                                                                              \
5662 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5663 {                                                                            \
5664     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5665 }
5666
5667 COMPARE(32, 0xff)
5668 COMPARE(64, 0x7ff)
5669
5670 INLINE int float128_compare_internal( float128 a, float128 b,
5671                                       int is_quiet STATUS_PARAM )
5672 {
5673     flag aSign, bSign;
5674
5675     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5676           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5677         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5678           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5679         if (!is_quiet ||
5680             float128_is_signaling_nan( a ) ||
5681             float128_is_signaling_nan( b ) ) {
5682             float_raise( float_flag_invalid STATUS_VAR);
5683         }
5684         return float_relation_unordered;
5685     }
5686     aSign = extractFloat128Sign( a );
5687     bSign = extractFloat128Sign( b );
5688     if ( aSign != bSign ) {
5689         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5690             /* zero case */
5691             return float_relation_equal;
5692         } else {
5693             return 1 - (2 * aSign);
5694         }
5695     } else {
5696         if (a.low == b.low && a.high == b.high) {
5697             return float_relation_equal;
5698         } else {
5699             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5700         }
5701     }
5702 }
5703
5704 int float128_compare( float128 a, float128 b STATUS_PARAM )
5705 {
5706     return float128_compare_internal(a, b, 0 STATUS_VAR);
5707 }
5708
5709 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5710 {
5711     return float128_compare_internal(a, b, 1 STATUS_VAR);
5712 }
5713
5714 /* Multiply A by 2 raised to the power N.  */
5715 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5716 {
5717     flag aSign;
5718     int16 aExp;
5719     bits32 aSig;
5720
5721     aSig = extractFloat32Frac( a );
5722     aExp = extractFloat32Exp( a );
5723     aSign = extractFloat32Sign( a );
5724
5725     if ( aExp == 0xFF ) {
5726         return a;
5727     }
5728     if ( aExp != 0 )
5729         aSig |= 0x00800000;
5730     else if ( aSig == 0 )
5731         return a;
5732
5733     aExp += n - 1;
5734     aSig <<= 7;
5735     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5736 }
5737
5738 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5739 {
5740     flag aSign;
5741     int16 aExp;
5742     bits64 aSig;
5743
5744     aSig = extractFloat64Frac( a );
5745     aExp = extractFloat64Exp( a );
5746     aSign = extractFloat64Sign( a );
5747
5748     if ( aExp == 0x7FF ) {
5749         return a;
5750     }
5751     if ( aExp != 0 )
5752         aSig |= LIT64( 0x0010000000000000 );
5753     else if ( aSig == 0 )
5754         return a;
5755
5756     aExp += n - 1;
5757     aSig <<= 10;
5758     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5759 }
5760
5761 #ifdef FLOATX80
5762 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5763 {
5764     flag aSign;
5765     int16 aExp;
5766     bits64 aSig;
5767
5768     aSig = extractFloatx80Frac( a );
5769     aExp = extractFloatx80Exp( a );
5770     aSign = extractFloatx80Sign( a );
5771
5772     if ( aExp == 0x7FF ) {
5773         return a;
5774     }
5775     if (aExp == 0 && aSig == 0)
5776         return a;
5777
5778     aExp += n;
5779     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5780                                           aSign, aExp, aSig, 0 STATUS_VAR );
5781 }
5782 #endif
5783
5784 #ifdef FLOAT128
5785 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5786 {
5787     flag aSign;
5788     int32 aExp;
5789     bits64 aSig0, aSig1;
5790
5791     aSig1 = extractFloat128Frac1( a );
5792     aSig0 = extractFloat128Frac0( a );
5793     aExp = extractFloat128Exp( a );
5794     aSign = extractFloat128Sign( a );
5795     if ( aExp == 0x7FFF ) {
5796         return a;
5797     }
5798     if ( aExp != 0 )
5799         aSig0 |= LIT64( 0x0001000000000000 );
5800     else if ( aSig0 == 0 && aSig1 == 0 )
5801         return a;
5802
5803     aExp += n - 1;
5804     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
5805                                           STATUS_VAR );
5806
5807 }
5808 #endif