fpu/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
   33 /* FIXME: Flush-To-Zero only affects results.  Denormal inputs should also
  34    be flushed to zero.  */
  35 #include "softfloat.h"
  36
  37 /*----------------------------------------------------------------------------
  38 | Primitive arithmetic functions, including multi-word arithmetic, and
  39 | division and square root approximations.  (Can be specialized to target if
  40 | desired.)
  41 *----------------------------------------------------------------------------*/
  42 #include "softfloat-macros.h"
  43
  44 /*----------------------------------------------------------------------------
  45 | Functions and definitions to determine:  (1) whether tininess for underflow
  46 | is detected before or after rounding by default, (2) what (if anything)
  47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  49 | are propagated from function inputs to output.  These details are target-
  50 | specific.
  51 *----------------------------------------------------------------------------*/
  52 #include "softfloat-specialize.h"
  53
  54 void set_float_rounding_mode(int val STATUS_PARAM)
  55 {
  56     STATUS(float_rounding_mode) = val;
  57 }
  58
  59 void set_float_exception_flags(int val STATUS_PARAM)
  60 {
  61     STATUS(float_exception_flags) = val;
  62 }
  63
  64 #ifdef FLOATX80
  65 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  66 {
  67     STATUS(floatx80_rounding_precision) = val;
  68 }
  69 #endif
  70
  71 /*----------------------------------------------------------------------------
  72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  73 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  74 | input.  If `zSign' is 1, the input is negated before being converted to an
  75 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  76 | is simply rounded to an integer, with the inexact exception raised if the
  77 | input cannot be represented exactly as an integer.  However, if the fixed-
  78 | point input is too large, the invalid exception is raised and the largest
  79 | positive or negative integer is returned.
  80 *----------------------------------------------------------------------------*/
  81
  82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
  83 {
  84     int8 roundingMode;
  85     flag roundNearestEven;
  86     int8 roundIncrement, roundBits;
  87     int32 z;
  88
  89     roundingMode = STATUS(float_rounding_mode);
  90     roundNearestEven = ( roundingMode == float_round_nearest_even );
  91     roundIncrement = 0x40;
  92     if ( ! roundNearestEven ) {
  93         if ( roundingMode == float_round_to_zero ) {
  94             roundIncrement = 0;
  95         }
  96         else {
  97             roundIncrement = 0x7F;
  98             if ( zSign ) {
  99                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 100             }
 101             else {
 102                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 103             }
 104         }
 105     }
 106     roundBits = absZ & 0x7F;
 107     absZ = ( absZ + roundIncrement )>>7;
 108     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 109     z = absZ;
 110     if ( zSign ) z = - z;
 111     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 112         float_raise( float_flag_invalid STATUS_VAR);
 113         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 114     }
 115     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 116     return z;
 117
 118 }
 119
 120 /*----------------------------------------------------------------------------
 121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 122 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 123 | and returns the properly rounded 64-bit integer corresponding to the input.
 124 | If `zSign' is 1, the input is negated before being converted to an integer.
 125 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 126 | the inexact exception raised if the input cannot be represented exactly as
 127 | an integer.  However, if the fixed-point input is too large, the invalid
 128 | exception is raised and the largest positive or negative integer is
 129 | returned.
 130 *----------------------------------------------------------------------------*/
 131
 132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 133 {
 134     int8 roundingMode;
 135     flag roundNearestEven, increment;
 136     int64 z;
 137
 138     roundingMode = STATUS(float_rounding_mode);
 139     roundNearestEven = ( roundingMode == float_round_nearest_even );
 140     increment = ( (sbits64) absZ1 < 0 );
 141     if ( ! roundNearestEven ) {
 142         if ( roundingMode == float_round_to_zero ) {
 143             increment = 0;
 144         }
 145         else {
 146             if ( zSign ) {
 147                 increment = ( roundingMode == float_round_down ) && absZ1;
 148             }
 149             else {
 150                 increment = ( roundingMode == float_round_up ) && absZ1;
 151             }
 152         }
 153     }
 154     if ( increment ) {
 155         ++absZ0;
 156         if ( absZ0 == 0 ) goto overflow;
 157         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 158     }
 159     z = absZ0;
 160     if ( zSign ) z = - z;
 161     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 162  overflow:
 163         float_raise( float_flag_invalid STATUS_VAR);
 164         return
 165               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 166             : LIT64( 0x7FFFFFFFFFFFFFFF );
 167     }
 168     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 169     return z;
 170
 171 }
 172
 173 /*----------------------------------------------------------------------------
 174 | Returns the fraction bits of the single-precision floating-point value `a'.
 175 *----------------------------------------------------------------------------*/
 176
 177 INLINE bits32 extractFloat32Frac( float32 a )
 178 {
 179
 180     return float32_val(a) & 0x007FFFFF;
 181
 182 }
 183
 184 /*----------------------------------------------------------------------------
 185 | Returns the exponent bits of the single-precision floating-point value `a'.
 186 *----------------------------------------------------------------------------*/
 187
 188 INLINE int16 extractFloat32Exp( float32 a )
 189 {
 190
 191     return ( float32_val(a)>>23 ) & 0xFF;
 192
 193 }
 194
 195 /*----------------------------------------------------------------------------
 196 | Returns the sign bit of the single-precision floating-point value `a'.
 197 *----------------------------------------------------------------------------*/
 198
 199 INLINE flag extractFloat32Sign( float32 a )
 200 {
 201
 202     return float32_val(a)>>31;
 203
 204 }
 205
 206 /*----------------------------------------------------------------------------
 207 | Normalizes the subnormal single-precision floating-point value represented
 208 | by the denormalized significand `aSig'.  The normalized exponent and
 209 | significand are stored at the locations pointed to by `zExpPtr' and
 210 | `zSigPtr', respectively.
 211 *----------------------------------------------------------------------------*/
 212
 213 static void
 214  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 215 {
 216     int8 shiftCount;
 217
 218     shiftCount = countLeadingZeros32( aSig ) - 8;
 219     *zSigPtr = aSig<<shiftCount;
 220     *zExpPtr = 1 - shiftCount;
 221
 222 }
 223
 224 /*----------------------------------------------------------------------------
 225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 226 | single-precision floating-point value, returning the result.  After being
 227 | shifted into the proper positions, the three fields are simply added
 228 | together to form the result.  This means that any integer portion of `zSig'
 229 | will be added into the exponent.  Since a properly normalized significand
 230 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 231 | than the desired result exponent whenever `zSig' is a complete, normalized
 232 | significand.
 233 *----------------------------------------------------------------------------*/
 234
 235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 236 {
 237
 238     return make_float32(
 239           ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
 240
 241 }
 242
 243 /*----------------------------------------------------------------------------
 244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 245 | and significand `zSig', and returns the proper single-precision floating-
 246 | point value corresponding to the abstract input.  Ordinarily, the abstract
 247 | value is simply rounded and packed into the single-precision format, with
 248 | the inexact exception raised if the abstract input cannot be represented
 249 | exactly.  However, if the abstract value is too large, the overflow and
 250 | inexact exceptions are raised and an infinity or maximal finite value is
 251 | returned.  If the abstract value is too small, the input value is rounded to
 252 | a subnormal number, and the underflow and inexact exceptions are raised if
 253 | the abstract input cannot be represented exactly as a subnormal single-
 254 | precision floating-point number.
 255 |     The input significand `zSig' has its binary point between bits 30
 256 | and 29, which is 7 bits to the left of the usual location.  This shifted
 257 | significand must be normalized or smaller.  If `zSig' is not normalized,
 258 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 259 | and it must not require rounding.  In the usual case that `zSig' is
 260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 261 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 262 | Binary Floating-Point Arithmetic.
 263 *----------------------------------------------------------------------------*/
 264
 265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 266 {
 267     int8 roundingMode;
 268     flag roundNearestEven;
 269     int8 roundIncrement, roundBits;
 270     flag isTiny;
 271
 272     roundingMode = STATUS(float_rounding_mode);
 273     roundNearestEven = ( roundingMode == float_round_nearest_even );
 274     roundIncrement = 0x40;
 275     if ( ! roundNearestEven ) {
 276         if ( roundingMode == float_round_to_zero ) {
 277             roundIncrement = 0;
 278         }
 279         else {
 280             roundIncrement = 0x7F;
 281             if ( zSign ) {
 282                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 283             }
 284             else {
 285                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 286             }
 287         }
 288     }
 289     roundBits = zSig & 0x7F;
 290     if ( 0xFD <= (bits16) zExp ) {
 291         if (    ( 0xFD < zExp )
 292              || (    ( zExp == 0xFD )
 293                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 294            ) {
 295             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 296             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 297         }
 298         if ( zExp < 0 ) {
 299             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 300             isTiny =
 301                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 302                 || ( zExp < -1 )
 303                 || ( zSig + roundIncrement < 0x80000000 );
 304             shift32RightJamming( zSig, - zExp, &zSig );
 305             zExp = 0;
 306             roundBits = zSig & 0x7F;
 307             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 308         }
 309     }
 310     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 311     zSig = ( zSig + roundIncrement )>>7;
 312     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 313     if ( zSig == 0 ) zExp = 0;
 314     return packFloat32( zSign, zExp, zSig );
 315
 316 }
 317
 318 /*----------------------------------------------------------------------------
 319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 320 | and significand `zSig', and returns the proper single-precision floating-
 321 | point value corresponding to the abstract input.  This routine is just like
 322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 324 | floating-point exponent.
 325 *----------------------------------------------------------------------------*/
 326
 327 static float32
 328  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 329 {
 330     int8 shiftCount;
 331
 332     shiftCount = countLeadingZeros32( zSig ) - 1;
 333     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 334
 335 }
 336
 337 /*----------------------------------------------------------------------------
 338 | Returns the fraction bits of the double-precision floating-point value `a'.
 339 *----------------------------------------------------------------------------*/
 340
 341 INLINE bits64 extractFloat64Frac( float64 a )
 342 {
 343
 344     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 345
 346 }
 347
 348 /*----------------------------------------------------------------------------
 349 | Returns the exponent bits of the double-precision floating-point value `a'.
 350 *----------------------------------------------------------------------------*/
 351
 352 INLINE int16 extractFloat64Exp( float64 a )
 353 {
 354
 355     return ( float64_val(a)>>52 ) & 0x7FF;
 356
 357 }
 358
 359 /*----------------------------------------------------------------------------
 360 | Returns the sign bit of the double-precision floating-point value `a'.
 361 *----------------------------------------------------------------------------*/
 362
 363 INLINE flag extractFloat64Sign( float64 a )
 364 {
 365
 366     return float64_val(a)>>63;
 367
 368 }
 369
 370 /*----------------------------------------------------------------------------
 371 | Normalizes the subnormal double-precision floating-point value represented
 372 | by the denormalized significand `aSig'.  The normalized exponent and
 373 | significand are stored at the locations pointed to by `zExpPtr' and
 374 | `zSigPtr', respectively.
 375 *----------------------------------------------------------------------------*/
 376
 377 static void
 378  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 379 {
 380     int8 shiftCount;
 381
 382     shiftCount = countLeadingZeros64( aSig ) - 11;
 383     *zSigPtr = aSig<<shiftCount;
 384     *zExpPtr = 1 - shiftCount;
 385
 386 }
 387
 388 /*----------------------------------------------------------------------------
 389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 390 | double-precision floating-point value, returning the result.  After being
 391 | shifted into the proper positions, the three fields are simply added
 392 | together to form the result.  This means that any integer portion of `zSig'
 393 | will be added into the exponent.  Since a properly normalized significand
 394 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 395 | than the desired result exponent whenever `zSig' is a complete, normalized
 396 | significand.
 397 *----------------------------------------------------------------------------*/
 398
 399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 400 {
 401
 402     return make_float64(
 403         ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
 404
 405 }
 406
 407 /*----------------------------------------------------------------------------
 408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 409 | and significand `zSig', and returns the proper double-precision floating-
 410 | point value corresponding to the abstract input.  Ordinarily, the abstract
 411 | value is simply rounded and packed into the double-precision format, with
 412 | the inexact exception raised if the abstract input cannot be represented
 413 | exactly.  However, if the abstract value is too large, the overflow and
 414 | inexact exceptions are raised and an infinity or maximal finite value is
 415 | returned.  If the abstract value is too small, the input value is rounded
 416 | to a subnormal number, and the underflow and inexact exceptions are raised
 417 | if the abstract input cannot be represented exactly as a subnormal double-
 418 | precision floating-point number.
 419 |     The input significand `zSig' has its binary point between bits 62
 420 | and 61, which is 10 bits to the left of the usual location.  This shifted
 421 | significand must be normalized or smaller.  If `zSig' is not normalized,
 422 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 423 | and it must not require rounding.  In the usual case that `zSig' is
 424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 425 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 426 | Binary Floating-Point Arithmetic.
 427 *----------------------------------------------------------------------------*/
 428
 429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 430 {
 431     int8 roundingMode;
 432     flag roundNearestEven;
 433     int16 roundIncrement, roundBits;
 434     flag isTiny;
 435
 436     roundingMode = STATUS(float_rounding_mode);
 437     roundNearestEven = ( roundingMode == float_round_nearest_even );
 438     roundIncrement = 0x200;
 439     if ( ! roundNearestEven ) {
 440         if ( roundingMode == float_round_to_zero ) {
 441             roundIncrement = 0;
 442         }
 443         else {
 444             roundIncrement = 0x3FF;
 445             if ( zSign ) {
 446                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 447             }
 448             else {
 449                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 450             }
 451         }
 452     }
 453     roundBits = zSig & 0x3FF;
 454     if ( 0x7FD <= (bits16) zExp ) {
 455         if (    ( 0x7FD < zExp )
 456              || (    ( zExp == 0x7FD )
 457                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 458            ) {
 459             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 460             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 461         }
 462         if ( zExp < 0 ) {
 463             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 464             isTiny =
 465                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 466                 || ( zExp < -1 )
 467                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 468             shift64RightJamming( zSig, - zExp, &zSig );
 469             zExp = 0;
 470             roundBits = zSig & 0x3FF;
 471             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 472         }
 473     }
 474     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 475     zSig = ( zSig + roundIncrement )>>10;
 476     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 477     if ( zSig == 0 ) zExp = 0;
 478     return packFloat64( zSign, zExp, zSig );
 479
 480 }
 481
 482 /*----------------------------------------------------------------------------
 483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 484 | and significand `zSig', and returns the proper double-precision floating-
 485 | point value corresponding to the abstract input.  This routine is just like
 486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 488 | floating-point exponent.
 489 *----------------------------------------------------------------------------*/
 490
 491 static float64
 492  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 493 {
 494     int8 shiftCount;
 495
 496     shiftCount = countLeadingZeros64( zSig ) - 1;
 497     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 498
 499 }
 500
 501 #ifdef FLOATX80
 502
 503 /*----------------------------------------------------------------------------
 504 | Returns the fraction bits of the extended double-precision floating-point
 505 | value `a'.
 506 *----------------------------------------------------------------------------*/
 507
 508 INLINE bits64 extractFloatx80Frac( floatx80 a )
 509 {
 510
 511     return a.low;
 512
 513 }
 514
 515 /*----------------------------------------------------------------------------
 516 | Returns the exponent bits of the extended double-precision floating-point
 517 | value `a'.
 518 *----------------------------------------------------------------------------*/
 519
 520 INLINE int32 extractFloatx80Exp( floatx80 a )
 521 {
 522
 523     return a.high & 0x7FFF;
 524
 525 }
 526
 527 /*----------------------------------------------------------------------------
 528 | Returns the sign bit of the extended double-precision floating-point value
 529 | `a'.
 530 *----------------------------------------------------------------------------*/
 531
 532 INLINE flag extractFloatx80Sign( floatx80 a )
 533 {
 534
 535     return a.high>>15;
 536
 537 }
 538
 539 /*----------------------------------------------------------------------------
 540 | Normalizes the subnormal extended double-precision floating-point value
 541 | represented by the denormalized significand `aSig'.  The normalized exponent
 542 | and significand are stored at the locations pointed to by `zExpPtr' and
 543 | `zSigPtr', respectively.
 544 *----------------------------------------------------------------------------*/
 545
 546 static void
 547  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 548 {
 549     int8 shiftCount;
 550
 551     shiftCount = countLeadingZeros64( aSig );
 552     *zSigPtr = aSig<<shiftCount;
 553     *zExpPtr = 1 - shiftCount;
 554
 555 }
 556
 557 /*----------------------------------------------------------------------------
 558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 559 | extended double-precision floating-point value, returning the result.
 560 *----------------------------------------------------------------------------*/
 561
 562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 563 {
 564     floatx80 z;
 565
 566     z.low = zSig;
 567     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 568     return z;
 569
 570 }
 571
 572 /*----------------------------------------------------------------------------
 573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 574 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 575 | and returns the proper extended double-precision floating-point value
 576 | corresponding to the abstract input.  Ordinarily, the abstract value is
 577 | rounded and packed into the extended double-precision format, with the
 578 | inexact exception raised if the abstract input cannot be represented
 579 | exactly.  However, if the abstract value is too large, the overflow and
 580 | inexact exceptions are raised and an infinity or maximal finite value is
 581 | returned.  If the abstract value is too small, the input value is rounded to
 582 | a subnormal number, and the underflow and inexact exceptions are raised if
 583 | the abstract input cannot be represented exactly as a subnormal extended
 584 | double-precision floating-point number.
 585 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 586 | number of bits as single or double precision, respectively.  Otherwise, the
 587 | result is rounded to the full precision of the extended double-precision
 588 | format.
 589 |     The input significand must be normalized or smaller.  If the input
 590 | significand is not normalized, `zExp' must be 0; in that case, the result
 591 | returned is a subnormal number, and it must not require rounding.  The
 592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 593 | Floating-Point Arithmetic.
 594 *----------------------------------------------------------------------------*/
 595
  596 static floatx80
  597  roundAndPackFloatx80(
  598      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
  599  STATUS_PARAM)
  600 {
          /* Rounds the significand zSig0:zSig1 according to the current
             rounding mode and `roundingPrecision', then packs sign, exponent
             and significand with packFloatx80().  There are two paths: a
             reduced-precision path for roundingPrecision 32 or 64, which
             works on `zSig0' with a mask, and the `precision80' path for
             every other value, which rounds on `zSig1'. */
  601     int8 roundingMode;
  602     flag roundNearestEven, increment, isTiny;
  603     int64 roundIncrement, roundMask, roundBits;
  604
  605     roundingMode = STATUS(float_rounding_mode);
  606     roundNearestEven = ( roundingMode == float_round_nearest_even );
  607     if ( roundingPrecision == 80 ) goto precision80;
          /* roundMask covers the bits of zSig0 that are discarded;
             roundIncrement starts as half of that range (round to nearest). */
  608     if ( roundingPrecision == 64 ) {
              /* Low 11 bits discarded, leaving the top 53 bits of zSig0. */
  609         roundIncrement = LIT64( 0x0000000000000400 );
  610         roundMask = LIT64( 0x00000000000007FF );
  611     }
  612     else if ( roundingPrecision == 32 ) {
              /* Low 40 bits discarded, leaving the top 24 bits of zSig0. */
  613         roundIncrement = LIT64( 0x0000008000000000 );
  614         roundMask = LIT64( 0x000000FFFFFFFFFF );
  615     }
  616     else {
              /* Any other precision value is treated as full precision. */
  617         goto precision80;
  618     }
          /* Fold any nonzero bits of zSig1 into bit 0 of zSig0 as a sticky bit. */
  619     zSig0 |= ( zSig1 != 0 );
  620     if ( ! roundNearestEven ) {
  621         if ( roundingMode == float_round_to_zero ) {
  622             roundIncrement = 0;
  623         }
  624         else {
                  /* Adding the full mask rounds the magnitude up whenever any
                     discarded bit is set; it is cancelled when the directed
                     mode points toward zero for this sign. */
  625             roundIncrement = roundMask;
  626             if ( zSign ) {
  627                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  628             }
  629             else {
  630                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  631             }
  632         }
  633     }
  634     roundBits = zSig0 & roundMask;
          /* The unsigned compare is true for zExp >= 0x7FFE and also for
             zExp <= 0 (where zExp - 1 wraps to a large unsigned value), so a
             single test guards both the overflow and the underflow handling. */
  635     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
              /* Overflow: exponent too large, or at the maximum with the
                 rounding addition carrying out of zSig0. */
  636         if (    ( 0x7FFE < zExp )
  637              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
  638            ) {
                  /* Jumps into the precision80 section below; roundMask still
                     holds the reduced-precision mask when the label is reached. */
  639             goto overflow;
  640         }
  641         if ( zExp <= 0 ) {
  642             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
                  /* With tininess detected after rounding, the value is tiny
                     unless zExp is 0 and the rounding addition carries out of
                     zSig0 (the last test is true when there is no carry). */
  643             isTiny =
  644                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
  645                 || ( zExp < 0 )
  646                 || ( zSig0 <= zSig0 + roundIncrement );
  647             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
  648             zExp = 0;
  649             roundBits = zSig0 & roundMask;
  650             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
  651             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
  652             zSig0 += roundIncrement;
                  /* Rounding set bit 63: the exponent field becomes 1. */
  653             if ( (sbits64) zSig0 < 0 ) zExp = 1;
                  /* roundIncrement is reused as the lowest kept bit.  On an
                     exact tie in nearest-even mode that bit is added to the
                     mask so it is cleared too, giving an even result. */
  654             roundIncrement = roundMask + 1;
  655             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
  656                 roundMask |= roundIncrement;
  657             }
  658             zSig0 &= ~ roundMask;
  659             return packFloatx80( zSign, zExp, zSig0 );
  660         }
  661     }
  662     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
  663     zSig0 += roundIncrement;
          /* A sum smaller than the addend means the addition wrapped: bump the
             exponent and set the significand to 0x8000000000000000. */
  664     if ( zSig0 < roundIncrement ) {
  665         ++zExp;
  666         zSig0 = LIT64( 0x8000000000000000 );
  667     }
          /* Same tie-to-even mask adjustment as in the subnormal branch above. */
  668     roundIncrement = roundMask + 1;
  669     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
  670         roundMask |= roundIncrement;
  671     }
  672     zSig0 &= ~ roundMask;
  673     if ( zSig0 == 0 ) zExp = 0;
  674     return packFloatx80( zSign, zExp, zSig0 );
  675  precision80:
          /* Full precision: all of zSig0 is kept and zSig1 holds the discarded
             bits.  For nearest-even, increment when the top bit of zSig1 is set. */
  676     increment = ( (sbits64) zSig1 < 0 );
  677     if ( ! roundNearestEven ) {
  678         if ( roundingMode == float_round_to_zero ) {
  679             increment = 0;
  680         }
  681         else {
                  /* Directed modes: increment only when the mode points away
                     from zero for this sign and zSig1 is nonzero. */
  682             if ( zSign ) {
  683                 increment = ( roundingMode == float_round_down ) && zSig1;
  684             }
  685             else {
  686                 increment = ( roundingMode == float_round_up ) && zSig1;
  687             }
  688         }
  689     }
          /* Same combined range test as in the reduced-precision path. */
  690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
  691         if (    ( 0x7FFE < zExp )
  692              || (    ( zExp == 0x7FFE )
  693                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
  694                   && increment
  695                 )
  696            ) {
                  /* Zero mask so that ~roundMask below is an all-ones significand. */
  697             roundMask = 0;
  698  overflow:
                  /* Also entered by goto from the reduced-precision path. */
  699             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
                  /* When the rounding mode points toward zero for this sign,
                     return exponent 0x7FFE with significand ~roundMask;
                     otherwise return exponent 0x7FFF with significand
                     0x8000000000000000. */
  700             if (    ( roundingMode == float_round_to_zero )
  701                  || ( zSign && ( roundingMode == float_round_up ) )
  702                  || ( ! zSign && ( roundingMode == float_round_down ) )
  703                ) {
  704                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
  705             }
  706             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  707         }
  708         if ( zExp <= 0 ) {
                  /* NOTE(review): unlike the reduced-precision path, this
                     branch does not test STATUS(flush_to_zero) - confirm
                     whether that is intended. */
                  /* With tininess detected after rounding, the value is tiny
                     unless zExp is 0, an increment is pending and zSig0 is
                     all ones. */
  709             isTiny =
  710                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
  711                 || ( zExp < 0 )
  712                 || ! increment
  713                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
  714             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
  715             zExp = 0;
  716             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
  717             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
                  /* Recompute the rounding decision from the shifted zSig1. */
  718             if ( roundNearestEven ) {
  719                 increment = ( (sbits64) zSig1 < 0 );
  720             }
  721             else {
  722                 if ( zSign ) {
  723                     increment = ( roundingMode == float_round_down ) && zSig1;
  724                 }
  725                 else {
  726                     increment = ( roundingMode == float_round_up ) && zSig1;
  727                 }
  728             }
  729             if ( increment ) {
  730                 ++zSig0;
                      /* zSig1 exactly one half in nearest-even mode: clear
                         bit 0 to make the result even. */
  731                 zSig0 &=
  732                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                      /* Rounding set bit 63: the exponent field becomes 1. */
  733                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
  734             }
  735             return packFloatx80( zSign, zExp, zSig0 );
  736         }
  737     }
  738     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
  739     if ( increment ) {
  740         ++zSig0;
              /* Wrapped to zero: bump the exponent and set the significand to
                 0x8000000000000000. */
  741         if ( zSig0 == 0 ) {
  742             ++zExp;
  743             zSig0 = LIT64( 0x8000000000000000 );
  744         }
  745         else {
                  /* zSig1 exactly one half in nearest-even mode: clear bit 0. */
  746             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
  747         }
  748     }
  749     else {
              /* A zero significand is packed with a zero exponent field. */
  750         if ( zSig0 == 0 ) zExp = 0;
  751     }
  752     return packFloatx80( zSign, zExp, zSig0 );
  753
  754 }
 755
 756 /*----------------------------------------------------------------------------
 757 | Takes an abstract floating-point value having sign `zSign', exponent
 758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 759 | and returns the proper extended double-precision floating-point value
 760 | corresponding to the abstract input.  This routine is just like
 761 | `roundAndPackFloatx80' except that the input significand does not have to be
 762 | normalized.
 763 *----------------------------------------------------------------------------*/
 764
 765 static floatx80
 766  normalizeRoundAndPackFloatx80(
 767      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 768  STATUS_PARAM)
 769 {
 770     int8 shiftCount;
 771
 772     if ( zSig0 == 0 ) {
 773         zSig0 = zSig1;
 774         zSig1 = 0;
 775         zExp -= 64;
 776     }
 777     shiftCount = countLeadingZeros64( zSig0 );
 778     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 779     zExp -= shiftCount;
 780     return
 781         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 782
 783 }
 784
 785 #endif
 786
 787 #ifdef FLOAT128
 788
 789 /*----------------------------------------------------------------------------
 790 | Returns the least-significant 64 fraction bits of the quadruple-precision
 791 | floating-point value `a'.
 792 *----------------------------------------------------------------------------*/
 793
 794 INLINE bits64 extractFloat128Frac1( float128 a )
 795 {
 796
 797     return a.low;
 798
 799 }
 800
 801 /*----------------------------------------------------------------------------
 802 | Returns the most-significant 48 fraction bits of the quadruple-precision
 803 | floating-point value `a'.
 804 *----------------------------------------------------------------------------*/
 805
 806 INLINE bits64 extractFloat128Frac0( float128 a )
 807 {
 808
 809     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 810
 811 }
 812
 813 /*----------------------------------------------------------------------------
 814 | Returns the exponent bits of the quadruple-precision floating-point value
 815 | `a'.
 816 *----------------------------------------------------------------------------*/
 817
 818 INLINE int32 extractFloat128Exp( float128 a )
 819 {
 820
 821     return ( a.high>>48 ) & 0x7FFF;
 822
 823 }
 824
 825 /*----------------------------------------------------------------------------
 826 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 827 *----------------------------------------------------------------------------*/
 828
 829 INLINE flag extractFloat128Sign( float128 a )
 830 {
 831
 832     return a.high>>63;
 833
 834 }
 835
 836 /*----------------------------------------------------------------------------
 837 | Normalizes the subnormal quadruple-precision floating-point value
 838 | represented by the denormalized significand formed by the concatenation of
 839 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 840 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 841 | significand are stored at the location pointed to by `zSig0Ptr', and the
 842 | least significant 64 bits of the normalized significand are stored at the
 843 | location pointed to by `zSig1Ptr'.
 844 *----------------------------------------------------------------------------*/
 845
 846 static void
 847  normalizeFloat128Subnormal(
 848      bits64 aSig0,
 849      bits64 aSig1,
 850      int32 *zExpPtr,
 851      bits64 *zSig0Ptr,
 852      bits64 *zSig1Ptr
 853  )
 854 {
 855     int8 shiftCount;
 856
 857     if ( aSig0 == 0 ) {
 858         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 859         if ( shiftCount < 0 ) {
 860             *zSig0Ptr = aSig1>>( - shiftCount );
 861             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 862         }
 863         else {
 864             *zSig0Ptr = aSig1<<shiftCount;
 865             *zSig1Ptr = 0;
 866         }
 867         *zExpPtr = - shiftCount - 63;
 868     }
 869     else {
 870         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 871         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 872         *zExpPtr = 1 - shiftCount;
 873     }
 874
 875 }
 876
 877 /*----------------------------------------------------------------------------
 878 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 880 | floating-point value, returning the result.  After being shifted into the
 881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 882 | added together to form the most significant 64 bits of the result.  This
 883 | means that any integer portion of `zSig0' will be added into the exponent.
 884 | Since a properly normalized significand will have an integer portion equal
 885 | to 1, the `zExp' input should be 1 less than the desired result exponent
 886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 887 | significand.
 888 *----------------------------------------------------------------------------*/
 889
 890 INLINE float128
 891  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 892 {
 893     float128 z;
 894
 895     z.low = zSig1;
 896     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 897     return z;
 898
 899 }
 900
 901 /*----------------------------------------------------------------------------
 902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 903 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 904 | and `zSig2', and returns the proper quadruple-precision floating-point value
 905 | corresponding to the abstract input.  Ordinarily, the abstract value is
 906 | simply rounded and packed into the quadruple-precision format, with the
 907 | inexact exception raised if the abstract input cannot be represented
 908 | exactly.  However, if the abstract value is too large, the overflow and
 909 | inexact exceptions are raised and an infinity or maximal finite value is
 910 | returned.  If the abstract value is too small, the input value is rounded to
 911 | a subnormal number, and the underflow and inexact exceptions are raised if
 912 | the abstract input cannot be represented exactly as a subnormal quadruple-
 913 | precision floating-point number.
 914 |     The input significand must be normalized or smaller.  If the input
 915 | significand is not normalized, `zExp' must be 0; in that case, the result
 916 | returned is a subnormal number, and it must not require rounding.  In the
 917 | usual case that the input significand is normalized, `zExp' must be 1 less
 918 | than the ``true'' floating-point exponent.  The handling of underflow and
 919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 920 *----------------------------------------------------------------------------*/
 921
static float128
 roundAndPackFloat128(
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
{
    int8 roundingMode;
    flag roundNearestEven, increment, isTiny;

    /* Decide whether the significand must be bumped by one unit in the last
       place.  `zSig2' holds the bits below the retained significand. */
    roundingMode = STATUS(float_rounding_mode);
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Nearest-even default: increment when the top bit of `zSig2' is set. */
    increment = ( (sbits64) zSig2 < 0 );
    if ( ! roundNearestEven ) {
        if ( roundingMode == float_round_to_zero ) {
            increment = 0;
        }
        else {
            /* Directed rounding: increment the magnitude only when rounding
               away from zero for this sign and some extra bit is set. */
            if ( zSign ) {
                increment = ( roundingMode == float_round_down ) && zSig2;
            }
            else {
                increment = ( roundingMode == float_round_up ) && zSig2;
            }
        }
    }
    /* The unsigned cast makes a negative `zExp' compare as a huge value, so
       this single test catches both the overflow and the underflow range. */
    if ( 0x7FFD <= (bits32) zExp ) {
        /* Overflow: exponent too large, or at the maximum with an all-ones
           significand that is about to be incremented. */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
            /* Rounding modes that move toward zero for this sign return the
               largest finite magnitude instead of infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            /* Otherwise the result is an infinity of the given sign. */
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            /* Subnormal range.  With flush-to-zero enabled a signed zero is
               returned immediately, without raising any flag here. */
            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
            /* Tininess: always when detected before rounding; otherwise
               unless rounding would carry the value up to the smallest
               normal number. */
            isTiny =
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            /* Denormalize: shift right by -zExp, collecting the shifted-out
               bits in `zSig2'. */
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            /* Underflow is raised only when the tiny result is also inexact. */
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
            /* Recompute the rounding decision from the shifted extra bits. */
            if ( roundNearestEven ) {
                increment = ( (sbits64) zSig2 < 0 );
            }
            else {
                if ( zSign ) {
                    increment = ( roundingMode == float_round_down ) && zSig2;
                }
                else {
                    increment = ( roundingMode == float_round_up ) && zSig2;
                }
            }
        }
    }
    /* Any nonzero extra bit means the result is inexact. */
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie under nearest-even (`zSig2' has only its top bit set):
           clear the lowest bit so the result is even. */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        /* A zero significand must be packed with a zero exponent. */
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    /* packFloat128 adds its fields, so a carry out of the fraction bumps the
       exponent. */
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
1012
1013 /*----------------------------------------------------------------------------
1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1016 | returns the proper quadruple-precision floating-point value corresponding
1017 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1018 | except that the input significand has fewer bits and does not have to be
1019 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1020 | point exponent.
1021 *----------------------------------------------------------------------------*/
1022
1023 static float128
1024  normalizeRoundAndPackFloat128(
1025      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1026 {
1027     int8 shiftCount;
1028     bits64 zSig2;
1029
1030     if ( zSig0 == 0 ) {
1031         zSig0 = zSig1;
1032         zSig1 = 0;
1033         zExp -= 64;
1034     }
1035     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036     if ( 0 <= shiftCount ) {
1037         zSig2 = 0;
1038         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1039     }
1040     else {
1041         shift128ExtraRightJamming(
1042             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1043     }
1044     zExp -= shiftCount;
1045     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1046
1047 }
1048
1049 #endif
1050
1051 /*----------------------------------------------------------------------------
1052 | Returns the result of converting the 32-bit two's complement integer `a'
1053 | to the single-precision floating-point format.  The conversion is performed
1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055 *----------------------------------------------------------------------------*/
1056
1057 float32 int32_to_float32( int32 a STATUS_PARAM )
1058 {
1059     flag zSign;
1060
1061     if ( a == 0 ) return float32_zero;
1062     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063     zSign = ( a < 0 );
1064     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1065
1066 }
1067
1068 /*----------------------------------------------------------------------------
1069 | Returns the result of converting the 32-bit two's complement integer `a'
1070 | to the double-precision floating-point format.  The conversion is performed
1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072 *----------------------------------------------------------------------------*/
1073
1074 float64 int32_to_float64( int32 a STATUS_PARAM )
1075 {
1076     flag zSign;
1077     uint32 absA;
1078     int8 shiftCount;
1079     bits64 zSig;
1080
1081     if ( a == 0 ) return float64_zero;
1082     zSign = ( a < 0 );
1083     absA = zSign ? - a : a;
1084     shiftCount = countLeadingZeros32( absA ) + 21;
1085     zSig = absA;
1086     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1087
1088 }
1089
1090 #ifdef FLOATX80
1091
1092 /*----------------------------------------------------------------------------
1093 | Returns the result of converting the 32-bit two's complement integer `a'
1094 | to the extended double-precision floating-point format.  The conversion
1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096 | Arithmetic.
1097 *----------------------------------------------------------------------------*/
1098
1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1100 {
1101     flag zSign;
1102     uint32 absA;
1103     int8 shiftCount;
1104     bits64 zSig;
1105
1106     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107     zSign = ( a < 0 );
1108     absA = zSign ? - a : a;
1109     shiftCount = countLeadingZeros32( absA ) + 32;
1110     zSig = absA;
1111     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1112
1113 }
1114
1115 #endif
1116
1117 #ifdef FLOAT128
1118
1119 /*----------------------------------------------------------------------------
1120 | Returns the result of converting the 32-bit two's complement integer `a' to
1121 | the quadruple-precision floating-point format.  The conversion is performed
1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123 *----------------------------------------------------------------------------*/
1124
1125 float128 int32_to_float128( int32 a STATUS_PARAM )
1126 {
1127     flag zSign;
1128     uint32 absA;
1129     int8 shiftCount;
1130     bits64 zSig0;
1131
1132     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133     zSign = ( a < 0 );
1134     absA = zSign ? - a : a;
1135     shiftCount = countLeadingZeros32( absA ) + 17;
1136     zSig0 = absA;
1137     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1138
1139 }
1140
1141 #endif
1142
1143 /*----------------------------------------------------------------------------
1144 | Returns the result of converting the 64-bit two's complement integer `a'
1145 | to the single-precision floating-point format.  The conversion is performed
1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147 *----------------------------------------------------------------------------*/
1148
1149 float32 int64_to_float32( int64 a STATUS_PARAM )
1150 {
1151     flag zSign;
1152     uint64 absA;
1153     int8 shiftCount;
1154
1155     if ( a == 0 ) return float32_zero;
1156     zSign = ( a < 0 );
1157     absA = zSign ? - a : a;
1158     shiftCount = countLeadingZeros64( absA ) - 40;
1159     if ( 0 <= shiftCount ) {
1160         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1161     }
1162     else {
1163         shiftCount += 7;
1164         if ( shiftCount < 0 ) {
1165             shift64RightJamming( absA, - shiftCount, &absA );
1166         }
1167         else {
1168             absA <<= shiftCount;
1169         }
1170         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1171     }
1172
1173 }
1174
1175 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1176 {
1177     int8 shiftCount;
1178
1179     if ( a == 0 ) return float32_zero;
1180     shiftCount = countLeadingZeros64( a ) - 40;
1181     if ( 0 <= shiftCount ) {
1182         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1183     }
1184     else {
1185         shiftCount += 7;
1186         if ( shiftCount < 0 ) {
1187             shift64RightJamming( a, - shiftCount, &a );
1188         }
1189         else {
1190             a <<= shiftCount;
1191         }
1192         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1193     }
1194 }
1195
1196 /*----------------------------------------------------------------------------
1197 | Returns the result of converting the 64-bit two's complement integer `a'
1198 | to the double-precision floating-point format.  The conversion is performed
1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200 *----------------------------------------------------------------------------*/
1201
1202 float64 int64_to_float64( int64 a STATUS_PARAM )
1203 {
1204     flag zSign;
1205
1206     if ( a == 0 ) return float64_zero;
1207     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208         return packFloat64( 1, 0x43E, 0 );
1209     }
1210     zSign = ( a < 0 );
1211     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1212
1213 }
1214
1215 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1216 {
1217     if ( a == 0 ) return float64_zero;
1218     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1219
1220 }
1221
1222 #ifdef FLOATX80
1223
1224 /*----------------------------------------------------------------------------
1225 | Returns the result of converting the 64-bit two's complement integer `a'
1226 | to the extended double-precision floating-point format.  The conversion
1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228 | Arithmetic.
1229 *----------------------------------------------------------------------------*/
1230
1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1232 {
1233     flag zSign;
1234     uint64 absA;
1235     int8 shiftCount;
1236
1237     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238     zSign = ( a < 0 );
1239     absA = zSign ? - a : a;
1240     shiftCount = countLeadingZeros64( absA );
1241     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1242
1243 }
1244
1245 #endif
1246
1247 #ifdef FLOAT128
1248
1249 /*----------------------------------------------------------------------------
1250 | Returns the result of converting the 64-bit two's complement integer `a' to
1251 | the quadruple-precision floating-point format.  The conversion is performed
1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253 *----------------------------------------------------------------------------*/
1254
1255 float128 int64_to_float128( int64 a STATUS_PARAM )
1256 {
1257     flag zSign;
1258     uint64 absA;
1259     int8 shiftCount;
1260     int32 zExp;
1261     bits64 zSig0, zSig1;
1262
1263     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264     zSign = ( a < 0 );
1265     absA = zSign ? - a : a;
1266     shiftCount = countLeadingZeros64( absA ) + 49;
1267     zExp = 0x406E - shiftCount;
1268     if ( 64 <= shiftCount ) {
1269         zSig1 = 0;
1270         zSig0 = absA;
1271         shiftCount -= 64;
1272     }
1273     else {
1274         zSig1 = absA;
1275         zSig0 = 0;
1276     }
1277     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278     return packFloat128( zSign, zExp, zSig0, zSig1 );
1279
1280 }
1281
1282 #endif
1283
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the single-precision floating-point value
1286 | `a' to the 32-bit two's complement integer format.  The conversion is
1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic---which means in particular that the conversion is rounded
1289 | according to the current rounding mode.  If `a' is a NaN, the largest
1290 | positive integer is returned.  Otherwise, if the conversion overflows, the
1291 | largest integer with the same sign as `a' is returned.
1292 *----------------------------------------------------------------------------*/
1293
1294 int32 float32_to_int32( float32 a STATUS_PARAM )
1295 {
1296     flag aSign;
1297     int16 aExp, shiftCount;
1298     bits32 aSig;
1299     bits64 aSig64;
1300
1301     aSig = extractFloat32Frac( a );
1302     aExp = extractFloat32Exp( a );
1303     aSign = extractFloat32Sign( a );
1304     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305     if ( aExp ) aSig |= 0x00800000;
1306     shiftCount = 0xAF - aExp;
1307     aSig64 = aSig;
1308     aSig64 <<= 32;
1309     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1311
1312 }
1313
1314 /*----------------------------------------------------------------------------
1315 | Returns the result of converting the single-precision floating-point value
1316 | `a' to the 32-bit two's complement integer format.  The conversion is
1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 | Arithmetic, except that the conversion is always rounded toward zero.
1319 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1320 | the conversion overflows, the largest integer with the same sign as `a' is
1321 | returned.
1322 *----------------------------------------------------------------------------*/
1323
1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1325 {
1326     flag aSign;
1327     int16 aExp, shiftCount;
1328     bits32 aSig;
1329     int32 z;
1330
1331     aSig = extractFloat32Frac( a );
1332     aExp = extractFloat32Exp( a );
1333     aSign = extractFloat32Sign( a );
1334     shiftCount = aExp - 0x9E;
1335     if ( 0 <= shiftCount ) {
1336         if ( float32_val(a) != 0xCF000000 ) {
1337             float_raise( float_flag_invalid STATUS_VAR);
1338             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1339         }
1340         return (sbits32) 0x80000000;
1341     }
1342     else if ( aExp <= 0x7E ) {
1343         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1344         return 0;
1345     }
1346     aSig = ( aSig | 0x00800000 )<<8;
1347     z = aSig>>( - shiftCount );
1348     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349         STATUS(float_exception_flags) |= float_flag_inexact;
1350     }
1351     if ( aSign ) z = - z;
1352     return z;
1353
1354 }
1355
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 64-bit two's complement integer format.  The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic---which means in particular that the conversion is rounded
1361 | according to the current rounding mode.  If `a' is a NaN, the largest
1362 | positive integer is returned.  Otherwise, if the conversion overflows, the
1363 | largest integer with the same sign as `a' is returned.
1364 *----------------------------------------------------------------------------*/
1365
1366 int64 float32_to_int64( float32 a STATUS_PARAM )
1367 {
1368     flag aSign;
1369     int16 aExp, shiftCount;
1370     bits32 aSig;
1371     bits64 aSig64, aSigExtra;
1372
1373     aSig = extractFloat32Frac( a );
1374     aExp = extractFloat32Exp( a );
1375     aSign = extractFloat32Sign( a );
1376     shiftCount = 0xBE - aExp;
1377     if ( shiftCount < 0 ) {
1378         float_raise( float_flag_invalid STATUS_VAR);
1379         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1380             return LIT64( 0x7FFFFFFFFFFFFFFF );
1381         }
1382         return (sbits64) LIT64( 0x8000000000000000 );
1383     }
1384     if ( aExp ) aSig |= 0x00800000;
1385     aSig64 = aSig;
1386     aSig64 <<= 40;
1387     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1388     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1389
1390 }
1391
1392 /*----------------------------------------------------------------------------
1393 | Returns the result of converting the single-precision floating-point value
1394 | `a' to the 64-bit two's complement integer format.  The conversion is
1395 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1396 | Arithmetic, except that the conversion is always rounded toward zero.  If
1397 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1398 | conversion overflows, the largest integer with the same sign as `a' is
1399 | returned.
1400 *----------------------------------------------------------------------------*/
1401
1402 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1403 {
1404     flag aSign;
1405     int16 aExp, shiftCount;
1406     bits32 aSig;
1407     bits64 aSig64;
1408     int64 z;
1409
1410     aSig = extractFloat32Frac( a );
1411     aExp = extractFloat32Exp( a );
1412     aSign = extractFloat32Sign( a );
1413     shiftCount = aExp - 0xBE;
1414     if ( 0 <= shiftCount ) {
1415         if ( float32_val(a) != 0xDF000000 ) {
1416             float_raise( float_flag_invalid STATUS_VAR);
1417             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1418                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1419             }
1420         }
1421         return (sbits64) LIT64( 0x8000000000000000 );
1422     }
1423     else if ( aExp <= 0x7E ) {
1424         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1425         return 0;
1426     }
1427     aSig64 = aSig | 0x00800000;
1428     aSig64 <<= 40;
1429     z = aSig64>>( - shiftCount );
1430     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1431         STATUS(float_exception_flags) |= float_flag_inexact;
1432     }
1433     if ( aSign ) z = - z;
1434     return z;
1435
1436 }
1437
1438 /*----------------------------------------------------------------------------
1439 | Returns the result of converting the single-precision floating-point value
1440 | `a' to the double-precision floating-point format.  The conversion is
1441 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1442 | Arithmetic.
1443 *----------------------------------------------------------------------------*/
1444
1445 float64 float32_to_float64( float32 a STATUS_PARAM )
1446 {
1447     flag aSign;
1448     int16 aExp;
1449     bits32 aSig;
1450
1451     aSig = extractFloat32Frac( a );
1452     aExp = extractFloat32Exp( a );
1453     aSign = extractFloat32Sign( a );
1454     if ( aExp == 0xFF ) {
1455         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1456         return packFloat64( aSign, 0x7FF, 0 );
1457     }
1458     if ( aExp == 0 ) {
1459         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1460         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1461         --aExp;
1462     }
1463     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1464
1465 }
1466
1467 #ifdef FLOATX80
1468
1469 /*----------------------------------------------------------------------------
1470 | Returns the result of converting the single-precision floating-point value
1471 | `a' to the extended double-precision floating-point format.  The conversion
1472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1473 | Arithmetic.
1474 *----------------------------------------------------------------------------*/
1475
1476 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1477 {
1478     flag aSign;
1479     int16 aExp;
1480     bits32 aSig;
1481
1482     aSig = extractFloat32Frac( a );
1483     aExp = extractFloat32Exp( a );
1484     aSign = extractFloat32Sign( a );
1485     if ( aExp == 0xFF ) {
1486         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1487         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1488     }
1489     if ( aExp == 0 ) {
1490         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1491         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1492     }
1493     aSig |= 0x00800000;
1494     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1495
1496 }
1497
1498 #endif
1499
1500 #ifdef FLOAT128
1501
1502 /*----------------------------------------------------------------------------
1503 | Returns the result of converting the single-precision floating-point value
 1504 | `a' to the quadruple-precision floating-point format.  The conversion is
1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1506 | Arithmetic.
1507 *----------------------------------------------------------------------------*/
1508
1509 float128 float32_to_float128( float32 a STATUS_PARAM )
1510 {
1511     flag aSign;
1512     int16 aExp;
1513     bits32 aSig;
1514
1515     aSig = extractFloat32Frac( a );
1516     aExp = extractFloat32Exp( a );
1517     aSign = extractFloat32Sign( a );
1518     if ( aExp == 0xFF ) {
1519         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1520         return packFloat128( aSign, 0x7FFF, 0, 0 );
1521     }
1522     if ( aExp == 0 ) {
1523         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1524         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525         --aExp;
1526     }
1527     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1528
1529 }
1530
1531 #endif
1532
1533 /*----------------------------------------------------------------------------
1534 | Rounds the single-precision floating-point value `a' to an integer, and
1535 | returns the result as a single-precision floating-point value.  The
1536 | operation is performed according to the IEC/IEEE Standard for Binary
1537 | Floating-Point Arithmetic.
1538 *----------------------------------------------------------------------------*/
1539
1540 float32 float32_round_to_int( float32 a STATUS_PARAM)
1541 {
1542     flag aSign;
1543     int16 aExp;
1544     bits32 lastBitMask, roundBitsMask;
1545     int8 roundingMode;
1546     bits32 z;
1547
1548     aExp = extractFloat32Exp( a );
1549     if ( 0x96 <= aExp ) {
1550         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1551             return propagateFloat32NaN( a, a STATUS_VAR );
1552         }
1553         return a;
1554     }
1555     if ( aExp <= 0x7E ) {
1556         if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1557         STATUS(float_exception_flags) |= float_flag_inexact;
1558         aSign = extractFloat32Sign( a );
1559         switch ( STATUS(float_rounding_mode) ) {
1560          case float_round_nearest_even:
1561             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1562                 return packFloat32( aSign, 0x7F, 0 );
1563             }
1564             break;
1565          case float_round_down:
1566             return make_float32(aSign ? 0xBF800000 : 0);
1567          case float_round_up:
1568             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1569         }
1570         return packFloat32( aSign, 0, 0 );
1571     }
1572     lastBitMask = 1;
1573     lastBitMask <<= 0x96 - aExp;
1574     roundBitsMask = lastBitMask - 1;
1575     z = float32_val(a);
1576     roundingMode = STATUS(float_rounding_mode);
1577     if ( roundingMode == float_round_nearest_even ) {
1578         z += lastBitMask>>1;
1579         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1580     }
1581     else if ( roundingMode != float_round_to_zero ) {
1582         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1583             z += roundBitsMask;
1584         }
1585     }
1586     z &= ~ roundBitsMask;
1587     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1588     return make_float32(z);
1589
1590 }
1591
1592 /*----------------------------------------------------------------------------
1593 | Returns the result of adding the absolute values of the single-precision
1594 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1595 | before being returned.  `zSign' is ignored if the result is a NaN.
1596 | The addition is performed according to the IEC/IEEE Standard for Binary
1597 | Floating-Point Arithmetic.
1598 *----------------------------------------------------------------------------*/
1599
1600 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1601 {
1602     int16 aExp, bExp, zExp;
1603     bits32 aSig, bSig, zSig;
1604     int16 expDiff;
1605
1606     aSig = extractFloat32Frac( a );
1607     aExp = extractFloat32Exp( a );
1608     bSig = extractFloat32Frac( b );
1609     bExp = extractFloat32Exp( b );
1610     expDiff = aExp - bExp;
1611     aSig <<= 6;
1612     bSig <<= 6;
1613     if ( 0 < expDiff ) {
1614         if ( aExp == 0xFF ) {
1615             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1616             return a;
1617         }
1618         if ( bExp == 0 ) {
1619             --expDiff;
1620         }
1621         else {
1622             bSig |= 0x20000000;
1623         }
1624         shift32RightJamming( bSig, expDiff, &bSig );
1625         zExp = aExp;
1626     }
1627     else if ( expDiff < 0 ) {
1628         if ( bExp == 0xFF ) {
1629             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1630             return packFloat32( zSign, 0xFF, 0 );
1631         }
1632         if ( aExp == 0 ) {
1633             ++expDiff;
1634         }
1635         else {
1636             aSig |= 0x20000000;
1637         }
1638         shift32RightJamming( aSig, - expDiff, &aSig );
1639         zExp = bExp;
1640     }
1641     else {
1642         if ( aExp == 0xFF ) {
1643             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1644             return a;
1645         }
1646         if ( aExp == 0 ) {
1647             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1648             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1649         }
1650         zSig = 0x40000000 + aSig + bSig;
1651         zExp = aExp;
1652         goto roundAndPack;
1653     }
1654     aSig |= 0x20000000;
1655     zSig = ( aSig + bSig )<<1;
1656     --zExp;
1657     if ( (sbits32) zSig < 0 ) {
1658         zSig = aSig + bSig;
1659         ++zExp;
1660     }
1661  roundAndPack:
1662     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1663
1664 }
1665
1666 /*----------------------------------------------------------------------------
1667 | Returns the result of subtracting the absolute values of the single-
1668 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1669 | difference is negated before being returned.  `zSign' is ignored if the
1670 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1671 | Standard for Binary Floating-Point Arithmetic.
1672 *----------------------------------------------------------------------------*/
1673
1674 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1675 {
1676     int16 aExp, bExp, zExp;
1677     bits32 aSig, bSig, zSig;
1678     int16 expDiff;
1679
1680     aSig = extractFloat32Frac( a );
1681     aExp = extractFloat32Exp( a );
1682     bSig = extractFloat32Frac( b );
1683     bExp = extractFloat32Exp( b );
1684     expDiff = aExp - bExp;
1685     aSig <<= 7;
1686     bSig <<= 7;
1687     if ( 0 < expDiff ) goto aExpBigger;
1688     if ( expDiff < 0 ) goto bExpBigger;
1689     if ( aExp == 0xFF ) {
1690         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1691         float_raise( float_flag_invalid STATUS_VAR);
1692         return float32_default_nan;
1693     }
1694     if ( aExp == 0 ) {
1695         aExp = 1;
1696         bExp = 1;
1697     }
1698     if ( bSig < aSig ) goto aBigger;
1699     if ( aSig < bSig ) goto bBigger;
1700     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1701  bExpBigger:
1702     if ( bExp == 0xFF ) {
1703         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1704         return packFloat32( zSign ^ 1, 0xFF, 0 );
1705     }
1706     if ( aExp == 0 ) {
1707         ++expDiff;
1708     }
1709     else {
1710         aSig |= 0x40000000;
1711     }
1712     shift32RightJamming( aSig, - expDiff, &aSig );
1713     bSig |= 0x40000000;
1714  bBigger:
1715     zSig = bSig - aSig;
1716     zExp = bExp;
1717     zSign ^= 1;
1718     goto normalizeRoundAndPack;
1719  aExpBigger:
1720     if ( aExp == 0xFF ) {
1721         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1722         return a;
1723     }
1724     if ( bExp == 0 ) {
1725         --expDiff;
1726     }
1727     else {
1728         bSig |= 0x40000000;
1729     }
1730     shift32RightJamming( bSig, expDiff, &bSig );
1731     aSig |= 0x40000000;
1732  aBigger:
1733     zSig = aSig - bSig;
1734     zExp = aExp;
1735  normalizeRoundAndPack:
1736     --zExp;
1737     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1738
1739 }
1740
1741 /*----------------------------------------------------------------------------
1742 | Returns the result of adding the single-precision floating-point values `a'
1743 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1744 | Binary Floating-Point Arithmetic.
1745 *----------------------------------------------------------------------------*/
1746
1747 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1748 {
1749     flag aSign, bSign;
1750
1751     aSign = extractFloat32Sign( a );
1752     bSign = extractFloat32Sign( b );
1753     if ( aSign == bSign ) {
1754         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1755     }
1756     else {
1757         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1758     }
1759
1760 }
1761
1762 /*----------------------------------------------------------------------------
1763 | Returns the result of subtracting the single-precision floating-point values
1764 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1765 | for Binary Floating-Point Arithmetic.
1766 *----------------------------------------------------------------------------*/
1767
1768 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1769 {
1770     flag aSign, bSign;
1771
1772     aSign = extractFloat32Sign( a );
1773     bSign = extractFloat32Sign( b );
1774     if ( aSign == bSign ) {
1775         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1776     }
1777     else {
1778         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1779     }
1780
1781 }
1782
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of multiplying the single-precision floating-point values
1785 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1786 | for Binary Floating-Point Arithmetic.
1787 *----------------------------------------------------------------------------*/
1788
1789 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1790 {
1791     flag aSign, bSign, zSign;
1792     int16 aExp, bExp, zExp;
1793     bits32 aSig, bSig;
1794     bits64 zSig64;
1795     bits32 zSig;
1796
1797     aSig = extractFloat32Frac( a );
1798     aExp = extractFloat32Exp( a );
1799     aSign = extractFloat32Sign( a );
1800     bSig = extractFloat32Frac( b );
1801     bExp = extractFloat32Exp( b );
1802     bSign = extractFloat32Sign( b );
1803     zSign = aSign ^ bSign;
1804     if ( aExp == 0xFF ) {
1805         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1806             return propagateFloat32NaN( a, b STATUS_VAR );
1807         }
1808         if ( ( bExp | bSig ) == 0 ) {
1809             float_raise( float_flag_invalid STATUS_VAR);
1810             return float32_default_nan;
1811         }
1812         return packFloat32( zSign, 0xFF, 0 );
1813     }
1814     if ( bExp == 0xFF ) {
1815         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816         if ( ( aExp | aSig ) == 0 ) {
1817             float_raise( float_flag_invalid STATUS_VAR);
1818             return float32_default_nan;
1819         }
1820         return packFloat32( zSign, 0xFF, 0 );
1821     }
1822     if ( aExp == 0 ) {
1823         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1824         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1825     }
1826     if ( bExp == 0 ) {
1827         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1828         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1829     }
1830     zExp = aExp + bExp - 0x7F;
1831     aSig = ( aSig | 0x00800000 )<<7;
1832     bSig = ( bSig | 0x00800000 )<<8;
1833     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1834     zSig = zSig64;
1835     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1836         zSig <<= 1;
1837         --zExp;
1838     }
1839     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1840
1841 }
1842
1843 /*----------------------------------------------------------------------------
1844 | Returns the result of dividing the single-precision floating-point value `a'
1845 | by the corresponding value `b'.  The operation is performed according to the
1846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847 *----------------------------------------------------------------------------*/
1848
1849 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1850 {
1851     flag aSign, bSign, zSign;
1852     int16 aExp, bExp, zExp;
1853     bits32 aSig, bSig, zSig;
1854
1855     aSig = extractFloat32Frac( a );
1856     aExp = extractFloat32Exp( a );
1857     aSign = extractFloat32Sign( a );
1858     bSig = extractFloat32Frac( b );
1859     bExp = extractFloat32Exp( b );
1860     bSign = extractFloat32Sign( b );
1861     zSign = aSign ^ bSign;
1862     if ( aExp == 0xFF ) {
1863         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864         if ( bExp == 0xFF ) {
1865             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1866             float_raise( float_flag_invalid STATUS_VAR);
1867             return float32_default_nan;
1868         }
1869         return packFloat32( zSign, 0xFF, 0 );
1870     }
1871     if ( bExp == 0xFF ) {
1872         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1873         return packFloat32( zSign, 0, 0 );
1874     }
1875     if ( bExp == 0 ) {
1876         if ( bSig == 0 ) {
1877             if ( ( aExp | aSig ) == 0 ) {
1878                 float_raise( float_flag_invalid STATUS_VAR);
1879                 return float32_default_nan;
1880             }
1881             float_raise( float_flag_divbyzero STATUS_VAR);
1882             return packFloat32( zSign, 0xFF, 0 );
1883         }
1884         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1885     }
1886     if ( aExp == 0 ) {
1887         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1889     }
1890     zExp = aExp - bExp + 0x7D;
1891     aSig = ( aSig | 0x00800000 )<<7;
1892     bSig = ( bSig | 0x00800000 )<<8;
1893     if ( bSig <= ( aSig + aSig ) ) {
1894         aSig >>= 1;
1895         ++zExp;
1896     }
1897     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1898     if ( ( zSig & 0x3F ) == 0 ) {
1899         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1900     }
1901     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1902
1903 }
1904
1905 /*----------------------------------------------------------------------------
1906 | Returns the remainder of the single-precision floating-point value `a'
1907 | with respect to the corresponding value `b'.  The operation is performed
1908 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909 *----------------------------------------------------------------------------*/
1910
1911 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1912 {
1913     flag aSign, zSign;
1914     int16 aExp, bExp, expDiff;
1915     bits32 aSig, bSig;
1916     bits32 q;
1917     bits64 aSig64, bSig64, q64;
1918     bits32 alternateASig;
1919     sbits32 sigMean;
1920
1921     aSig = extractFloat32Frac( a );
1922     aExp = extractFloat32Exp( a );
1923     aSign = extractFloat32Sign( a );
1924     bSig = extractFloat32Frac( b );
1925     bExp = extractFloat32Exp( b );
1926     if ( aExp == 0xFF ) {
1927         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1928             return propagateFloat32NaN( a, b STATUS_VAR );
1929         }
1930         float_raise( float_flag_invalid STATUS_VAR);
1931         return float32_default_nan;
1932     }
1933     if ( bExp == 0xFF ) {
1934         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1935         return a;
1936     }
1937     if ( bExp == 0 ) {
1938         if ( bSig == 0 ) {
1939             float_raise( float_flag_invalid STATUS_VAR);
1940             return float32_default_nan;
1941         }
1942         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1943     }
1944     if ( aExp == 0 ) {
1945         if ( aSig == 0 ) return a;
1946         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1947     }
1948     expDiff = aExp - bExp;
1949     aSig |= 0x00800000;
1950     bSig |= 0x00800000;
1951     if ( expDiff < 32 ) {
1952         aSig <<= 8;
1953         bSig <<= 8;
1954         if ( expDiff < 0 ) {
1955             if ( expDiff < -1 ) return a;
1956             aSig >>= 1;
1957         }
1958         q = ( bSig <= aSig );
1959         if ( q ) aSig -= bSig;
1960         if ( 0 < expDiff ) {
1961             q = ( ( (bits64) aSig )<<32 ) / bSig;
1962             q >>= 32 - expDiff;
1963             bSig >>= 2;
1964             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1965         }
1966         else {
1967             aSig >>= 2;
1968             bSig >>= 2;
1969         }
1970     }
1971     else {
1972         if ( bSig <= aSig ) aSig -= bSig;
1973         aSig64 = ( (bits64) aSig )<<40;
1974         bSig64 = ( (bits64) bSig )<<40;
1975         expDiff -= 64;
1976         while ( 0 < expDiff ) {
1977             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1978             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1979             aSig64 = - ( ( bSig * q64 )<<38 );
1980             expDiff -= 62;
1981         }
1982         expDiff += 64;
1983         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1984         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1985         q = q64>>( 64 - expDiff );
1986         bSig <<= 6;
1987         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1988     }
1989     do {
1990         alternateASig = aSig;
1991         ++q;
1992         aSig -= bSig;
1993     } while ( 0 <= (sbits32) aSig );
1994     sigMean = aSig + alternateASig;
1995     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1996         aSig = alternateASig;
1997     }
1998     zSign = ( (sbits32) aSig < 0 );
1999     if ( zSign ) aSig = - aSig;
2000     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2001
2002 }
2003
2004 /*----------------------------------------------------------------------------
2005 | Returns the square root of the single-precision floating-point value `a'.
2006 | The operation is performed according to the IEC/IEEE Standard for Binary
2007 | Floating-Point Arithmetic.
2008 *----------------------------------------------------------------------------*/
2009
2010 float32 float32_sqrt( float32 a STATUS_PARAM )
2011 {
2012     flag aSign;
2013     int16 aExp, zExp;
2014     bits32 aSig, zSig;
2015     bits64 rem, term;
2016
2017     aSig = extractFloat32Frac( a );
2018     aExp = extractFloat32Exp( a );
2019     aSign = extractFloat32Sign( a );
2020     if ( aExp == 0xFF ) {
2021         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2022         if ( ! aSign ) return a;
2023         float_raise( float_flag_invalid STATUS_VAR);
2024         return float32_default_nan;
2025     }
2026     if ( aSign ) {
2027         if ( ( aExp | aSig ) == 0 ) return a;
2028         float_raise( float_flag_invalid STATUS_VAR);
2029         return float32_default_nan;
2030     }
2031     if ( aExp == 0 ) {
2032         if ( aSig == 0 ) return float32_zero;
2033         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2034     }
2035     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2036     aSig = ( aSig | 0x00800000 )<<8;
2037     zSig = estimateSqrt32( aExp, aSig ) + 2;
2038     if ( ( zSig & 0x7F ) <= 5 ) {
2039         if ( zSig < 2 ) {
2040             zSig = 0x7FFFFFFF;
2041             goto roundAndPack;
2042         }
2043         aSig >>= aExp & 1;
2044         term = ( (bits64) zSig ) * zSig;
2045         rem = ( ( (bits64) aSig )<<32 ) - term;
2046         while ( (sbits64) rem < 0 ) {
2047             --zSig;
2048             rem += ( ( (bits64) zSig )<<1 ) | 1;
2049         }
2050         zSig |= ( rem != 0 );
2051     }
2052     shift32RightJamming( zSig, 1, &zSig );
2053  roundAndPack:
2054     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2055
2056 }
2057
2058 /*----------------------------------------------------------------------------
2059 | Returns the binary exponential of the single-precision floating-point value
2060 | `a'. The operation is performed according to the IEC/IEEE Standard for
2061 | Binary Floating-Point Arithmetic.
2062 |
2063 | Uses the following identities:
2064 |
2065 | 1. -------------------------------------------------------------------------
2066 |      x    x*ln(2)
2067 |     2  = e
2068 |
2069 | 2. -------------------------------------------------------------------------
2070 |                      2     3     4     5           n
2071 |      x        x     x     x     x     x           x
2072 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2073 |               1!    2!    3!    4!    5!          n!
2074 *----------------------------------------------------------------------------*/
2075
2076 static const float64 float32_exp2_coefficients[15] =
2077 {
2078     make_float64( 0x3ff0000000000000ll ), /*  1 */
2079     make_float64( 0x3fe0000000000000ll ), /*  2 */
2080     make_float64( 0x3fc5555555555555ll ), /*  3 */
2081     make_float64( 0x3fa5555555555555ll ), /*  4 */
2082     make_float64( 0x3f81111111111111ll ), /*  5 */
2083     make_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2084     make_float64( 0x3f2a01a01a01a01all ), /*  7 */
2085     make_float64( 0x3efa01a01a01a01all ), /*  8 */
2086     make_float64( 0x3ec71de3a556c734ll ), /*  9 */
2087     make_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2088     make_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2089     make_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2090     make_float64( 0x3de6124613a86d09ll ), /* 13 */
2091     make_float64( 0x3da93974a8c07c9dll ), /* 14 */
2092     make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2093 };
2094
2095 float32 float32_exp2( float32 a STATUS_PARAM )
2096 {
2097     flag aSign;
2098     int16 aExp;
2099     bits32 aSig;
2100     float64 r, x, xn;
2101     int i;
2102
2103     aSig = extractFloat32Frac( a );
2104     aExp = extractFloat32Exp( a );
2105     aSign = extractFloat32Sign( a );
2106
2107     if ( aExp == 0xFF) {
2108         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2109         return (aSign) ? float32_zero : a;
2110     }
2111     if (aExp == 0) {
2112         if (aSig == 0) return float32_one;
2113     }
2114
2115     float_raise( float_flag_inexact STATUS_VAR);
2116
2117     /* ******************************* */
2118     /* using float64 for approximation */
2119     /* ******************************* */
2120     x = float32_to_float64(a STATUS_VAR);
2121     x = float64_mul(x, float64_ln2 STATUS_VAR);
2122
2123     xn = x;
2124     r = float64_one;
2125     for (i = 0 ; i < 15 ; i++) {
2126         float64 f;
2127
2128         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2129         r = float64_add(r, f STATUS_VAR);
2130
2131         xn = float64_mul(xn, x STATUS_VAR);
2132     }
2133
2134     return float64_to_float32(r, status);
2135 }
2136
2137 /*----------------------------------------------------------------------------
2138 | Returns the binary log of the single-precision floating-point value `a'.
2139 | The operation is performed according to the IEC/IEEE Standard for Binary
2140 | Floating-Point Arithmetic.
2141 *----------------------------------------------------------------------------*/
2142 float32 float32_log2( float32 a STATUS_PARAM )
2143 {
2144     flag aSign, zSign;
2145     int16 aExp;
2146     bits32 aSig, zSig, i;
2147
2148     aSig = extractFloat32Frac( a );
2149     aExp = extractFloat32Exp( a );
2150     aSign = extractFloat32Sign( a );
2151
2152     if ( aExp == 0 ) {
2153         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2154         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2155     }
2156     if ( aSign ) {
2157         float_raise( float_flag_invalid STATUS_VAR);
2158         return float32_default_nan;
2159     }
2160     if ( aExp == 0xFF ) {
2161         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2162         return a;
2163     }
2164
2165     aExp -= 0x7F;
2166     aSig |= 0x00800000;
2167     zSign = aExp < 0;
2168     zSig = aExp << 23;
2169
2170     for (i = 1 << 22; i > 0; i >>= 1) {
2171         aSig = ( (bits64)aSig * aSig ) >> 23;
2172         if ( aSig & 0x01000000 ) {
2173             aSig >>= 1;
2174             zSig |= i;
2175         }
2176     }
2177
2178     if ( zSign )
2179         zSig = -zSig;
2180
2181     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2182 }
2183
2184 /*----------------------------------------------------------------------------
2185 | Returns 1 if the single-precision floating-point value `a' is equal to
2186 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2187 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2188 *----------------------------------------------------------------------------*/
2189
2190 int float32_eq( float32 a, float32 b STATUS_PARAM )
2191 {
2192
2193     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2194          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2195        ) {
2196         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2197             float_raise( float_flag_invalid STATUS_VAR);
2198         }
2199         return 0;
2200     }
2201     return ( float32_val(a) == float32_val(b) ) ||
2202             ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2203
2204 }
2205
2206 /*----------------------------------------------------------------------------
2207 | Returns 1 if the single-precision floating-point value `a' is less than
2208 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2209 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2210 | Arithmetic.
2211 *----------------------------------------------------------------------------*/
2212
2213 int float32_le( float32 a, float32 b STATUS_PARAM )
2214 {
2215     flag aSign, bSign;
2216     bits32 av, bv;
2217
2218     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2219          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2220        ) {
2221         float_raise( float_flag_invalid STATUS_VAR);
2222         return 0;
2223     }
2224     aSign = extractFloat32Sign( a );
2225     bSign = extractFloat32Sign( b );
2226     av = float32_val(a);
2227     bv = float32_val(b);
2228     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2229     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2230
2231 }
2232
2233 /*----------------------------------------------------------------------------
2234 | Returns 1 if the single-precision floating-point value `a' is less than
2235 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2236 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2237 *----------------------------------------------------------------------------*/
2238
2239 int float32_lt( float32 a, float32 b STATUS_PARAM )
2240 {
2241     flag aSign, bSign;
2242     bits32 av, bv;
2243
2244     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2245          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2246        ) {
2247         float_raise( float_flag_invalid STATUS_VAR);
2248         return 0;
2249     }
2250     aSign = extractFloat32Sign( a );
2251     bSign = extractFloat32Sign( b );
2252     av = float32_val(a);
2253     bv = float32_val(b);
2254     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2255     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2256
2257 }
2258
2259 /*----------------------------------------------------------------------------
2260 | Returns 1 if the single-precision floating-point value `a' is equal to
2261 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2262 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2263 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2264 *----------------------------------------------------------------------------*/
2265
2266 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2267 {
2268     bits32 av, bv;
2269
2270     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2271          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2272        ) {
2273         float_raise( float_flag_invalid STATUS_VAR);
2274         return 0;
2275     }
2276     av = float32_val(a);
2277     bv = float32_val(b);
2278     return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2279
2280 }
2281
2282 /*----------------------------------------------------------------------------
2283 | Returns 1 if the single-precision floating-point value `a' is less than or
2284 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2285 | cause an exception.  Otherwise, the comparison is performed according to the
2286 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2287 *----------------------------------------------------------------------------*/
2288
2289 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2290 {
2291     flag aSign, bSign;
2292     bits32 av, bv;
2293
2294     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2295          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2296        ) {
2297         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2298             float_raise( float_flag_invalid STATUS_VAR);
2299         }
2300         return 0;
2301     }
2302     aSign = extractFloat32Sign( a );
2303     bSign = extractFloat32Sign( b );
2304     av = float32_val(a);
2305     bv = float32_val(b);
2306     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2307     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2308
2309 }
2310
2311 /*----------------------------------------------------------------------------
2312 | Returns 1 if the single-precision floating-point value `a' is less than
2313 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2314 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2315 | Standard for Binary Floating-Point Arithmetic.
2316 *----------------------------------------------------------------------------*/
2317
2318 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2319 {
2320     flag aSign, bSign;
2321     bits32 av, bv;
2322
2323     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2324          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2325        ) {
2326         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2327             float_raise( float_flag_invalid STATUS_VAR);
2328         }
2329         return 0;
2330     }
2331     aSign = extractFloat32Sign( a );
2332     bSign = extractFloat32Sign( b );
2333     av = float32_val(a);
2334     bv = float32_val(b);
2335     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2336     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2337
2338 }
2339
2340 /*----------------------------------------------------------------------------
2341 | Returns the result of converting the double-precision floating-point value
2342 | `a' to the 32-bit two's complement integer format.  The conversion is
2343 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2344 | Arithmetic---which means in particular that the conversion is rounded
2345 | according to the current rounding mode.  If `a' is a NaN, the largest
2346 | positive integer is returned.  Otherwise, if the conversion overflows, the
2347 | largest integer with the same sign as `a' is returned.
2348 *----------------------------------------------------------------------------*/
2349
2350 int32 float64_to_int32( float64 a STATUS_PARAM )
2351 {
2352     flag aSign;
2353     int16 aExp, shiftCount;
2354     bits64 aSig;
2355
2356     aSig = extractFloat64Frac( a );
2357     aExp = extractFloat64Exp( a );
2358     aSign = extractFloat64Sign( a );
2359     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2360     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2361     shiftCount = 0x42C - aExp;
2362     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2363     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2364
2365 }
2366
2367 /*----------------------------------------------------------------------------
2368 | Returns the result of converting the double-precision floating-point value
2369 | `a' to the 32-bit two's complement integer format.  The conversion is
2370 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2371 | Arithmetic, except that the conversion is always rounded toward zero.
2372 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2373 | the conversion overflows, the largest integer with the same sign as `a' is
2374 | returned.
2375 *----------------------------------------------------------------------------*/
2376
2377 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2378 {
2379     flag aSign;
2380     int16 aExp, shiftCount;
2381     bits64 aSig, savedASig;
2382     int32 z;
2383
2384     aSig = extractFloat64Frac( a );
2385     aExp = extractFloat64Exp( a );
2386     aSign = extractFloat64Sign( a );
2387     if ( 0x41E < aExp ) {
2388         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2389         goto invalid;
2390     }
2391     else if ( aExp < 0x3FF ) {
2392         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2393         return 0;
2394     }
2395     aSig |= LIT64( 0x0010000000000000 );
2396     shiftCount = 0x433 - aExp;
2397     savedASig = aSig;
2398     aSig >>= shiftCount;
2399     z = aSig;
2400     if ( aSign ) z = - z;
2401     if ( ( z < 0 ) ^ aSign ) {
2402  invalid:
2403         float_raise( float_flag_invalid STATUS_VAR);
2404         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2405     }
2406     if ( ( aSig<<shiftCount ) != savedASig ) {
2407         STATUS(float_exception_flags) |= float_flag_inexact;
2408     }
2409     return z;
2410
2411 }
2412
2413 /*----------------------------------------------------------------------------
2414 | Returns the result of converting the double-precision floating-point value
2415 | `a' to the 64-bit two's complement integer format.  The conversion is
2416 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2417 | Arithmetic---which means in particular that the conversion is rounded
2418 | according to the current rounding mode.  If `a' is a NaN, the largest
2419 | positive integer is returned.  Otherwise, if the conversion overflows, the
2420 | largest integer with the same sign as `a' is returned.
2421 *----------------------------------------------------------------------------*/
2422
2423 int64 float64_to_int64( float64 a STATUS_PARAM )
2424 {
2425     flag aSign;
2426     int16 aExp, shiftCount;
2427     bits64 aSig, aSigExtra;
2428
2429     aSig = extractFloat64Frac( a );
2430     aExp = extractFloat64Exp( a );
2431     aSign = extractFloat64Sign( a );
2432     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2433     shiftCount = 0x433 - aExp;
2434     if ( shiftCount <= 0 ) {
2435         if ( 0x43E < aExp ) {
2436             float_raise( float_flag_invalid STATUS_VAR);
2437             if (    ! aSign
2438                  || (    ( aExp == 0x7FF )
2439                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2440                ) {
2441                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2442             }
2443             return (sbits64) LIT64( 0x8000000000000000 );
2444         }
2445         aSigExtra = 0;
2446         aSig <<= - shiftCount;
2447     }
2448     else {
2449         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2450     }
2451     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2452
2453 }
2454
2455 /*----------------------------------------------------------------------------
2456 | Returns the result of converting the double-precision floating-point value
2457 | `a' to the 64-bit two's complement integer format.  The conversion is
2458 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2459 | Arithmetic, except that the conversion is always rounded toward zero.
2460 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2461 | the conversion overflows, the largest integer with the same sign as `a' is
2462 | returned.
2463 *----------------------------------------------------------------------------*/
2464
2465 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2466 {
2467     flag aSign;
2468     int16 aExp, shiftCount;
2469     bits64 aSig;
2470     int64 z;
2471
2472     aSig = extractFloat64Frac( a );
2473     aExp = extractFloat64Exp( a );
2474     aSign = extractFloat64Sign( a );
2475     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2476     shiftCount = aExp - 0x433;
2477     if ( 0 <= shiftCount ) {
2478         if ( 0x43E <= aExp ) {
2479             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2480                 float_raise( float_flag_invalid STATUS_VAR);
2481                 if (    ! aSign
2482                      || (    ( aExp == 0x7FF )
2483                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2484                    ) {
2485                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2486                 }
2487             }
2488             return (sbits64) LIT64( 0x8000000000000000 );
2489         }
2490         z = aSig<<shiftCount;
2491     }
2492     else {
2493         if ( aExp < 0x3FE ) {
2494             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2495             return 0;
2496         }
2497         z = aSig>>( - shiftCount );
2498         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2499             STATUS(float_exception_flags) |= float_flag_inexact;
2500         }
2501     }
2502     if ( aSign ) z = - z;
2503     return z;
2504
2505 }
2506
2507 /*----------------------------------------------------------------------------
2508 | Returns the result of converting the double-precision floating-point value
2509 | `a' to the single-precision floating-point format.  The conversion is
2510 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2511 | Arithmetic.
2512 *----------------------------------------------------------------------------*/
2513
2514 float32 float64_to_float32( float64 a STATUS_PARAM )
2515 {
2516     flag aSign;
2517     int16 aExp;
2518     bits64 aSig;
2519     bits32 zSig;
2520
2521     aSig = extractFloat64Frac( a );
2522     aExp = extractFloat64Exp( a );
2523     aSign = extractFloat64Sign( a );
2524     if ( aExp == 0x7FF ) {
2525         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2526         return packFloat32( aSign, 0xFF, 0 );
2527     }
2528     shift64RightJamming( aSig, 22, &aSig );
2529     zSig = aSig;
2530     if ( aExp || zSig ) {
2531         zSig |= 0x40000000;
2532         aExp -= 0x381;
2533     }
2534     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2535
2536 }
2537
2538
2539 /*----------------------------------------------------------------------------
2540 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2541 | half-precision floating-point value, returning the result.  After being
2542 | shifted into the proper positions, the three fields are simply added
2543 | together to form the result.  This means that any integer portion of `zSig'
2544 | will be added into the exponent.  Since a properly normalized significand
2545 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2546 | than the desired result exponent whenever `zSig' is a complete, normalized
2547 | significand.
2548 *----------------------------------------------------------------------------*/
2549 static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2550 {
2551     return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2552 }
2553
2554 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2555    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2556
2557 float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2558 {
2559     flag aSign;
2560     int16 aExp;
2561     bits32 aSig;
2562
2563     aSign = a >> 15;
2564     aExp = (a >> 10) & 0x1f;
2565     aSig = a & 0x3ff;
2566
2567     if (aExp == 0x1f && ieee) {
2568         if (aSig) {
2569             /* Make sure correct exceptions are raised.  */
2570             float32ToCommonNaN(a STATUS_VAR);
2571             aSig |= 0x200;
2572         }
2573         return packFloat32(aSign, 0xff, aSig << 13);
2574     }
2575     if (aExp == 0) {
2576         int8 shiftCount;
2577
2578         if (aSig == 0) {
2579             return packFloat32(aSign, 0, 0);
2580         }
2581
2582         shiftCount = countLeadingZeros32( aSig ) - 21;
2583         aSig = aSig << shiftCount;
2584         aExp = -shiftCount;
2585     }
2586     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2587 }
2588
2589 bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
2590 {
2591     flag aSign;
2592     int16 aExp;
2593     bits32 aSig;
2594     bits32 mask;
2595     bits32 increment;
2596     int8 roundingMode;
2597
2598     aSig = extractFloat32Frac( a );
2599     aExp = extractFloat32Exp( a );
2600     aSign = extractFloat32Sign( a );
2601     if ( aExp == 0xFF ) {
2602         if (aSig) {
2603             /* Make sure correct exceptions are raised.  */
2604             float32ToCommonNaN(a STATUS_VAR);
2605             aSig |= 0x00400000;
2606         }
2607         return packFloat16(aSign, 0x1f, aSig >> 13);
2608     }
2609     if (aExp == 0 && aSign == 0) {
2610         return packFloat16(aSign, 0, 0);
2611     }
2612     /* Decimal point between bits 22 and 23.  */
2613     aSig |= 0x00800000;
2614     aExp -= 0x7f;
2615     if (aExp < -14) {
2616         mask = 0x007fffff;
2617         if (aExp < -24) {
2618             aExp = -25;
2619         } else {
2620             mask >>= 24 + aExp;
2621         }
2622     } else {
2623         mask = 0x00001fff;
2624     }
2625     if (aSig & mask) {
2626         float_raise( float_flag_underflow STATUS_VAR );
2627         roundingMode = STATUS(float_rounding_mode);
2628         switch (roundingMode) {
2629         case float_round_nearest_even:
2630             increment = (mask + 1) >> 1;
2631             if ((aSig & mask) == increment) {
2632                 increment = aSig & (increment << 1);
2633             }
2634             break;
2635         case float_round_up:
2636             increment = aSign ? 0 : mask;
2637             break;
2638         case float_round_down:
2639             increment = aSign ? mask : 0;
2640             break;
2641         default: /* round_to_zero */
2642             increment = 0;
2643             break;
2644         }
2645         aSig += increment;
2646         if (aSig >= 0x01000000) {
2647             aSig >>= 1;
2648             aExp++;
2649         }
2650     } else if (aExp < -14
2651           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2652         float_raise( float_flag_underflow STATUS_VAR);
2653     }
2654
2655     if (ieee) {
2656         if (aExp > 15) {
2657             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2658             return packFloat16(aSign, 0x1f, 0);
2659         }
2660     } else {
2661         if (aExp > 16) {
2662             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2663             return packFloat16(aSign, 0x1f, 0x3ff);
2664         }
2665     }
2666     if (aExp < -24) {
2667         return packFloat16(aSign, 0, 0);
2668     }
2669     if (aExp < -14) {
2670         aSig >>= -14 - aExp;
2671         aExp = -14;
2672     }
2673     return packFloat16(aSign, aExp + 14, aSig >> 13);
2674 }
2675
2676 #ifdef FLOATX80
2677
2678 /*----------------------------------------------------------------------------
2679 | Returns the result of converting the double-precision floating-point value
2680 | `a' to the extended double-precision floating-point format.  The conversion
2681 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2682 | Arithmetic.
2683 *----------------------------------------------------------------------------*/
2684
2685 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2686 {
2687     flag aSign;
2688     int16 aExp;
2689     bits64 aSig;
2690
2691     aSig = extractFloat64Frac( a );
2692     aExp = extractFloat64Exp( a );
2693     aSign = extractFloat64Sign( a );
2694     if ( aExp == 0x7FF ) {
2695         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2696         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2697     }
2698     if ( aExp == 0 ) {
2699         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2700         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2701     }
2702     return
2703         packFloatx80(
2704             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2705
2706 }
2707
2708 #endif
2709
2710 #ifdef FLOAT128
2711
2712 /*----------------------------------------------------------------------------
2713 | Returns the result of converting the double-precision floating-point value
2714 | `a' to the quadruple-precision floating-point format.  The conversion is
2715 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2716 | Arithmetic.
2717 *----------------------------------------------------------------------------*/
2718
2719 float128 float64_to_float128( float64 a STATUS_PARAM )
2720 {
2721     flag aSign;
2722     int16 aExp;
2723     bits64 aSig, zSig0, zSig1;
2724
2725     aSig = extractFloat64Frac( a );
2726     aExp = extractFloat64Exp( a );
2727     aSign = extractFloat64Sign( a );
2728     if ( aExp == 0x7FF ) {
2729         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2730         return packFloat128( aSign, 0x7FFF, 0, 0 );
2731     }
2732     if ( aExp == 0 ) {
2733         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2734         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2735         --aExp;
2736     }
2737     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2738     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2739
2740 }
2741
2742 #endif
2743
2744 /*----------------------------------------------------------------------------
2745 | Rounds the double-precision floating-point value `a' to an integer, and
2746 | returns the result as a double-precision floating-point value.  The
2747 | operation is performed according to the IEC/IEEE Standard for Binary
2748 | Floating-Point Arithmetic.
2749 *----------------------------------------------------------------------------*/
2750
float64 float64_round_to_int( float64 a STATUS_PARAM )
{
    flag aSign;
    int16 aExp;
    bits64 lastBitMask, roundBitsMask;
    int8 roundingMode;
    bits64 z;

    aExp = extractFloat64Exp( a );
    /* Exponent field >= 0x433: `a' is returned unchanged unless it is a NaN,
       which goes through NaN propagation. */
    if ( 0x433 <= aExp ) {
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
            return propagateFloat64NaN( a, a STATUS_VAR );
        }
        return a;
    }
    /* Exponent field below 0x3FF (the bias): the magnitude is less than 1,
       so the result is a zero or +/-1 depending on the rounding mode. */
    if ( aExp < 0x3FF ) {
        /* Shifting out the sign bit leaves nothing: `a' is +0 or -0. */
        if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
        STATUS(float_exception_flags) |= float_flag_inexact;
        aSign = extractFloat64Sign( a );
        switch ( STATUS(float_rounding_mode) ) {
         case float_round_nearest_even:
            /* Exponent 0x3FE with a nonzero fraction rounds to +/-1;
               everything else falls out of the switch to a signed zero. */
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
                return packFloat64( aSign, 0x3FF, 0 );
            }
            break;
         case float_round_down:
            /* 0xBFF0000000000000 is the encoding of -1.0. */
            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
         case float_round_up:
            /* 0x8000000000000000 is -0; 0x3FF0000000000000 is +1.0. */
            return make_float64(
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
        }
        /* Remaining cases (including round-to-zero): a zero with the sign
           of `a'. */
        return packFloat64( aSign, 0, 0 );
    }
    /* General case: work directly on the raw encoding.  `lastBitMask' is the
       lowest bit kept in the result and `roundBitsMask' covers every bit
       below it. */
    lastBitMask = 1;
    lastBitMask <<= 0x433 - aExp;
    roundBitsMask = lastBitMask - 1;
    z = float64_val(a);
    roundingMode = STATUS(float_rounding_mode);
    if ( roundingMode == float_round_nearest_even ) {
        /* Add half of the last kept bit; if the low bits are then all zero
           the input was an exact tie, so clear the last kept bit to land on
           the even neighbour. */
        z += lastBitMask>>1;
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
    }
    else if ( roundingMode != float_round_to_zero ) {
        /* Directed rounding increases the magnitude only for a negative
           value rounding down or a positive value rounding up. */
        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
            z += roundBitsMask;
        }
    }
    /* Discard the bits below the last kept bit. */
    z &= ~ roundBitsMask;
    if ( z != float64_val(a) )
        STATUS(float_exception_flags) |= float_flag_inexact;
    return make_float64(z);

}
2804
2805 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2806 {
2807     int oldmode;
2808     float64 res;
2809     oldmode = STATUS(float_rounding_mode);
2810     STATUS(float_rounding_mode) = float_round_to_zero;
2811     res = float64_round_to_int(a STATUS_VAR);
2812     STATUS(float_rounding_mode) = oldmode;
2813     return res;
2814 }
2815
2816 /*----------------------------------------------------------------------------
2817 | Returns the result of adding the absolute values of the double-precision
2818 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2819 | before being returned.  `zSign' is ignored if the result is a NaN.
2820 | The addition is performed according to the IEC/IEEE Standard for Binary
2821 | Floating-Point Arithmetic.
2822 *----------------------------------------------------------------------------*/
2823
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
{
    int16 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig;
    int16 expDiff;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    expDiff = aExp - bExp;
    /* Scale both fractions up by 9 bits before aligning them; the hidden
       bit then corresponds to 0x2000000000000000. */
    aSig <<= 9;
    bSig <<= 9;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent. */
        if ( aExp == 0x7FF ) {
            /* `a' is a NaN (nonzero fraction) or an infinity. */
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
            return a;
        }
        if ( bExp == 0 ) {
            /* `b' is subnormal or zero: no hidden bit, and one less
               position of alignment shift. */
            --expDiff;
        }
        else {
            bSig |= LIT64( 0x2000000000000000 );
        }
        /* Align `b' to `a'. */
        shift64RightJamming( bSig, expDiff, &bSig );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent: mirror image of the case above. */
        if ( bExp == 0x7FF ) {
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
            /* `b' is an infinity: the result is an infinity with the
               requested sign. */
            return packFloat64( zSign, 0x7FF, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig |= LIT64( 0x2000000000000000 );
        }
        shift64RightJamming( aSig, - expDiff, &aSig );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0x7FF ) {
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
            return a;
        }
        if ( aExp == 0 ) {
            /* Both operands are subnormal or zero.  With flush-to-zero set
               the result is a signed zero; otherwise the fractions are
               added directly and packed with a zero exponent field, so any
               carry out of the fraction moves into the exponent. */
            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
        }
        /* Both hidden bits (2 * 0x2000000000000000) are added as a single
           constant. */
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
        zExp = aExp;
        goto roundAndPack;
    }
    /* Unequal exponents: `aSig' receives a hidden bit here.  When `b' was
       the larger operand, `aSig' has already been shifted down, so this bit
       effectively stands in for the hidden bit that `bSig' was never
       given. */
    aSig |= LIT64( 0x2000000000000000 );
    /* Try the sum shifted up one place with the exponent lowered to match;
       if that sets the top bit, fall back to the unshifted sum and restore
       the exponent. */
    zSig = ( aSig + bSig )<<1;
    --zExp;
    if ( (sbits64) zSig < 0 ) {
        zSig = aSig + bSig;
        ++zExp;
    }
 roundAndPack:
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );

}
2889
2890 /*----------------------------------------------------------------------------
2891 | Returns the result of subtracting the absolute values of the double-
2892 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2893 | difference is negated before being returned.  `zSign' is ignored if the
2894 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2895 | Standard for Binary Floating-Point Arithmetic.
2896 *----------------------------------------------------------------------------*/
2897
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
{
    int16 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig;
    int16 expDiff;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    expDiff = aExp - bExp;
    /* Scale both fractions up by 10 bits; the hidden bit then corresponds
       to 0x4000000000000000. */
    aSig <<= 10;
    bSig <<= 10;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here down to the `bExpBigger' label. */
    if ( aExp == 0x7FF ) {
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
        /* Infinity minus infinity: invalid operation. */
        float_raise( float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }
    if ( aExp == 0 ) {
        /* Both operands are subnormal or zero: use exponent value 1 for
           both. */
        aExp = 1;
        bExp = 1;
    }
    /* No hidden bits are added in the equal-exponent case: they would
       cancel in the subtraction. */
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: the zero is negative only when rounding down. */
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
 bExpBigger:
    if ( bExp == 0x7FF ) {
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
        /* Finite minus infinity: an infinity with the opposite sign. */
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
    }
    if ( aExp == 0 ) {
        /* `a' is subnormal or zero: no hidden bit, and one less position
           of alignment shift. */
        ++expDiff;
    }
    else {
        aSig |= LIT64( 0x4000000000000000 );
    }
    /* Align `a' to `b'. */
    shift64RightJamming( aSig, - expDiff, &aSig );
    bSig |= LIT64( 0x4000000000000000 );
 bBigger:
    /* |b| > |a|: subtract the other way round and flip the result sign. */
    zSig = bSig - aSig;
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FF ) {
        /* `a' is a NaN (nonzero fraction) or an infinity. */
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig |= LIT64( 0x4000000000000000 );
    }
    /* Align `b' to `a'. */
    shift64RightJamming( bSig, expDiff, &bSig );
    aSig |= LIT64( 0x4000000000000000 );
 aBigger:
    zSig = aSig - bSig;
    zExp = aExp;
 normalizeRoundAndPack:
    /* The exponent is lowered by one before being handed to the
       normalizing pack routine. */
    --zExp;
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );

}
2964
2965 /*----------------------------------------------------------------------------
2966 | Returns the result of adding the double-precision floating-point values `a'
2967 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2968 | Binary Floating-Point Arithmetic.
2969 *----------------------------------------------------------------------------*/
2970
2971 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2972 {
2973     flag aSign, bSign;
2974
2975     aSign = extractFloat64Sign( a );
2976     bSign = extractFloat64Sign( b );
2977     if ( aSign == bSign ) {
2978         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2979     }
2980     else {
2981         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2982     }
2983
2984 }
2985
2986 /*----------------------------------------------------------------------------
2987 | Returns the result of subtracting the double-precision floating-point values
2988 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2989 | for Binary Floating-Point Arithmetic.
2990 *----------------------------------------------------------------------------*/
2991
2992 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2993 {
2994     flag aSign, bSign;
2995
2996     aSign = extractFloat64Sign( a );
2997     bSign = extractFloat64Sign( b );
2998     if ( aSign == bSign ) {
2999         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3000     }
3001     else {
3002         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3003     }
3004
3005 }
3006
3007 /*----------------------------------------------------------------------------
3008 | Returns the result of multiplying the double-precision floating-point values
3009 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3010 | for Binary Floating-Point Arithmetic.
3011 *----------------------------------------------------------------------------*/
3012
3013 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3014 {
3015     flag aSign, bSign, zSign;
3016     int16 aExp, bExp, zExp;
3017     bits64 aSig, bSig, zSig0, zSig1;
3018
3019     aSig = extractFloat64Frac( a );
3020     aExp = extractFloat64Exp( a );
3021     aSign = extractFloat64Sign( a );
3022     bSig = extractFloat64Frac( b );
3023     bExp = extractFloat64Exp( b );
3024     bSign = extractFloat64Sign( b );
3025     zSign = aSign ^ bSign;
3026     if ( aExp == 0x7FF ) {
3027         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3028             return propagateFloat64NaN( a, b STATUS_VAR );
3029         }
3030         if ( ( bExp | bSig ) == 0 ) {
3031             float_raise( float_flag_invalid STATUS_VAR);
3032             return float64_default_nan;
3033         }
3034         return packFloat64( zSign, 0x7FF, 0 );
3035     }
3036     if ( bExp == 0x7FF ) {
3037         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3038         if ( ( aExp | aSig ) == 0 ) {
3039             float_raise( float_flag_invalid STATUS_VAR);
3040             return float64_default_nan;
3041         }
3042         return packFloat64( zSign, 0x7FF, 0 );
3043     }
3044     if ( aExp == 0 ) {
3045         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3046         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3047     }
3048     if ( bExp == 0 ) {
3049         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3050         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3051     }
3052     zExp = aExp + bExp - 0x3FF;
3053     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3054     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3055     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3056     zSig0 |= ( zSig1 != 0 );
3057     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3058         zSig0 <<= 1;
3059         --zExp;
3060     }
3061     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3062
3063 }
3064
3065 /*----------------------------------------------------------------------------
3066 | Returns the result of dividing the double-precision floating-point value `a'
3067 | by the corresponding value `b'.  The operation is performed according to
3068 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3069 *----------------------------------------------------------------------------*/
3070
float64 float64_div( float64 a, float64 b STATUS_PARAM )
{
    flag aSign, bSign, zSign;
    int16 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig;
    bits64 rem0, rem1;
    bits64 term0, term1;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    bSign = extractFloat64Sign( b );
    /* The quotient is negative exactly when the operand signs differ. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FF ) {
        /* `a' is a NaN or an infinity. */
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
        if ( bExp == 0x7FF ) {
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
            /* Infinity divided by infinity is invalid. */
            float_raise( float_flag_invalid STATUS_VAR);
            return float64_default_nan;
        }
        /* Infinity divided by a finite value is an infinity. */
        return packFloat64( zSign, 0x7FF, 0 );
    }
    if ( bExp == 0x7FF ) {
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
        /* Finite value divided by infinity is a signed zero. */
        return packFloat64( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Divisor is zero: 0/0 is invalid; anything else raises
               divide-by-zero and yields a signed infinity. */
            if ( ( aExp | aSig ) == 0 ) {
                float_raise( float_flag_invalid STATUS_VAR);
                return float64_default_nan;
            }
            float_raise( float_flag_divbyzero STATUS_VAR);
            return packFloat64( zSign, 0x7FF, 0 );
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* Zero dividend gives a signed zero; a subnormal is normalized. */
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FD;
    /* Insert the hidden bits; `bSig' is positioned one bit higher than
       `aSig'. */
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    /* If twice `aSig' is not below `bSig', halve `aSig' and raise the
       exponent to compensate. */
    if ( bSig <= ( aSig + aSig ) ) {
        aSig >>= 1;
        ++zExp;
    }
    /* Quotient of the 128-bit value aSig:0 by bSig; presumably an estimate
       that may be slightly too large, given the correction loop below. */
    zSig = estimateDiv128To64( aSig, 0, bSig );
    /* Only when the low 9 bits of the estimate are 2 or less is the exact
       remainder computed and the quotient corrected. */
    if ( ( zSig & 0x1FF ) <= 2 ) {
        mul64To128( bSig, zSig, &term0, &term1 );
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
        /* Step the quotient down until the remainder is non-negative. */
        while ( (sbits64) rem0 < 0 ) {
            --zSig;
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
        }
        /* A nonzero low remainder word sets the sticky bit. */
        zSig |= ( rem1 != 0 );
    }
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );

}
3134
3135 /*----------------------------------------------------------------------------
3136 | Returns the remainder of the double-precision floating-point value `a'
3137 | with respect to the corresponding value `b'.  The operation is performed
3138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3139 *----------------------------------------------------------------------------*/
3140
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
{
    flag aSign, zSign;
    int16 aExp, bExp, expDiff;
    bits64 aSig, bSig;
    bits64 q, alternateASig;
    sbits64 sigMean;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        /* `a' is a NaN or an infinity. */
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN( a, b STATUS_VAR );
        }
        /* The remainder of an infinity is invalid. */
        float_raise( float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }
    if ( bExp == 0x7FF ) {
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
        /* Finite `a' with an infinite `b': the remainder is `a' itself. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Remainder with respect to zero is invalid. */
            float_raise( float_flag_invalid STATUS_VAR);
            return float64_default_nan;
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* A zero dividend is returned unchanged. */
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Insert the hidden bits and left-justify both significands. */
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( expDiff < 0 ) {
        /* `a' is more than one binade below `b': it is already the
           remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    /* First quotient bit. */
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce the remaining exponent difference 62 bits per iteration. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        /* Lower the estimate by 2 (clamped at 0); presumably this keeps the
           partial remainder non-negative -- TODO confirm against the
           helper's error bound. */
        q = ( 2 < q ) ? q - 2 : 0;
        /* Unsigned arithmetic modulo 2^64. */
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step for the remaining `expDiff' quotient bits. */
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        /* Nothing left to divide; bring both values to the same scale as
           the branch above. */
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Subtract `bSig' until the remainder turns negative, remembering the
       last non-negative value. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (sbits64) aSig );
    /* Choose between the last non-negative remainder and the first negative
       one: keep the one smaller in magnitude, and on an exact tie keep the
       non-negative one when `q' is odd. */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* Convert the signed remainder to sign and magnitude; a negative
       remainder flips the sign relative to `a'. */
    zSign = ( (sbits64) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );

}
3218
/*----------------------------------------------------------------------------
| Returns the square root of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Special cases, as coded below:
|   - a NaN operand is passed to propagateFloat64NaN();
|   - positive infinity and zeros of either sign are returned (a negative
|     zero is returned as `a' itself, a positive zero as float64_zero);
|   - negative infinity and any other negative value raise the invalid
|     exception and yield the default NaN.
*----------------------------------------------------------------------------*/

float64 float64_sqrt( float64 a STATUS_PARAM )
{
    flag aSign;
    int16 aExp, zExp;
    bits64 aSig, zSig, doubleZSig;
    bits64 rem0, rem1, term0, term1;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    /* NaN or infinity. */
    if ( aExp == 0x7FF ) {
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
        if ( ! aSign ) return a;
        float_raise( float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }
    /* Negative operand: only a negative zero is acceptable. */
    if ( aSign ) {
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise( float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }
    /* Positive zero or subnormal. */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return float64_zero;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Halve the unbiased exponent and re-bias it for the result. */
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
    /* Set the implicit leading one (bit 52). */
    aSig |= LIT64( 0x0010000000000000 );
    /* Initial estimate from the top 32 significand bits; estimateSqrt32()
       is defined elsewhere in this file. */
    zSig = estimateSqrt32( aExp, aSig>>21 );
    /* Shift by 9, or by 8 when the biased exponent is odd. */
    aSig <<= 9 - ( aExp & 1 );
    /* Refine the estimate to 64 bits using a division by the estimate. */
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
    /* Only when the low nine bits of the estimate are at most 5 is the exact
       remainder computed -- presumably the estimate's error bound makes the
       rounding decision safe otherwise; confirm against the helpers. */
    if ( ( zSig & 0x1FF ) <= 5 ) {
        doubleZSig = zSig<<1;
        /* rem = aSig * 2^64 - zSig^2 as a 128-bit value. */
        mul64To128( zSig, zSig, &term0, &term1 );
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
        /* While the remainder is negative the estimate is too large: step it
           down by one and add back ( 2 * zSig + 1 ) for the new zSig. */
        while ( (sbits64) rem0 < 0 ) {
            --zSig;
            doubleZSig -= 2;
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
        }
        /* Fold any non-zero remainder into the lowest bit as a sticky bit. */
        zSig |= ( ( rem0 | rem1 ) != 0 );
    }
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );

}
3269
3270 /*----------------------------------------------------------------------------
3271 | Returns the binary log of the double-precision floating-point value `a'.
3272 | The operation is performed according to the IEC/IEEE Standard for Binary
3273 | Floating-Point Arithmetic.
3274 *----------------------------------------------------------------------------*/
3275 float64 float64_log2( float64 a STATUS_PARAM )
3276 {
3277     flag aSign, zSign;
3278     int16 aExp;
3279     bits64 aSig, aSig0, aSig1, zSig, i;
3280
3281     aSig = extractFloat64Frac( a );
3282     aExp = extractFloat64Exp( a );
3283     aSign = extractFloat64Sign( a );
3284
3285     if ( aExp == 0 ) {
3286         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3287         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3288     }
3289     if ( aSign ) {
3290         float_raise( float_flag_invalid STATUS_VAR);
3291         return float64_default_nan;
3292     }
3293     if ( aExp == 0x7FF ) {
3294         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3295         return a;
3296     }
3297
3298     aExp -= 0x3FF;
3299     aSig |= LIT64( 0x0010000000000000 );
3300     zSign = aExp < 0;
3301     zSig = (bits64)aExp << 52;
3302     for (i = 1LL << 51; i > 0; i >>= 1) {
3303         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3304         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3305         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3306             aSig >>= 1;
3307             zSig |= i;
3308         }
3309     }
3310
3311     if ( zSign )
3312         zSig = -zSig;
3313     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3314 }
3315
3316 /*----------------------------------------------------------------------------
3317 | Returns 1 if the double-precision floating-point value `a' is equal to the
3318 | corresponding value `b', and 0 otherwise.  The comparison is performed
3319 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3320 *----------------------------------------------------------------------------*/
3321
3322 int float64_eq( float64 a, float64 b STATUS_PARAM )
3323 {
3324     bits64 av, bv;
3325
3326     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3327          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3328        ) {
3329         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3330             float_raise( float_flag_invalid STATUS_VAR);
3331         }
3332         return 0;
3333     }
3334     av = float64_val(a);
3335     bv = float64_val(b);
3336     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3337
3338 }
3339
3340 /*----------------------------------------------------------------------------
3341 | Returns 1 if the double-precision floating-point value `a' is less than or
3342 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3343 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3344 | Arithmetic.
3345 *----------------------------------------------------------------------------*/
3346
3347 int float64_le( float64 a, float64 b STATUS_PARAM )
3348 {
3349     flag aSign, bSign;
3350     bits64 av, bv;
3351
3352     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3353          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3354        ) {
3355         float_raise( float_flag_invalid STATUS_VAR);
3356         return 0;
3357     }
3358     aSign = extractFloat64Sign( a );
3359     bSign = extractFloat64Sign( b );
3360     av = float64_val(a);
3361     bv = float64_val(b);
3362     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3363     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3364
3365 }
3366
3367 /*----------------------------------------------------------------------------
3368 | Returns 1 if the double-precision floating-point value `a' is less than
3369 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3370 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3371 *----------------------------------------------------------------------------*/
3372
3373 int float64_lt( float64 a, float64 b STATUS_PARAM )
3374 {
3375     flag aSign, bSign;
3376     bits64 av, bv;
3377
3378     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3379          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3380        ) {
3381         float_raise( float_flag_invalid STATUS_VAR);
3382         return 0;
3383     }
3384     aSign = extractFloat64Sign( a );
3385     bSign = extractFloat64Sign( b );
3386     av = float64_val(a);
3387     bv = float64_val(b);
3388     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3389     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3390
3391 }
3392
3393 /*----------------------------------------------------------------------------
3394 | Returns 1 if the double-precision floating-point value `a' is equal to the
3395 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3396 | if either operand is a NaN.  Otherwise, the comparison is performed
3397 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3398 *----------------------------------------------------------------------------*/
3399
3400 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3401 {
3402     bits64 av, bv;
3403
3404     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3405          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3406        ) {
3407         float_raise( float_flag_invalid STATUS_VAR);
3408         return 0;
3409     }
3410     av = float64_val(a);
3411     bv = float64_val(b);
3412     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3413
3414 }
3415
3416 /*----------------------------------------------------------------------------
3417 | Returns 1 if the double-precision floating-point value `a' is less than or
3418 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3419 | cause an exception.  Otherwise, the comparison is performed according to the
3420 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3421 *----------------------------------------------------------------------------*/
3422
3423 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3424 {
3425     flag aSign, bSign;
3426     bits64 av, bv;
3427
3428     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3429          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3430        ) {
3431         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3432             float_raise( float_flag_invalid STATUS_VAR);
3433         }
3434         return 0;
3435     }
3436     aSign = extractFloat64Sign( a );
3437     bSign = extractFloat64Sign( b );
3438     av = float64_val(a);
3439     bv = float64_val(b);
3440     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3441     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3442
3443 }
3444
3445 /*----------------------------------------------------------------------------
3446 | Returns 1 if the double-precision floating-point value `a' is less than
3447 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3448 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3449 | Standard for Binary Floating-Point Arithmetic.
3450 *----------------------------------------------------------------------------*/
3451
3452 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3453 {
3454     flag aSign, bSign;
3455     bits64 av, bv;
3456
3457     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3458          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3459        ) {
3460         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3461             float_raise( float_flag_invalid STATUS_VAR);
3462         }
3463         return 0;
3464     }
3465     aSign = extractFloat64Sign( a );
3466     bSign = extractFloat64Sign( b );
3467     av = float64_val(a);
3468     bv = float64_val(b);
3469     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3470     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3471
3472 }
3473
3474 #ifdef FLOATX80
3475
3476 /*----------------------------------------------------------------------------
3477 | Returns the result of converting the extended double-precision floating-
3478 | point value `a' to the 32-bit two's complement integer format.  The
3479 | conversion is performed according to the IEC/IEEE Standard for Binary
3480 | Floating-Point Arithmetic---which means in particular that the conversion
3481 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3482 | largest positive integer is returned.  Otherwise, if the conversion
3483 | overflows, the largest integer with the same sign as `a' is returned.
3484 *----------------------------------------------------------------------------*/
3485
3486 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3487 {
3488     flag aSign;
3489     int32 aExp, shiftCount;
3490     bits64 aSig;
3491
3492     aSig = extractFloatx80Frac( a );
3493     aExp = extractFloatx80Exp( a );
3494     aSign = extractFloatx80Sign( a );
3495     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3496     shiftCount = 0x4037 - aExp;
3497     if ( shiftCount <= 0 ) shiftCount = 1;
3498     shift64RightJamming( aSig, shiftCount, &aSig );
3499     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3500
3501 }
3502
3503 /*----------------------------------------------------------------------------
3504 | Returns the result of converting the extended double-precision floating-
3505 | point value `a' to the 32-bit two's complement integer format.  The
3506 | conversion is performed according to the IEC/IEEE Standard for Binary
3507 | Floating-Point Arithmetic, except that the conversion is always rounded
3508 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3509 | Otherwise, if the conversion overflows, the largest integer with the same
3510 | sign as `a' is returned.
3511 *----------------------------------------------------------------------------*/
3512
3513 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3514 {
3515     flag aSign;
3516     int32 aExp, shiftCount;
3517     bits64 aSig, savedASig;
3518     int32 z;
3519
3520     aSig = extractFloatx80Frac( a );
3521     aExp = extractFloatx80Exp( a );
3522     aSign = extractFloatx80Sign( a );
3523     if ( 0x401E < aExp ) {
3524         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3525         goto invalid;
3526     }
3527     else if ( aExp < 0x3FFF ) {
3528         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3529         return 0;
3530     }
3531     shiftCount = 0x403E - aExp;
3532     savedASig = aSig;
3533     aSig >>= shiftCount;
3534     z = aSig;
3535     if ( aSign ) z = - z;
3536     if ( ( z < 0 ) ^ aSign ) {
3537  invalid:
3538         float_raise( float_flag_invalid STATUS_VAR);
3539         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3540     }
3541     if ( ( aSig<<shiftCount ) != savedASig ) {
3542         STATUS(float_exception_flags) |= float_flag_inexact;
3543     }
3544     return z;
3545
3546 }
3547
3548 /*----------------------------------------------------------------------------
3549 | Returns the result of converting the extended double-precision floating-
3550 | point value `a' to the 64-bit two's complement integer format.  The
3551 | conversion is performed according to the IEC/IEEE Standard for Binary
3552 | Floating-Point Arithmetic---which means in particular that the conversion
3553 | is rounded according to the current rounding mode.  If `a' is a NaN,
3554 | the largest positive integer is returned.  Otherwise, if the conversion
3555 | overflows, the largest integer with the same sign as `a' is returned.
3556 *----------------------------------------------------------------------------*/
3557
3558 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3559 {
3560     flag aSign;
3561     int32 aExp, shiftCount;
3562     bits64 aSig, aSigExtra;
3563
3564     aSig = extractFloatx80Frac( a );
3565     aExp = extractFloatx80Exp( a );
3566     aSign = extractFloatx80Sign( a );
3567     shiftCount = 0x403E - aExp;
3568     if ( shiftCount <= 0 ) {
3569         if ( shiftCount ) {
3570             float_raise( float_flag_invalid STATUS_VAR);
3571             if (    ! aSign
3572                  || (    ( aExp == 0x7FFF )
3573                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3574                ) {
3575                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3576             }
3577             return (sbits64) LIT64( 0x8000000000000000 );
3578         }
3579         aSigExtra = 0;
3580     }
3581     else {
3582         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3583     }
3584     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3585
3586 }
3587
3588 /*----------------------------------------------------------------------------
3589 | Returns the result of converting the extended double-precision floating-
3590 | point value `a' to the 64-bit two's complement integer format.  The
3591 | conversion is performed according to the IEC/IEEE Standard for Binary
3592 | Floating-Point Arithmetic, except that the conversion is always rounded
3593 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3594 | Otherwise, if the conversion overflows, the largest integer with the same
3595 | sign as `a' is returned.
3596 *----------------------------------------------------------------------------*/
3597
3598 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3599 {
3600     flag aSign;
3601     int32 aExp, shiftCount;
3602     bits64 aSig;
3603     int64 z;
3604
3605     aSig = extractFloatx80Frac( a );
3606     aExp = extractFloatx80Exp( a );
3607     aSign = extractFloatx80Sign( a );
3608     shiftCount = aExp - 0x403E;
3609     if ( 0 <= shiftCount ) {
3610         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3611         if ( ( a.high != 0xC03E ) || aSig ) {
3612             float_raise( float_flag_invalid STATUS_VAR);
3613             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3614                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3615             }
3616         }
3617         return (sbits64) LIT64( 0x8000000000000000 );
3618     }
3619     else if ( aExp < 0x3FFF ) {
3620         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3621         return 0;
3622     }
3623     z = aSig>>( - shiftCount );
3624     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3625         STATUS(float_exception_flags) |= float_flag_inexact;
3626     }
3627     if ( aSign ) z = - z;
3628     return z;
3629
3630 }
3631
3632 /*----------------------------------------------------------------------------
3633 | Returns the result of converting the extended double-precision floating-
3634 | point value `a' to the single-precision floating-point format.  The
3635 | conversion is performed according to the IEC/IEEE Standard for Binary
3636 | Floating-Point Arithmetic.
3637 *----------------------------------------------------------------------------*/
3638
3639 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3640 {
3641     flag aSign;
3642     int32 aExp;
3643     bits64 aSig;
3644
3645     aSig = extractFloatx80Frac( a );
3646     aExp = extractFloatx80Exp( a );
3647     aSign = extractFloatx80Sign( a );
3648     if ( aExp == 0x7FFF ) {
3649         if ( (bits64) ( aSig<<1 ) ) {
3650             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3651         }
3652         return packFloat32( aSign, 0xFF, 0 );
3653     }
3654     shift64RightJamming( aSig, 33, &aSig );
3655     if ( aExp || aSig ) aExp -= 0x3F81;
3656     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3657
3658 }
3659
3660 /*----------------------------------------------------------------------------
3661 | Returns the result of converting the extended double-precision floating-
3662 | point value `a' to the double-precision floating-point format.  The
3663 | conversion is performed according to the IEC/IEEE Standard for Binary
3664 | Floating-Point Arithmetic.
3665 *----------------------------------------------------------------------------*/
3666
3667 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3668 {
3669     flag aSign;
3670     int32 aExp;
3671     bits64 aSig, zSig;
3672
3673     aSig = extractFloatx80Frac( a );
3674     aExp = extractFloatx80Exp( a );
3675     aSign = extractFloatx80Sign( a );
3676     if ( aExp == 0x7FFF ) {
3677         if ( (bits64) ( aSig<<1 ) ) {
3678             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3679         }
3680         return packFloat64( aSign, 0x7FF, 0 );
3681     }
3682     shift64RightJamming( aSig, 1, &zSig );
3683     if ( aExp || aSig ) aExp -= 0x3C01;
3684     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3685
3686 }
3687
3688 #ifdef FLOAT128
3689
3690 /*----------------------------------------------------------------------------
3691 | Returns the result of converting the extended double-precision floating-
3692 | point value `a' to the quadruple-precision floating-point format.  The
3693 | conversion is performed according to the IEC/IEEE Standard for Binary
3694 | Floating-Point Arithmetic.
3695 *----------------------------------------------------------------------------*/
3696
3697 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3698 {
3699     flag aSign;
3700     int16 aExp;
3701     bits64 aSig, zSig0, zSig1;
3702
3703     aSig = extractFloatx80Frac( a );
3704     aExp = extractFloatx80Exp( a );
3705     aSign = extractFloatx80Sign( a );
3706     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3707         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3708     }
3709     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3710     return packFloat128( aSign, aExp, zSig0, zSig1 );
3711
3712 }
3713
3714 #endif
3715
/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
| value.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| The rounding direction is taken from the current rounding mode.  The
| inexact exception flag is set whenever the returned value differs from
| `a'.  A NaN operand is passed to propagateFloatx80NaN().
*----------------------------------------------------------------------------*/

floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
{
    flag aSign;
    int32 aExp;
    bits64 lastBitMask, roundBitsMask;
    int8 roundingMode;
    floatx80 z;

    aExp = extractFloatx80Exp( a );
    /* Exponent so large that the significand holds no fraction bits (the
       units bit computed further down would be bit 0 or beyond): the value
       is already an integer, an infinity, or a NaN. */
    if ( 0x403E <= aExp ) {
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN( a, a STATUS_VAR );
        }
        return a;
    }
    /* Magnitude below one: the result is a zero or +/-1. */
    if ( aExp < 0x3FFF ) {
        /* Exponent zero with no bits set below the top significand bit is
           returned unchanged and raises nothing. */
        if (    ( aExp == 0 )
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
            return a;
        }
        STATUS(float_exception_flags) |= float_flag_inexact;
        aSign = extractFloatx80Sign( a );
        switch ( STATUS(float_rounding_mode) ) {
         case float_round_nearest_even:
            /* Strictly more than one half in magnitude rounds to +/-1;
               exactly one half falls through to the zero result. */
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
            }
            break;
         case float_round_down:
            /* Negative values go to -1, positive values to +0. */
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
                : packFloatx80( 0, 0, 0 );
         case float_round_up:
            /* Negative values go to -0, positive values to +1. */
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
        }
        /* Remaining cases (including any rounding mode not listed above)
           give a zero with the sign of `a'. */
        return packFloatx80( aSign, 0, 0 );
    }
    /* General case: `lastBitMask' selects the units bit within the
       significand and `roundBitsMask' covers all fraction bits below it. */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    roundingMode = STATUS(float_rounding_mode);
    if ( roundingMode == float_round_nearest_even ) {
        /* Add one half; if the fraction bits are then all zero the operand
           was exactly halfway, so clear the units bit to land on even. */
        z.low += lastBitMask>>1;
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
    }
    else if ( roundingMode != float_round_to_zero ) {
        /* Bump the magnitude when the sign bit and "mode is round-up"
           differ, i.e. a positive value rounding up or, for any other mode
           reaching this branch, a negative value. */
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
            z.low += roundBitsMask;
        }
    }
    /* Discard the fraction bits. */
    z.low &= ~ roundBitsMask;
    /* A zero significand here means the addition carried out of bit 63:
       move up one exponent and restore the top significand bit. */
    if ( z.low == 0 ) {
        ++z.high;
        z.low = LIT64( 0x8000000000000000 );
    }
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
    return z;

}
3788
3789 /*----------------------------------------------------------------------------
3790 | Returns the result of adding the absolute values of the extended double-
3791 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3792 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3793 | The addition is performed according to the IEC/IEEE Standard for Binary
3794 | Floating-Point Arithmetic.
3795 *----------------------------------------------------------------------------*/
3796
3797 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3798 {
3799     int32 aExp, bExp, zExp;
3800     bits64 aSig, bSig, zSig0, zSig1;
3801     int32 expDiff;
3802
3803     aSig = extractFloatx80Frac( a );
3804     aExp = extractFloatx80Exp( a );
3805     bSig = extractFloatx80Frac( b );
3806     bExp = extractFloatx80Exp( b );
3807     expDiff = aExp - bExp;
3808     if ( 0 < expDiff ) {
3809         if ( aExp == 0x7FFF ) {
3810             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3811             return a;
3812         }
3813         if ( bExp == 0 ) --expDiff;
3814         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3815         zExp = aExp;
3816     }
3817     else if ( expDiff < 0 ) {
3818         if ( bExp == 0x7FFF ) {
3819             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3820             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3821         }
3822         if ( aExp == 0 ) ++expDiff;
3823         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3824         zExp = bExp;
3825     }
3826     else {
3827         if ( aExp == 0x7FFF ) {
3828             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3829                 return propagateFloatx80NaN( a, b STATUS_VAR );
3830             }
3831             return a;
3832         }
3833         zSig1 = 0;
3834         zSig0 = aSig + bSig;
3835         if ( aExp == 0 ) {
3836             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3837             goto roundAndPack;
3838         }
3839         zExp = aExp;
3840         goto shiftRight1;
3841     }
3842     zSig0 = aSig + bSig;
3843     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3844  shiftRight1:
3845     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3846     zSig0 |= LIT64( 0x8000000000000000 );
3847     ++zExp;
3848  roundAndPack:
3849     return
3850         roundAndPackFloatx80(
3851             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3852
3853 }
3854
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| Control flow: the operand with the larger magnitude is identified (first
| by exponent, then by significand), the other one is aligned to it, and the
| smaller is subtracted from the larger.  When `b' turns out to be the
| larger, the result sign is `zSign' inverted.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
{
    int32 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig0, zSig1;
    int32 expDiff;
    floatx80 z;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN( a, b STATUS_VAR );
        }
        /* Infinity minus infinity: invalid, default NaN. */
        float_raise( float_flag_invalid STATUS_VAR);
        z.low = floatx80_default_nan_low;
        z.high = floatx80_default_nan_high;
        return z;
    }
    /* An exponent field of zero is treated as exponent 1. */
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    /* No alignment shift happened, so there are no extra low bits. */
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: the zero is negative only when rounding down. */
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
        /* Finite minus infinity: infinity with the inverted sign. */
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    /* An exponent field of zero is treated as exponent 1. */
    if ( aExp == 0 ) ++expDiff;
    /* Align `a' to `b'; shifted-out bits are collected in zSig1. */
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    /* z = b - a, with zSig1 as the low word of the subtrahend. */
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
        /* Infinity minus finite: `a' itself. */
        return a;
    }
    /* An exponent field of zero is treated as exponent 1. */
    if ( bExp == 0 ) --expDiff;
    /* Align `b' to `a'; shifted-out bits are collected in zSig1. */
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    /* z = a - b, with zSig1 as the low word of the subtrahend. */
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return
        normalizeRoundAndPackFloatx80(
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );

}
3922
3923 /*----------------------------------------------------------------------------
3924 | Returns the result of adding the extended double-precision floating-point
3925 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3926 | Standard for Binary Floating-Point Arithmetic.
3927 *----------------------------------------------------------------------------*/
3928
3929 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3930 {
3931     flag aSign, bSign;
3932
3933     aSign = extractFloatx80Sign( a );
3934     bSign = extractFloatx80Sign( b );
3935     if ( aSign == bSign ) {
3936         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3937     }
3938     else {
3939         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3940     }
3941
3942 }
3943
3944 /*----------------------------------------------------------------------------
3945 | Returns the result of subtracting the extended double-precision floating-
3946 | point values `a' and `b'.  The operation is performed according to the
3947 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3948 *----------------------------------------------------------------------------*/
3949
3950 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3951 {
3952     flag aSign, bSign;
3953
3954     aSign = extractFloatx80Sign( a );
3955     bSign = extractFloatx80Sign( b );
3956     if ( aSign == bSign ) {
3957         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3958     }
3959     else {
3960         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3961     }
3962
3963 }
3964
3965 /*----------------------------------------------------------------------------
3966 | Returns the result of multiplying the extended double-precision floating-
3967 | point values `a' and `b'.  The operation is performed according to the
3968 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3969 *----------------------------------------------------------------------------*/
3970
3971 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3972 {
3973     flag aSign, bSign, zSign;
3974     int32 aExp, bExp, zExp;
3975     bits64 aSig, bSig, zSig0, zSig1;
3976     floatx80 z;
3977
3978     aSig = extractFloatx80Frac( a );
3979     aExp = extractFloatx80Exp( a );
3980     aSign = extractFloatx80Sign( a );
3981     bSig = extractFloatx80Frac( b );
3982     bExp = extractFloatx80Exp( b );
3983     bSign = extractFloatx80Sign( b );
3984     zSign = aSign ^ bSign;
3985     if ( aExp == 0x7FFF ) {
3986         if (    (bits64) ( aSig<<1 )
3987              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3988             return propagateFloatx80NaN( a, b STATUS_VAR );
3989         }
3990         if ( ( bExp | bSig ) == 0 ) goto invalid;
3991         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3992     }
3993     if ( bExp == 0x7FFF ) {
3994         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3995         if ( ( aExp | aSig ) == 0 ) {
3996  invalid:
3997             float_raise( float_flag_invalid STATUS_VAR);
3998             z.low = floatx80_default_nan_low;
3999             z.high = floatx80_default_nan_high;
4000             return z;
4001         }
4002         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4003     }
4004     if ( aExp == 0 ) {
4005         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4006         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4007     }
4008     if ( bExp == 0 ) {
4009         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4010         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4011     }
4012     zExp = aExp + bExp - 0x3FFE;
4013     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4014     if ( 0 < (sbits64) zSig0 ) {
4015         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4016         --zExp;
4017     }
4018     return
4019         roundAndPackFloatx80(
4020             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4021
4022 }
4023
4024 /*----------------------------------------------------------------------------
4025 | Returns the result of dividing the extended double-precision floating-point
4026 | value `a' by the corresponding value `b'.  The operation is performed
4027 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4028 *----------------------------------------------------------------------------*/
4029
4030 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4031 {
4032     flag aSign, bSign, zSign;
4033     int32 aExp, bExp, zExp;
4034     bits64 aSig, bSig, zSig0, zSig1;
4035     bits64 rem0, rem1, rem2, term0, term1, term2;
4036     floatx80 z;
4037
4038     aSig = extractFloatx80Frac( a );
4039     aExp = extractFloatx80Exp( a );
4040     aSign = extractFloatx80Sign( a );
4041     bSig = extractFloatx80Frac( b );
4042     bExp = extractFloatx80Exp( b );
4043     bSign = extractFloatx80Sign( b );
4044     zSign = aSign ^ bSign;
4045     if ( aExp == 0x7FFF ) {
4046         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4047         if ( bExp == 0x7FFF ) {
4048             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4049             goto invalid;
4050         }
4051         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4052     }
4053     if ( bExp == 0x7FFF ) {
4054         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4055         return packFloatx80( zSign, 0, 0 );
4056     }
4057     if ( bExp == 0 ) {
4058         if ( bSig == 0 ) {
4059             if ( ( aExp | aSig ) == 0 ) {
4060  invalid:
4061                 float_raise( float_flag_invalid STATUS_VAR);
4062                 z.low = floatx80_default_nan_low;
4063                 z.high = floatx80_default_nan_high;
4064                 return z;
4065             }
4066             float_raise( float_flag_divbyzero STATUS_VAR);
4067             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4068         }
4069         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4070     }
4071     if ( aExp == 0 ) {
4072         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4073         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4074     }
4075     zExp = aExp - bExp + 0x3FFE;
4076     rem1 = 0;
4077     if ( bSig <= aSig ) {
4078         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4079         ++zExp;
4080     }
4081     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4082     mul64To128( bSig, zSig0, &term0, &term1 );
4083     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4084     while ( (sbits64) rem0 < 0 ) {
4085         --zSig0;
4086         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4087     }
4088     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4089     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4090         mul64To128( bSig, zSig1, &term1, &term2 );
4091         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4092         while ( (sbits64) rem1 < 0 ) {
4093             --zSig1;
4094             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4095         }
4096         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4097     }
4098     return
4099         roundAndPackFloatx80(
4100             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4101
4102 }
4103
4104 /*----------------------------------------------------------------------------
4105 | Returns the remainder of the extended double-precision floating-point value
4106 | `a' with respect to the corresponding value `b'.  The operation is performed
4107 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4108 *----------------------------------------------------------------------------*/
4109
4110 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4111 {
4112     flag aSign, zSign;
4113     int32 aExp, bExp, expDiff;
4114     bits64 aSig0, aSig1, bSig;
4115     bits64 q, term0, term1, alternateASig0, alternateASig1;
4116     floatx80 z;
4117
4118     aSig0 = extractFloatx80Frac( a );
4119     aExp = extractFloatx80Exp( a );
4120     aSign = extractFloatx80Sign( a );
4121     bSig = extractFloatx80Frac( b );
4122     bExp = extractFloatx80Exp( b );
4123     if ( aExp == 0x7FFF ) {
4124         if (    (bits64) ( aSig0<<1 )
4125              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4126             return propagateFloatx80NaN( a, b STATUS_VAR );
4127         }
4128         goto invalid;
4129     }
4130     if ( bExp == 0x7FFF ) {
4131         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4132         return a;
4133     }
4134     if ( bExp == 0 ) {
4135         if ( bSig == 0 ) {
4136  invalid:
4137             float_raise( float_flag_invalid STATUS_VAR);
4138             z.low = floatx80_default_nan_low;
4139             z.high = floatx80_default_nan_high;
4140             return z;
4141         }
4142         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4143     }
4144     if ( aExp == 0 ) {
4145         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4146         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4147     }
4148     bSig |= LIT64( 0x8000000000000000 );
4149     zSign = aSign;
4150     expDiff = aExp - bExp;
4151     aSig1 = 0;
4152     if ( expDiff < 0 ) {
4153         if ( expDiff < -1 ) return a;
4154         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4155         expDiff = 0;
4156     }
4157     q = ( bSig <= aSig0 );
4158     if ( q ) aSig0 -= bSig;
4159     expDiff -= 64;
4160     while ( 0 < expDiff ) {
4161         q = estimateDiv128To64( aSig0, aSig1, bSig );
4162         q = ( 2 < q ) ? q - 2 : 0;
4163         mul64To128( bSig, q, &term0, &term1 );
4164         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4165         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4166         expDiff -= 62;
4167     }
4168     expDiff += 64;
4169     if ( 0 < expDiff ) {
4170         q = estimateDiv128To64( aSig0, aSig1, bSig );
4171         q = ( 2 < q ) ? q - 2 : 0;
4172         q >>= 64 - expDiff;
4173         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4174         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4175         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4176         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4177             ++q;
4178             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4179         }
4180     }
4181     else {
4182         term1 = 0;
4183         term0 = bSig;
4184     }
4185     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4186     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4187          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4188               && ( q & 1 ) )
4189        ) {
4190         aSig0 = alternateASig0;
4191         aSig1 = alternateASig1;
4192         zSign = ! zSign;
4193     }
4194     return
4195         normalizeRoundAndPackFloatx80(
4196             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4197
4198 }
4199
4200 /*----------------------------------------------------------------------------
4201 | Returns the square root of the extended double-precision floating-point
4202 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4203 | for Binary Floating-Point Arithmetic.
4204 *----------------------------------------------------------------------------*/
4205
4206 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4207 {
4208     flag aSign;
4209     int32 aExp, zExp;
4210     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4211     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4212     floatx80 z;
4213
4214     aSig0 = extractFloatx80Frac( a );
4215     aExp = extractFloatx80Exp( a );
4216     aSign = extractFloatx80Sign( a );
4217     if ( aExp == 0x7FFF ) {
4218         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4219         if ( ! aSign ) return a;
4220         goto invalid;
4221     }
4222     if ( aSign ) {
4223         if ( ( aExp | aSig0 ) == 0 ) return a;
4224  invalid:
4225         float_raise( float_flag_invalid STATUS_VAR);
4226         z.low = floatx80_default_nan_low;
4227         z.high = floatx80_default_nan_high;
4228         return z;
4229     }
4230     if ( aExp == 0 ) {
4231         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4232         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4233     }
4234     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4235     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4236     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4237     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4238     doubleZSig0 = zSig0<<1;
4239     mul64To128( zSig0, zSig0, &term0, &term1 );
4240     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4241     while ( (sbits64) rem0 < 0 ) {
4242         --zSig0;
4243         doubleZSig0 -= 2;
4244         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4245     }
4246     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4247     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4248         if ( zSig1 == 0 ) zSig1 = 1;
4249         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4250         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4251         mul64To128( zSig1, zSig1, &term2, &term3 );
4252         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4253         while ( (sbits64) rem1 < 0 ) {
4254             --zSig1;
4255             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4256             term3 |= 1;
4257             term2 |= doubleZSig0;
4258             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4259         }
4260         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4261     }
4262     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4263     zSig0 |= doubleZSig0;
4264     return
4265         roundAndPackFloatx80(
4266             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4267
4268 }
4269
4270 /*----------------------------------------------------------------------------
4271 | Returns 1 if the extended double-precision floating-point value `a' is
4272 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
4273 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4274 | Arithmetic.
4275 *----------------------------------------------------------------------------*/
4276
4277 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4278 {
4279
4280     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4281               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4282          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4283               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4284        ) {
4285         if (    floatx80_is_signaling_nan( a )
4286              || floatx80_is_signaling_nan( b ) ) {
4287             float_raise( float_flag_invalid STATUS_VAR);
4288         }
4289         return 0;
4290     }
4291     return
4292            ( a.low == b.low )
4293         && (    ( a.high == b.high )
4294              || (    ( a.low == 0 )
4295                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4296            );
4297
4298 }
4299
4300 /*----------------------------------------------------------------------------
4301 | Returns 1 if the extended double-precision floating-point value `a' is
4302 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4303 | comparison is performed according to the IEC/IEEE Standard for Binary
4304 | Floating-Point Arithmetic.
4305 *----------------------------------------------------------------------------*/
4306
4307 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4308 {
4309     flag aSign, bSign;
4310
4311     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4312               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4313          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4314               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4315        ) {
4316         float_raise( float_flag_invalid STATUS_VAR);
4317         return 0;
4318     }
4319     aSign = extractFloatx80Sign( a );
4320     bSign = extractFloatx80Sign( b );
4321     if ( aSign != bSign ) {
4322         return
4323                aSign
4324             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4325                  == 0 );
4326     }
4327     return
4328           aSign ? le128( b.high, b.low, a.high, a.low )
4329         : le128( a.high, a.low, b.high, b.low );
4330
4331 }
4332
4333 /*----------------------------------------------------------------------------
4334 | Returns 1 if the extended double-precision floating-point value `a' is
4335 | less than the corresponding value `b', and 0 otherwise.  The comparison
4336 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4337 | Arithmetic.
4338 *----------------------------------------------------------------------------*/
4339
4340 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4341 {
4342     flag aSign, bSign;
4343
4344     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4345               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4346          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4347               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4348        ) {
4349         float_raise( float_flag_invalid STATUS_VAR);
4350         return 0;
4351     }
4352     aSign = extractFloatx80Sign( a );
4353     bSign = extractFloatx80Sign( b );
4354     if ( aSign != bSign ) {
4355         return
4356                aSign
4357             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4358                  != 0 );
4359     }
4360     return
4361           aSign ? lt128( b.high, b.low, a.high, a.low )
4362         : lt128( a.high, a.low, b.high, b.low );
4363
4364 }
4365
4366 /*----------------------------------------------------------------------------
4367 | Returns 1 if the extended double-precision floating-point value `a' is equal
4368 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4369 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4370 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4371 *----------------------------------------------------------------------------*/
4372
4373 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4374 {
4375
4376     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4377               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4378          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4379               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4380        ) {
4381         float_raise( float_flag_invalid STATUS_VAR);
4382         return 0;
4383     }
4384     return
4385            ( a.low == b.low )
4386         && (    ( a.high == b.high )
4387              || (    ( a.low == 0 )
4388                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4389            );
4390
4391 }
4392
4393 /*----------------------------------------------------------------------------
4394 | Returns 1 if the extended double-precision floating-point value `a' is less
4395 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4396 | do not cause an exception.  Otherwise, the comparison is performed according
4397 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4398 *----------------------------------------------------------------------------*/
4399
4400 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4401 {
4402     flag aSign, bSign;
4403
4404     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4405               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4406          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4407               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4408        ) {
4409         if (    floatx80_is_signaling_nan( a )
4410              || floatx80_is_signaling_nan( b ) ) {
4411             float_raise( float_flag_invalid STATUS_VAR);
4412         }
4413         return 0;
4414     }
4415     aSign = extractFloatx80Sign( a );
4416     bSign = extractFloatx80Sign( b );
4417     if ( aSign != bSign ) {
4418         return
4419                aSign
4420             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4421                  == 0 );
4422     }
4423     return
4424           aSign ? le128( b.high, b.low, a.high, a.low )
4425         : le128( a.high, a.low, b.high, b.low );
4426
4427 }
4428
4429 /*----------------------------------------------------------------------------
4430 | Returns 1 if the extended double-precision floating-point value `a' is less
4431 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4432 | an exception.  Otherwise, the comparison is performed according to the
4433 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4434 *----------------------------------------------------------------------------*/
4435
4436 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4437 {
4438     flag aSign, bSign;
4439
4440     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4441               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4442          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4443               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4444        ) {
4445         if (    floatx80_is_signaling_nan( a )
4446              || floatx80_is_signaling_nan( b ) ) {
4447             float_raise( float_flag_invalid STATUS_VAR);
4448         }
4449         return 0;
4450     }
4451     aSign = extractFloatx80Sign( a );
4452     bSign = extractFloatx80Sign( b );
4453     if ( aSign != bSign ) {
4454         return
4455                aSign
4456             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4457                  != 0 );
4458     }
4459     return
4460           aSign ? lt128( b.high, b.low, a.high, a.low )
4461         : lt128( a.high, a.low, b.high, b.low );
4462
4463 }
4464
4465 #endif
4466
4467 #ifdef FLOAT128
4468
4469 /*----------------------------------------------------------------------------
4470 | Returns the result of converting the quadruple-precision floating-point
4471 | value `a' to the 32-bit two's complement integer format.  The conversion
4472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4473 | Arithmetic---which means in particular that the conversion is rounded
4474 | according to the current rounding mode.  If `a' is a NaN, the largest
4475 | positive integer is returned.  Otherwise, if the conversion overflows, the
4476 | largest integer with the same sign as `a' is returned.
4477 *----------------------------------------------------------------------------*/
4478
4479 int32 float128_to_int32( float128 a STATUS_PARAM )
4480 {
4481     flag aSign;
4482     int32 aExp, shiftCount;
4483     bits64 aSig0, aSig1;
4484
4485     aSig1 = extractFloat128Frac1( a );
4486     aSig0 = extractFloat128Frac0( a );
4487     aExp = extractFloat128Exp( a );
4488     aSign = extractFloat128Sign( a );
4489     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4490     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4491     aSig0 |= ( aSig1 != 0 );
4492     shiftCount = 0x4028 - aExp;
4493     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4494     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4495
4496 }
4497
4498 /*----------------------------------------------------------------------------
4499 | Returns the result of converting the quadruple-precision floating-point
4500 | value `a' to the 32-bit two's complement integer format.  The conversion
4501 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4502 | Arithmetic, except that the conversion is always rounded toward zero.  If
4503 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4504 | conversion overflows, the largest integer with the same sign as `a' is
4505 | returned.
4506 *----------------------------------------------------------------------------*/
4507
4508 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4509 {
4510     flag aSign;
4511     int32 aExp, shiftCount;
4512     bits64 aSig0, aSig1, savedASig;
4513     int32 z;
4514
4515     aSig1 = extractFloat128Frac1( a );
4516     aSig0 = extractFloat128Frac0( a );
4517     aExp = extractFloat128Exp( a );
4518     aSign = extractFloat128Sign( a );
4519     aSig0 |= ( aSig1 != 0 );
4520     if ( 0x401E < aExp ) {
4521         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4522         goto invalid;
4523     }
4524     else if ( aExp < 0x3FFF ) {
4525         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4526         return 0;
4527     }
4528     aSig0 |= LIT64( 0x0001000000000000 );
4529     shiftCount = 0x402F - aExp;
4530     savedASig = aSig0;
4531     aSig0 >>= shiftCount;
4532     z = aSig0;
4533     if ( aSign ) z = - z;
4534     if ( ( z < 0 ) ^ aSign ) {
4535  invalid:
4536         float_raise( float_flag_invalid STATUS_VAR);
4537         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4538     }
4539     if ( ( aSig0<<shiftCount ) != savedASig ) {
4540         STATUS(float_exception_flags) |= float_flag_inexact;
4541     }
4542     return z;
4543
4544 }
4545
4546 /*----------------------------------------------------------------------------
4547 | Returns the result of converting the quadruple-precision floating-point
4548 | value `a' to the 64-bit two's complement integer format.  The conversion
4549 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4550 | Arithmetic---which means in particular that the conversion is rounded
4551 | according to the current rounding mode.  If `a' is a NaN, the largest
4552 | positive integer is returned.  Otherwise, if the conversion overflows, the
4553 | largest integer with the same sign as `a' is returned.
4554 *----------------------------------------------------------------------------*/
4555
4556 int64 float128_to_int64( float128 a STATUS_PARAM )
4557 {
4558     flag aSign;
4559     int32 aExp, shiftCount;
4560     bits64 aSig0, aSig1;
4561
4562     aSig1 = extractFloat128Frac1( a );
4563     aSig0 = extractFloat128Frac0( a );
4564     aExp = extractFloat128Exp( a );
4565     aSign = extractFloat128Sign( a );
4566     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4567     shiftCount = 0x402F - aExp;
4568     if ( shiftCount <= 0 ) {
4569         if ( 0x403E < aExp ) {
4570             float_raise( float_flag_invalid STATUS_VAR);
4571             if (    ! aSign
4572                  || (    ( aExp == 0x7FFF )
4573                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4574                     )
4575                ) {
4576                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4577             }
4578             return (sbits64) LIT64( 0x8000000000000000 );
4579         }
4580         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4581     }
4582     else {
4583         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4584     }
4585     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4586
4587 }
4588
4589 /*----------------------------------------------------------------------------
4590 | Returns the result of converting the quadruple-precision floating-point
4591 | value `a' to the 64-bit two's complement integer format.  The conversion
4592 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4593 | Arithmetic, except that the conversion is always rounded toward zero.
4594 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4595 | the conversion overflows, the largest integer with the same sign as `a' is
4596 | returned.
4597 *----------------------------------------------------------------------------*/
4598
4599 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4600 {
4601     flag aSign;
4602     int32 aExp, shiftCount;
4603     bits64 aSig0, aSig1;
4604     int64 z;
4605
4606     aSig1 = extractFloat128Frac1( a );
4607     aSig0 = extractFloat128Frac0( a );
4608     aExp = extractFloat128Exp( a );
4609     aSign = extractFloat128Sign( a );
4610     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4611     shiftCount = aExp - 0x402F;
4612     if ( 0 < shiftCount ) {
4613         if ( 0x403E <= aExp ) {
4614             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4615             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4616                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4617                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4618             }
4619             else {
4620                 float_raise( float_flag_invalid STATUS_VAR);
4621                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4622                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4623                 }
4624             }
4625             return (sbits64) LIT64( 0x8000000000000000 );
4626         }
4627         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4628         if ( (bits64) ( aSig1<<shiftCount ) ) {
4629             STATUS(float_exception_flags) |= float_flag_inexact;
4630         }
4631     }
4632     else {
4633         if ( aExp < 0x3FFF ) {
4634             if ( aExp | aSig0 | aSig1 ) {
4635                 STATUS(float_exception_flags) |= float_flag_inexact;
4636             }
4637             return 0;
4638         }
4639         z = aSig0>>( - shiftCount );
4640         if (    aSig1
4641              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4642             STATUS(float_exception_flags) |= float_flag_inexact;
4643         }
4644     }
4645     if ( aSign ) z = - z;
4646     return z;
4647
4648 }
4649
4650 /*----------------------------------------------------------------------------
4651 | Returns the result of converting the quadruple-precision floating-point
4652 | value `a' to the single-precision floating-point format.  The conversion
4653 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4654 | Arithmetic.
4655 *----------------------------------------------------------------------------*/
4656
4657 float32 float128_to_float32( float128 a STATUS_PARAM )
4658 {
4659     flag aSign;
4660     int32 aExp;
4661     bits64 aSig0, aSig1;
4662     bits32 zSig;
4663
4664     aSig1 = extractFloat128Frac1( a );
4665     aSig0 = extractFloat128Frac0( a );
4666     aExp = extractFloat128Exp( a );
4667     aSign = extractFloat128Sign( a );
4668     if ( aExp == 0x7FFF ) {
4669         if ( aSig0 | aSig1 ) {
4670             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4671         }
4672         return packFloat32( aSign, 0xFF, 0 );
4673     }
4674     aSig0 |= ( aSig1 != 0 );
4675     shift64RightJamming( aSig0, 18, &aSig0 );
4676     zSig = aSig0;
4677     if ( aExp || zSig ) {
4678         zSig |= 0x40000000;
4679         aExp -= 0x3F81;
4680     }
4681     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4682
4683 }
4684
4685 /*----------------------------------------------------------------------------
4686 | Returns the result of converting the quadruple-precision floating-point
4687 | value `a' to the double-precision floating-point format.  The conversion
4688 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4689 | Arithmetic.
4690 *----------------------------------------------------------------------------*/
4691
4692 float64 float128_to_float64( float128 a STATUS_PARAM )
4693 {
4694     flag aSign;
4695     int32 aExp;
4696     bits64 aSig0, aSig1;
4697
4698     aSig1 = extractFloat128Frac1( a );
4699     aSig0 = extractFloat128Frac0( a );
4700     aExp = extractFloat128Exp( a );
4701     aSign = extractFloat128Sign( a );
4702     if ( aExp == 0x7FFF ) {
4703         if ( aSig0 | aSig1 ) {
4704             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4705         }
4706         return packFloat64( aSign, 0x7FF, 0 );
4707     }
4708     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4709     aSig0 |= ( aSig1 != 0 );
4710     if ( aExp || aSig0 ) {
4711         aSig0 |= LIT64( 0x4000000000000000 );
4712         aExp -= 0x3C01;
4713     }
4714     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4715
4716 }
4717
4718 #ifdef FLOATX80
4719
4720 /*----------------------------------------------------------------------------
4721 | Returns the result of converting the quadruple-precision floating-point
4722 | value `a' to the extended double-precision floating-point format.  The
4723 | conversion is performed according to the IEC/IEEE Standard for Binary
4724 | Floating-Point Arithmetic.
4725 *----------------------------------------------------------------------------*/
4726
4727 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4728 {
4729     flag aSign;
4730     int32 aExp;
4731     bits64 aSig0, aSig1;
4732
4733     aSig1 = extractFloat128Frac1( a );
4734     aSig0 = extractFloat128Frac0( a );
4735     aExp = extractFloat128Exp( a );
4736     aSign = extractFloat128Sign( a );
4737     if ( aExp == 0x7FFF ) {
4738         if ( aSig0 | aSig1 ) {
4739             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4740         }
4741         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4742     }
4743     if ( aExp == 0 ) {
4744         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4745         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4746     }
4747     else {
4748         aSig0 |= LIT64( 0x0001000000000000 );
4749     }
4750     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4751     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4752
4753 }
4754
4755 #endif
4756
4757 /*----------------------------------------------------------------------------
4758 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4759 | returns the result as a quadruple-precision floating-point value.  The
4760 | operation is performed according to the IEC/IEEE Standard for Binary
4761 | Floating-Point Arithmetic.
4762 *----------------------------------------------------------------------------*/
4763
4764 float128 float128_round_to_int( float128 a STATUS_PARAM )
4765 {
4766     flag aSign;
4767     int32 aExp;
4768     bits64 lastBitMask, roundBitsMask;
4769     int8 roundingMode;
4770     float128 z;
4771
4772     aExp = extractFloat128Exp( a );
4773     if ( 0x402F <= aExp ) {
4774         if ( 0x406F <= aExp ) {
4775             if (    ( aExp == 0x7FFF )
4776                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4777                ) {
4778                 return propagateFloat128NaN( a, a STATUS_VAR );
4779             }
4780             return a;
4781         }
4782         lastBitMask = 1;
4783         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4784         roundBitsMask = lastBitMask - 1;
4785         z = a;
4786         roundingMode = STATUS(float_rounding_mode);
4787         if ( roundingMode == float_round_nearest_even ) {
4788             if ( lastBitMask ) {
4789                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4790                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4791             }
4792             else {
4793                 if ( (sbits64) z.low < 0 ) {
4794                     ++z.high;
4795                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4796                 }
4797             }
4798         }
4799         else if ( roundingMode != float_round_to_zero ) {
4800             if (   extractFloat128Sign( z )
4801                  ^ ( roundingMode == float_round_up ) ) {
4802                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4803             }
4804         }
4805         z.low &= ~ roundBitsMask;
4806     }
4807     else {
4808         if ( aExp < 0x3FFF ) {
4809             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4810             STATUS(float_exception_flags) |= float_flag_inexact;
4811             aSign = extractFloat128Sign( a );
4812             switch ( STATUS(float_rounding_mode) ) {
4813              case float_round_nearest_even:
4814                 if (    ( aExp == 0x3FFE )
4815                      && (   extractFloat128Frac0( a )
4816                           | extractFloat128Frac1( a ) )
4817                    ) {
4818                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4819                 }
4820                 break;
4821              case float_round_down:
4822                 return
4823                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4824                     : packFloat128( 0, 0, 0, 0 );
4825              case float_round_up:
4826                 return
4827                       aSign ? packFloat128( 1, 0, 0, 0 )
4828                     : packFloat128( 0, 0x3FFF, 0, 0 );
4829             }
4830             return packFloat128( aSign, 0, 0, 0 );
4831         }
4832         lastBitMask = 1;
4833         lastBitMask <<= 0x402F - aExp;
4834         roundBitsMask = lastBitMask - 1;
4835         z.low = 0;
4836         z.high = a.high;
4837         roundingMode = STATUS(float_rounding_mode);
4838         if ( roundingMode == float_round_nearest_even ) {
4839             z.high += lastBitMask>>1;
4840             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4841                 z.high &= ~ lastBitMask;
4842             }
4843         }
4844         else if ( roundingMode != float_round_to_zero ) {
4845             if (   extractFloat128Sign( z )
4846                  ^ ( roundingMode == float_round_up ) ) {
4847                 z.high |= ( a.low != 0 );
4848                 z.high += roundBitsMask;
4849             }
4850         }
4851         z.high &= ~ roundBitsMask;
4852     }
4853     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4854         STATUS(float_exception_flags) |= float_flag_inexact;
4855     }
4856     return z;
4857
4858 }
4859
4860 /*----------------------------------------------------------------------------
4861 | Returns the result of adding the absolute values of the quadruple-precision
4862 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4863 | before being returned.  `zSign' is ignored if the result is a NaN.
4864 | The addition is performed according to the IEC/IEEE Standard for Binary
4865 | Floating-Point Arithmetic.
4866 *----------------------------------------------------------------------------*/
4867
4868 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4869 {
4870     int32 aExp, bExp, zExp;
4871     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4872     int32 expDiff;
4873
4874     aSig1 = extractFloat128Frac1( a );
4875     aSig0 = extractFloat128Frac0( a );
4876     aExp = extractFloat128Exp( a );
4877     bSig1 = extractFloat128Frac1( b );
4878     bSig0 = extractFloat128Frac0( b );
4879     bExp = extractFloat128Exp( b );
4880     expDiff = aExp - bExp;
4881     if ( 0 < expDiff ) {
4882         if ( aExp == 0x7FFF ) {
4883             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4884             return a;
4885         }
4886         if ( bExp == 0 ) {
4887             --expDiff;
4888         }
4889         else {
4890             bSig0 |= LIT64( 0x0001000000000000 );
4891         }
4892         shift128ExtraRightJamming(
4893             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4894         zExp = aExp;
4895     }
4896     else if ( expDiff < 0 ) {
4897         if ( bExp == 0x7FFF ) {
4898             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4899             return packFloat128( zSign, 0x7FFF, 0, 0 );
4900         }
4901         if ( aExp == 0 ) {
4902             ++expDiff;
4903         }
4904         else {
4905             aSig0 |= LIT64( 0x0001000000000000 );
4906         }
4907         shift128ExtraRightJamming(
4908             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4909         zExp = bExp;
4910     }
4911     else {
4912         if ( aExp == 0x7FFF ) {
4913             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4914                 return propagateFloat128NaN( a, b STATUS_VAR );
4915             }
4916             return a;
4917         }
4918         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4919         if ( aExp == 0 ) {
4920             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
4921             return packFloat128( zSign, 0, zSig0, zSig1 );
4922         }
4923         zSig2 = 0;
4924         zSig0 |= LIT64( 0x0002000000000000 );
4925         zExp = aExp;
4926         goto shiftRight1;
4927     }
4928     aSig0 |= LIT64( 0x0001000000000000 );
4929     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4930     --zExp;
4931     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4932     ++zExp;
4933  shiftRight1:
4934     shift128ExtraRightJamming(
4935         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4936  roundAndPack:
4937     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4938
4939 }
4940
4941 /*----------------------------------------------------------------------------
4942 | Returns the result of subtracting the absolute values of the quadruple-
4943 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
4944 | difference is negated before being returned.  `zSign' is ignored if the
4945 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4946 | Standard for Binary Floating-Point Arithmetic.
4947 *----------------------------------------------------------------------------*/
4948
4949 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4950 {
4951     int32 aExp, bExp, zExp;
4952     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4953     int32 expDiff;
4954     float128 z;
4955
4956     aSig1 = extractFloat128Frac1( a );
4957     aSig0 = extractFloat128Frac0( a );
4958     aExp = extractFloat128Exp( a );
4959     bSig1 = extractFloat128Frac1( b );
4960     bSig0 = extractFloat128Frac0( b );
4961     bExp = extractFloat128Exp( b );
4962     expDiff = aExp - bExp;
4963     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4964     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4965     if ( 0 < expDiff ) goto aExpBigger;
4966     if ( expDiff < 0 ) goto bExpBigger;
4967     if ( aExp == 0x7FFF ) {
4968         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4969             return propagateFloat128NaN( a, b STATUS_VAR );
4970         }
4971         float_raise( float_flag_invalid STATUS_VAR);
4972         z.low = float128_default_nan_low;
4973         z.high = float128_default_nan_high;
4974         return z;
4975     }
4976     if ( aExp == 0 ) {
4977         aExp = 1;
4978         bExp = 1;
4979     }
4980     if ( bSig0 < aSig0 ) goto aBigger;
4981     if ( aSig0 < bSig0 ) goto bBigger;
4982     if ( bSig1 < aSig1 ) goto aBigger;
4983     if ( aSig1 < bSig1 ) goto bBigger;
4984     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4985  bExpBigger:
4986     if ( bExp == 0x7FFF ) {
4987         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4988         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4989     }
4990     if ( aExp == 0 ) {
4991         ++expDiff;
4992     }
4993     else {
4994         aSig0 |= LIT64( 0x4000000000000000 );
4995     }
4996     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4997     bSig0 |= LIT64( 0x4000000000000000 );
4998  bBigger:
4999     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5000     zExp = bExp;
5001     zSign ^= 1;
5002     goto normalizeRoundAndPack;
5003  aExpBigger:
5004     if ( aExp == 0x7FFF ) {
5005         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5006         return a;
5007     }
5008     if ( bExp == 0 ) {
5009         --expDiff;
5010     }
5011     else {
5012         bSig0 |= LIT64( 0x4000000000000000 );
5013     }
5014     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5015     aSig0 |= LIT64( 0x4000000000000000 );
5016  aBigger:
5017     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5018     zExp = aExp;
5019  normalizeRoundAndPack:
5020     --zExp;
5021     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5022
5023 }
5024
5025 /*----------------------------------------------------------------------------
5026 | Returns the result of adding the quadruple-precision floating-point values
5027 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5028 | for Binary Floating-Point Arithmetic.
5029 *----------------------------------------------------------------------------*/
5030
5031 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5032 {
5033     flag aSign, bSign;
5034
5035     aSign = extractFloat128Sign( a );
5036     bSign = extractFloat128Sign( b );
5037     if ( aSign == bSign ) {
5038         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5039     }
5040     else {
5041         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5042     }
5043
5044 }
5045
5046 /*----------------------------------------------------------------------------
5047 | Returns the result of subtracting the quadruple-precision floating-point
5048 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5049 | Standard for Binary Floating-Point Arithmetic.
5050 *----------------------------------------------------------------------------*/
5051
5052 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5053 {
5054     flag aSign, bSign;
5055
5056     aSign = extractFloat128Sign( a );
5057     bSign = extractFloat128Sign( b );
5058     if ( aSign == bSign ) {
5059         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5060     }
5061     else {
5062         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5063     }
5064
5065 }
5066
5067 /*----------------------------------------------------------------------------
5068 | Returns the result of multiplying the quadruple-precision floating-point
5069 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5070 | Standard for Binary Floating-Point Arithmetic.
5071 *----------------------------------------------------------------------------*/
5072
5073 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5074 {
5075     flag aSign, bSign, zSign;
5076     int32 aExp, bExp, zExp;
5077     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5078     float128 z;
5079
5080     aSig1 = extractFloat128Frac1( a );
5081     aSig0 = extractFloat128Frac0( a );
5082     aExp = extractFloat128Exp( a );
5083     aSign = extractFloat128Sign( a );
5084     bSig1 = extractFloat128Frac1( b );
5085     bSig0 = extractFloat128Frac0( b );
5086     bExp = extractFloat128Exp( b );
5087     bSign = extractFloat128Sign( b );
5088     zSign = aSign ^ bSign;
5089     if ( aExp == 0x7FFF ) {
5090         if (    ( aSig0 | aSig1 )
5091              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5092             return propagateFloat128NaN( a, b STATUS_VAR );
5093         }
5094         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5095         return packFloat128( zSign, 0x7FFF, 0, 0 );
5096     }
5097     if ( bExp == 0x7FFF ) {
5098         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5099         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5100  invalid:
5101             float_raise( float_flag_invalid STATUS_VAR);
5102             z.low = float128_default_nan_low;
5103             z.high = float128_default_nan_high;
5104             return z;
5105         }
5106         return packFloat128( zSign, 0x7FFF, 0, 0 );
5107     }
5108     if ( aExp == 0 ) {
5109         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5110         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5111     }
5112     if ( bExp == 0 ) {
5113         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5114         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5115     }
5116     zExp = aExp + bExp - 0x4000;
5117     aSig0 |= LIT64( 0x0001000000000000 );
5118     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5119     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5120     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5121     zSig2 |= ( zSig3 != 0 );
5122     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5123         shift128ExtraRightJamming(
5124             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5125         ++zExp;
5126     }
5127     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5128
5129 }
5130
5131 /*----------------------------------------------------------------------------
5132 | Returns the result of dividing the quadruple-precision floating-point value
5133 | `a' by the corresponding value `b'.  The operation is performed according to
5134 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5135 *----------------------------------------------------------------------------*/
5136
5137 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5138 {
5139     flag aSign, bSign, zSign;
5140     int32 aExp, bExp, zExp;
5141     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5142     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5143     float128 z;
5144
5145     aSig1 = extractFloat128Frac1( a );
5146     aSig0 = extractFloat128Frac0( a );
5147     aExp = extractFloat128Exp( a );
5148     aSign = extractFloat128Sign( a );
5149     bSig1 = extractFloat128Frac1( b );
5150     bSig0 = extractFloat128Frac0( b );
5151     bExp = extractFloat128Exp( b );
5152     bSign = extractFloat128Sign( b );
5153     zSign = aSign ^ bSign;
5154     if ( aExp == 0x7FFF ) {
5155         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5156         if ( bExp == 0x7FFF ) {
5157             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5158             goto invalid;
5159         }
5160         return packFloat128( zSign, 0x7FFF, 0, 0 );
5161     }
5162     if ( bExp == 0x7FFF ) {
5163         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5164         return packFloat128( zSign, 0, 0, 0 );
5165     }
5166     if ( bExp == 0 ) {
5167         if ( ( bSig0 | bSig1 ) == 0 ) {
5168             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5169  invalid:
5170                 float_raise( float_flag_invalid STATUS_VAR);
5171                 z.low = float128_default_nan_low;
5172                 z.high = float128_default_nan_high;
5173                 return z;
5174             }
5175             float_raise( float_flag_divbyzero STATUS_VAR);
5176             return packFloat128( zSign, 0x7FFF, 0, 0 );
5177         }
5178         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5179     }
5180     if ( aExp == 0 ) {
5181         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5182         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5183     }
5184     zExp = aExp - bExp + 0x3FFD;
5185     shortShift128Left(
5186         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5187     shortShift128Left(
5188         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5189     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5190         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5191         ++zExp;
5192     }
5193     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5194     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5195     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5196     while ( (sbits64) rem0 < 0 ) {
5197         --zSig0;
5198         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5199     }
5200     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5201     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5202         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5203         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5204         while ( (sbits64) rem1 < 0 ) {
5205             --zSig1;
5206             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5207         }
5208         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5209     }
5210     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5211     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5212
5213 }
5214
5215 /*----------------------------------------------------------------------------
5216 | Returns the remainder of the quadruple-precision floating-point value `a'
5217 | with respect to the corresponding value `b'.  The operation is performed
5218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5219 *----------------------------------------------------------------------------*/
5220
5221 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5222 {
5223     flag aSign, zSign;
5224     int32 aExp, bExp, expDiff;
5225     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5226     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5227     sbits64 sigMean0;
5228     float128 z;
5229
5230     aSig1 = extractFloat128Frac1( a );
5231     aSig0 = extractFloat128Frac0( a );
5232     aExp = extractFloat128Exp( a );
5233     aSign = extractFloat128Sign( a );
5234     bSig1 = extractFloat128Frac1( b );
5235     bSig0 = extractFloat128Frac0( b );
5236     bExp = extractFloat128Exp( b );
5237     if ( aExp == 0x7FFF ) {
5238         if (    ( aSig0 | aSig1 )
5239              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5240             return propagateFloat128NaN( a, b STATUS_VAR );
5241         }
5242         goto invalid;
5243     }
5244     if ( bExp == 0x7FFF ) {
5245         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5246         return a;
5247     }
5248     if ( bExp == 0 ) {
5249         if ( ( bSig0 | bSig1 ) == 0 ) {
5250  invalid:
5251             float_raise( float_flag_invalid STATUS_VAR);
5252             z.low = float128_default_nan_low;
5253             z.high = float128_default_nan_high;
5254             return z;
5255         }
5256         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5257     }
5258     if ( aExp == 0 ) {
5259         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5260         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5261     }
5262     expDiff = aExp - bExp;
5263     if ( expDiff < -1 ) return a;
5264     shortShift128Left(
5265         aSig0 | LIT64( 0x0001000000000000 ),
5266         aSig1,
5267         15 - ( expDiff < 0 ),
5268         &aSig0,
5269         &aSig1
5270     );
5271     shortShift128Left(
5272         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5273     q = le128( bSig0, bSig1, aSig0, aSig1 );
5274     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5275     expDiff -= 64;
5276     while ( 0 < expDiff ) {
5277         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5278         q = ( 4 < q ) ? q - 4 : 0;
5279         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5280         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5281         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5282         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5283         expDiff -= 61;
5284     }
5285     if ( -64 < expDiff ) {
5286         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5287         q = ( 4 < q ) ? q - 4 : 0;
5288         q >>= - expDiff;
5289         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5290         expDiff += 52;
5291         if ( expDiff < 0 ) {
5292             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5293         }
5294         else {
5295             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5296         }
5297         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5298         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5299     }
5300     else {
5301         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5302         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5303     }
5304     do {
5305         alternateASig0 = aSig0;
5306         alternateASig1 = aSig1;
5307         ++q;
5308         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5309     } while ( 0 <= (sbits64) aSig0 );
5310     add128(
5311         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5312     if (    ( sigMean0 < 0 )
5313          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5314         aSig0 = alternateASig0;
5315         aSig1 = alternateASig1;
5316     }
5317     zSign = ( (sbits64) aSig0 < 0 );
5318     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5319     return
5320         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5321
5322 }
5323
5324 /*----------------------------------------------------------------------------
5325 | Returns the square root of the quadruple-precision floating-point value `a'.
5326 | The operation is performed according to the IEC/IEEE Standard for Binary
5327 | Floating-Point Arithmetic.
5328 *----------------------------------------------------------------------------*/
5329
5330 float128 float128_sqrt( float128 a STATUS_PARAM )
5331 {
5332     flag aSign;
5333     int32 aExp, zExp;
5334     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5335     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5336     float128 z;
5337
5338     aSig1 = extractFloat128Frac1( a );
5339     aSig0 = extractFloat128Frac0( a );
5340     aExp = extractFloat128Exp( a );
5341     aSign = extractFloat128Sign( a );
5342     if ( aExp == 0x7FFF ) {
5343         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5344         if ( ! aSign ) return a;
5345         goto invalid;
5346     }
5347     if ( aSign ) {
5348         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5349  invalid:
5350         float_raise( float_flag_invalid STATUS_VAR);
5351         z.low = float128_default_nan_low;
5352         z.high = float128_default_nan_high;
5353         return z;
5354     }
5355     if ( aExp == 0 ) {
5356         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5357         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5358     }
5359     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5360     aSig0 |= LIT64( 0x0001000000000000 );
5361     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5362     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5363     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5364     doubleZSig0 = zSig0<<1;
5365     mul64To128( zSig0, zSig0, &term0, &term1 );
5366     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5367     while ( (sbits64) rem0 < 0 ) {
5368         --zSig0;
5369         doubleZSig0 -= 2;
5370         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5371     }
5372     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5373     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5374         if ( zSig1 == 0 ) zSig1 = 1;
5375         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5376         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5377         mul64To128( zSig1, zSig1, &term2, &term3 );
5378         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5379         while ( (sbits64) rem1 < 0 ) {
5380             --zSig1;
5381             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5382             term3 |= 1;
5383             term2 |= doubleZSig0;
5384             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5385         }
5386         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5387     }
5388     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5389     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5390
5391 }
5392
5393 /*----------------------------------------------------------------------------
5394 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5395 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5396 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5397 *----------------------------------------------------------------------------*/
5398
5399 int float128_eq( float128 a, float128 b STATUS_PARAM )
5400 {
5401
5402     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5403               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5404          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5405               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5406        ) {
5407         if (    float128_is_signaling_nan( a )
5408              || float128_is_signaling_nan( b ) ) {
5409             float_raise( float_flag_invalid STATUS_VAR);
5410         }
5411         return 0;
5412     }
5413     return
5414            ( a.low == b.low )
5415         && (    ( a.high == b.high )
5416              || (    ( a.low == 0 )
5417                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5418            );
5419
5420 }
5421
5422 /*----------------------------------------------------------------------------
5423 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5424 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5425 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5426 | Arithmetic.
5427 *----------------------------------------------------------------------------*/
5428
5429 int float128_le( float128 a, float128 b STATUS_PARAM )
5430 {
5431     flag aSign, bSign;
5432
5433     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5434               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5435          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5436               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5437        ) {
5438         float_raise( float_flag_invalid STATUS_VAR);
5439         return 0;
5440     }
5441     aSign = extractFloat128Sign( a );
5442     bSign = extractFloat128Sign( b );
5443     if ( aSign != bSign ) {
5444         return
5445                aSign
5446             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5447                  == 0 );
5448     }
5449     return
5450           aSign ? le128( b.high, b.low, a.high, a.low )
5451         : le128( a.high, a.low, b.high, b.low );
5452
5453 }
5454
5455 /*----------------------------------------------------------------------------
5456 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5457 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5458 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5459 *----------------------------------------------------------------------------*/
5460
5461 int float128_lt( float128 a, float128 b STATUS_PARAM )
5462 {
5463     flag aSign, bSign;
5464
5465     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5466               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5467          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5468               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5469        ) {
5470         float_raise( float_flag_invalid STATUS_VAR);
5471         return 0;
5472     }
5473     aSign = extractFloat128Sign( a );
5474     bSign = extractFloat128Sign( b );
5475     if ( aSign != bSign ) {
5476         return
5477                aSign
5478             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5479                  != 0 );
5480     }
5481     return
5482           aSign ? lt128( b.high, b.low, a.high, a.low )
5483         : lt128( a.high, a.low, b.high, b.low );
5484
5485 }
5486
5487 /*----------------------------------------------------------------------------
5488 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5489 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5490 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5491 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5492 *----------------------------------------------------------------------------*/
5493
5494 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5495 {
5496
5497     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5498               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5499          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5500               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5501        ) {
5502         float_raise( float_flag_invalid STATUS_VAR);
5503         return 0;
5504     }
5505     return
5506            ( a.low == b.low )
5507         && (    ( a.high == b.high )
5508              || (    ( a.low == 0 )
5509                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5510            );
5511
5512 }
5513
5514 /*----------------------------------------------------------------------------
5515 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5516 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5517 | cause an exception.  Otherwise, the comparison is performed according to the
5518 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5519 *----------------------------------------------------------------------------*/
5520
5521 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5522 {
5523     flag aSign, bSign;
5524
5525     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5526               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5527          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5528               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5529        ) {
5530         if (    float128_is_signaling_nan( a )
5531              || float128_is_signaling_nan( b ) ) {
5532             float_raise( float_flag_invalid STATUS_VAR);
5533         }
5534         return 0;
5535     }
5536     aSign = extractFloat128Sign( a );
5537     bSign = extractFloat128Sign( b );
5538     if ( aSign != bSign ) {
5539         return
5540                aSign
5541             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5542                  == 0 );
5543     }
5544     return
5545           aSign ? le128( b.high, b.low, a.high, a.low )
5546         : le128( a.high, a.low, b.high, b.low );
5547
5548 }
5549
5550 /*----------------------------------------------------------------------------
5551 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5552 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5553 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5554 | Standard for Binary Floating-Point Arithmetic.
5555 *----------------------------------------------------------------------------*/
5556
5557 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5558 {
5559     flag aSign, bSign;
5560
5561     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5562               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5563          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5564               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5565        ) {
5566         if (    float128_is_signaling_nan( a )
5567              || float128_is_signaling_nan( b ) ) {
5568             float_raise( float_flag_invalid STATUS_VAR);
5569         }
5570         return 0;
5571     }
5572     aSign = extractFloat128Sign( a );
5573     bSign = extractFloat128Sign( b );
5574     if ( aSign != bSign ) {
5575         return
5576                aSign
5577             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5578                  != 0 );
5579     }
5580     return
5581           aSign ? lt128( b.high, b.low, a.high, a.low )
5582         : lt128( a.high, a.low, b.high, b.low );
5583
5584 }
5585
5586 #endif
5587
5588 /* misc functions */
5589 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5590 {
5591     return int64_to_float32(a STATUS_VAR);
5592 }
5593
5594 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5595 {
5596     return int64_to_float64(a STATUS_VAR);
5597 }
5598
5599 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5600 {
5601     int64_t v;
5602     unsigned int res;
5603
5604     v = float32_to_int64(a STATUS_VAR);
5605     if (v < 0) {
5606         res = 0;
5607         float_raise( float_flag_invalid STATUS_VAR);
5608     } else if (v > 0xffffffff) {
5609         res = 0xffffffff;
5610         float_raise( float_flag_invalid STATUS_VAR);
5611     } else {
5612         res = v;
5613     }
5614     return res;
5615 }
5616
5617 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5618 {
5619     int64_t v;
5620     unsigned int res;
5621
5622     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5623     if (v < 0) {
5624         res = 0;
5625         float_raise( float_flag_invalid STATUS_VAR);
5626     } else if (v > 0xffffffff) {
5627         res = 0xffffffff;
5628         float_raise( float_flag_invalid STATUS_VAR);
5629     } else {
5630         res = v;
5631     }
5632     return res;
5633 }
5634
5635 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5636 {
5637     int64_t v;
5638     unsigned int res;
5639
5640     v = float64_to_int64(a STATUS_VAR);
5641     if (v < 0) {
5642         res = 0;
5643         float_raise( float_flag_invalid STATUS_VAR);
5644     } else if (v > 0xffffffff) {
5645         res = 0xffffffff;
5646         float_raise( float_flag_invalid STATUS_VAR);
5647     } else {
5648         res = v;
5649     }
5650     return res;
5651 }
5652
5653 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5654 {
5655     int64_t v;
5656     unsigned int res;
5657
5658     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5659     if (v < 0) {
5660         res = 0;
5661         float_raise( float_flag_invalid STATUS_VAR);
5662     } else if (v > 0xffffffff) {
5663         res = 0xffffffff;
5664         float_raise( float_flag_invalid STATUS_VAR);
5665     } else {
5666         res = v;
5667     }
5668     return res;
5669 }
5670
5671 /* FIXME: This looks broken.  */
5672 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5673 {
5674     int64_t v;
5675
5676     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5677     v += float64_val(a);
5678     v = float64_to_int64(make_float64(v) STATUS_VAR);
5679
5680     return v - INT64_MIN;
5681 }
5682
5683 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5684 {
5685     int64_t v;
5686
5687     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5688     v += float64_val(a);
5689     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5690
5691     return v - INT64_MIN;
5692 }
5693
5694 #define COMPARE(s, nan_exp)                                                  \
5695 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5696                                       int is_quiet STATUS_PARAM )            \
5697 {                                                                            \
5698     flag aSign, bSign;                                                       \
5699     bits ## s av, bv;                                                        \
5700                                                                              \
5701     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5702          extractFloat ## s ## Frac( a ) ) ||                                 \
5703         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5704           extractFloat ## s ## Frac( b ) )) {                                \
5705         if (!is_quiet ||                                                     \
5706             float ## s ## _is_signaling_nan( a ) ||                          \
5707             float ## s ## _is_signaling_nan( b ) ) {                         \
5708             float_raise( float_flag_invalid STATUS_VAR);                     \
5709         }                                                                    \
5710         return float_relation_unordered;                                     \
5711     }                                                                        \
5712     aSign = extractFloat ## s ## Sign( a );                                  \
5713     bSign = extractFloat ## s ## Sign( b );                                  \
5714     av = float ## s ## _val(a);                                              \
5715     bv = float ## s ## _val(b);                                              \
5716     if ( aSign != bSign ) {                                                  \
5717         if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5718             /* zero case */                                                  \
5719             return float_relation_equal;                                     \
5720         } else {                                                             \
5721             return 1 - (2 * aSign);                                          \
5722         }                                                                    \
5723     } else {                                                                 \
5724         if (av == bv) {                                                      \
5725             return float_relation_equal;                                     \
5726         } else {                                                             \
5727             return 1 - 2 * (aSign ^ ( av < bv ));                            \
5728         }                                                                    \
5729     }                                                                        \
5730 }                                                                            \
5731                                                                              \
5732 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5733 {                                                                            \
5734     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5735 }                                                                            \
5736                                                                              \
5737 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5738 {                                                                            \
5739     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5740 }
5741
5742 COMPARE(32, 0xff)
5743 COMPARE(64, 0x7ff)
5744
#ifdef FLOAT128

/*----------------------------------------------------------------------------
| Three-way comparison of the quadruple-precision values `a' and `b'.
| Returns float_relation_unordered if either operand is a NaN; in that case
| the invalid exception is raised when `is_quiet' is zero or when one of the
| operands is a signaling NaN.  Otherwise returns float_relation_equal when
| the operands are equal (zeros of either sign are equal), -1 when `a' is
| less than `b', and 1 when `a' is greater than `b'.
|
| These routines are compiled only when FLOAT128 is defined, like the other
| float128 code in this file.
*----------------------------------------------------------------------------*/
INLINE int float128_compare_internal( float128 a, float128 b,
                                      int is_quiet STATUS_PARAM )
{
    flag aSign, bSign;

    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan( a ) ||
            float128_is_signaling_nan( b ) ) {
            float_raise( float_flag_invalid STATUS_VAR);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        /* The shift discards the sign bit of the combined high words, so the
           test is true only when both operands are zeros. */
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case */
            return float_relation_equal;
        } else {
            /* the negative operand is the smaller one */
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same sign: the encodings order like the magnitudes, and the
               sense is reversed when both operands are negative. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

/* Three-way comparison that raises the invalid exception for any NaN
   operand; see float128_compare_internal for the result values. */
int float128_compare( float128 a, float128 b STATUS_PARAM )
{
    return float128_compare_internal(a, b, 0 STATUS_VAR);
}

/* Three-way comparison that raises the invalid exception only for a
   signaling NaN operand; see float128_compare_internal for the result
   values. */
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
{
    return float128_compare_internal(a, b, 1 STATUS_VAR);
}

#endif
5788
5789 /* Multiply A by 2 raised to the power N.  */
5790 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5791 {
5792     flag aSign;
5793     int16 aExp;
5794     bits32 aSig;
5795
5796     aSig = extractFloat32Frac( a );
5797     aExp = extractFloat32Exp( a );
5798     aSign = extractFloat32Sign( a );
5799
5800     if ( aExp == 0xFF ) {
5801         return a;
5802     }
5803     if ( aExp != 0 )
5804         aSig |= 0x00800000;
5805     else if ( aSig == 0 )
5806         return a;
5807
5808     aExp += n - 1;
5809     aSig <<= 7;
5810     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5811 }
5812
5813 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5814 {
5815     flag aSign;
5816     int16 aExp;
5817     bits64 aSig;
5818
5819     aSig = extractFloat64Frac( a );
5820     aExp = extractFloat64Exp( a );
5821     aSign = extractFloat64Sign( a );
5822
5823     if ( aExp == 0x7FF ) {
5824         return a;
5825     }
5826     if ( aExp != 0 )
5827         aSig |= LIT64( 0x0010000000000000 );
5828     else if ( aSig == 0 )
5829         return a;
5830
5831     aExp += n - 1;
5832     aSig <<= 10;
5833     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5834 }
5835
#ifdef FLOATX80
/*----------------------------------------------------------------------------
| Returns the extended double-precision value `a' multiplied by 2 raised to
| the power `n'.  An input whose exponent field is 0x7FFF (infinity or NaN)
| and a zero input are returned unchanged.  Every other result is produced by
| normalizeRoundAndPackFloatx80 using the current rounding precision.
*----------------------------------------------------------------------------*/
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
{
    flag aSign;
    int32 aExp;
    bits64 aSig;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );

    /* The exponent field of this format is 15 bits wide, so the all-ones
       value marking infinities and NaNs is 0x7FFF, not 0x7FF. */
    if ( aExp == 0x7FFF ) {
        return a;
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) {
            return a;
        }
        /* A subnormal is scaled as if its exponent field were 1; leaving it
           at 0 halves the result. */
        aExp++;
    }

    /* Finite values of this format span fewer than 2^16 binades, so a larger
       shift cannot change the outcome.  Clamping keeps the sum below free of
       overflow. */
    if ( n > 0x10000 ) {
        n = 0x10000;
    }
    else if ( n < -0x10000 ) {
        n = -0x10000;
    }

    aExp += n;
    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
                                          aSign, aExp, aSig, 0 STATUS_VAR );
}
#endif
5858
#ifdef FLOAT128
/*----------------------------------------------------------------------------
| Returns the quadruple-precision value `a' multiplied by 2 raised to the
| power `n'.  An input whose exponent field is 0x7FFF (infinity or NaN) and a
| zero input are returned unchanged.  Every other result is produced by
| normalizeRoundAndPackFloat128.
*----------------------------------------------------------------------------*/
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
{
    flag aSign;
    int32 aExp;
    bits64 aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        return a;
    }
    if ( aExp != 0 ) {
        aSig0 |= LIT64( 0x0001000000000000 );
    }
    else if ( aSig0 == 0 && aSig1 == 0 ) {
        return a;
    }
    else {
        /* A subnormal has no implicit bit but is scaled as if its exponent
           field were 1; leaving it at 0 halves the result. */
        aExp++;
    }

    /* Finite quadruple-precision values span fewer than 2^16 binades, so a
       larger shift cannot change the outcome.  Clamping keeps the sum below
       free of signed overflow. */
    if ( n > 0x10000 ) {
        n = 0x10000;
    }
    else if ( n < -0x10000 ) {
        n = -0x10000;
    }

    aExp += n - 1;
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
                                          STATUS_VAR );

}
#endif