release/src-rt-6.x.4708/linux/linux-2.6.36/arch/arm/nwfpe/softfloat.c

   1 /*
   2 ===============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
   5 Arithmetic Package, Release 2.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  15 arithmetic/softfloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22
  23 Derivative works are acceptable, even for commercial purposes, so long as
  24 (1) they include prominent notice that the work is derivative, and (2) they
  25 include prominent notice akin to these three paragraphs for those parts of
  26 this code that are retained.
  27
  28 ===============================================================================
  29 */
  30
  31 #include <asm/div64.h>
  32
  33 #include "fpa11.h"
  34 //#include "milieu.h"
  35 //#include "softfloat.h"
  36
  37 /*
  38 -------------------------------------------------------------------------------
  39 Primitive arithmetic functions, including multi-word arithmetic, and
  40 division and square root approximations.  (Can be specialized to target if
  41 desired.)
  42 -------------------------------------------------------------------------------
  43 */
  44 #include "softfloat-macros"
  45
  46 /*
  47 -------------------------------------------------------------------------------
  48 Functions and definitions to determine:  (1) whether tininess for underflow
  49 is detected before or after rounding by default, (2) what (if anything)
  50 happens when exceptions are raised, (3) how signaling NaNs are distinguished
  51 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  52 are propagated from function inputs to output.  These details are target-
  53 specific.
  54 -------------------------------------------------------------------------------
  55 */
  56 #include "softfloat-specialize"
  57
  58 /*
  59 -------------------------------------------------------------------------------
  60 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  61 and 7, and returns the properly rounded 32-bit integer corresponding to the
  62 input.  If `zSign' is nonzero, the input is negated before being converted
  63 to an integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point
  64 input is simply rounded to an integer, with the inexact exception raised if
  65 the input cannot be represented exactly as an integer.  If the fixed-point
  66 input is too large, however, the invalid exception is raised and the largest
  67 positive or negative integer is returned.
  68 -------------------------------------------------------------------------------
  69 */
  70 static int32 roundAndPackInt32( struct roundingData *roundData, flag zSign, bits64 absZ )
  71 {
  72     int8 roundingMode;
  73     flag roundNearestEven;
  74     int8 roundIncrement, roundBits;
  75     int32 z;
  76
  77     roundingMode = roundData->mode;
  78     roundNearestEven = ( roundingMode == float_round_nearest_even );
  79     roundIncrement = 0x40;
  80     if ( ! roundNearestEven ) {
  81         if ( roundingMode == float_round_to_zero ) {
  82             roundIncrement = 0;
  83         }
  84         else {
  85             roundIncrement = 0x7F;
  86             if ( zSign ) {
  87                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  88             }
  89             else {
  90                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  91             }
  92         }
  93     }
  94     roundBits = absZ & 0x7F;
  95     absZ = ( absZ + roundIncrement )>>7;
  96     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
  97     z = absZ;
  98     if ( zSign ) z = - z;
  99     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 100         roundData->exception |= float_flag_invalid;
 101         return zSign ? 0x80000000 : 0x7FFFFFFF;
 102     }
 103     if ( roundBits ) roundData->exception |= float_flag_inexact;
 104     return z;
 105
 106 }
 107
 108 /*
 109 -------------------------------------------------------------------------------
 110 Returns the fraction bits of the single-precision floating-point value `a'.
 111 -------------------------------------------------------------------------------
 112 */
 113 INLINE bits32 extractFloat32Frac( float32 a )
 114 {
 115
 116     return a & 0x007FFFFF;
 117
 118 }
 119
 120 /*
 121 -------------------------------------------------------------------------------
 122 Returns the exponent bits of the single-precision floating-point value `a'.
 123 -------------------------------------------------------------------------------
 124 */
 125 INLINE int16 extractFloat32Exp( float32 a )
 126 {
 127
 128     return ( a>>23 ) & 0xFF;
 129
 130 }
 131
 132 /*
 133 -------------------------------------------------------------------------------
 134 Returns the sign bit of the single-precision floating-point value `a'.
 135 -------------------------------------------------------------------------------
 136 */
 137
 138 /*
 139 -------------------------------------------------------------------------------
 140 Normalizes the subnormal single-precision floating-point value represented
 141 by the denormalized significand `aSig'.  The normalized exponent and
 142 significand are stored at the locations pointed to by `zExpPtr' and
 143 `zSigPtr', respectively.
 144 -------------------------------------------------------------------------------
 145 */
 146 static void
 147  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 148 {
 149     int8 shiftCount;
 150
 151     shiftCount = countLeadingZeros32( aSig ) - 8;
 152     *zSigPtr = aSig<<shiftCount;
 153     *zExpPtr = 1 - shiftCount;
 154
 155 }
 156
 157 /*
 158 -------------------------------------------------------------------------------
 159 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 160 single-precision floating-point value, returning the result.  After being
 161 shifted into the proper positions, the three fields are simply added
 162 together to form the result.  This means that any integer portion of `zSig'
 163 will be added into the exponent.  Since a properly normalized significand
 164 will have an integer portion equal to 1, the `zExp' input should be 1 less
 165 than the desired result exponent whenever `zSig' is a complete, normalized
 166 significand.
 167 -------------------------------------------------------------------------------
 168 */
 169 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 170 {
 171     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 172 }
 173
 174 /*
 175 -------------------------------------------------------------------------------
 176 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 177 and significand `zSig', and returns the proper single-precision floating-
 178 point value corresponding to the abstract input.  Ordinarily, the abstract
 179 value is simply rounded and packed into the single-precision format, with
 180 the inexact exception raised if the abstract input cannot be represented
 181 exactly.  If the abstract value is too large, however, the overflow and
 182 inexact exceptions are raised and an infinity or maximal finite value is
 183 returned.  If the abstract value is too small, the input value is rounded to
 184 a subnormal number, and the underflow and inexact exceptions are raised if
 185 the abstract input cannot be represented exactly as a subnormal single-
 186 precision floating-point number.
 187     The input significand `zSig' has its binary point between bits 30
 188 and 29, which is 7 bits to the left of the usual location.  This shifted
 189 significand must be normalized or smaller.  If `zSig' is not normalized,
 190 `zExp' must be 0; in that case, the result returned is a subnormal number,
 191 and it must not require rounding.  In the usual case that `zSig' is
 192 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 193 The handling of underflow and overflow follows the IEC/IEEE Standard for
 194 Binary Floating-point Arithmetic.
 195 -------------------------------------------------------------------------------
 196 */
 197 static float32 roundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
 198 {
 199     int8 roundingMode;
 200     flag roundNearestEven;
 201     int8 roundIncrement, roundBits;
 202     flag isTiny;
 203
 204     roundingMode = roundData->mode;
 205     roundNearestEven = ( roundingMode == float_round_nearest_even );
 206     roundIncrement = 0x40;
 207     if ( ! roundNearestEven ) {
 208         if ( roundingMode == float_round_to_zero ) {
 209             roundIncrement = 0;
 210         }
 211         else {
 212             roundIncrement = 0x7F;
 213             if ( zSign ) {
 214                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 215             }
 216             else {
 217                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 218             }
 219         }
 220     }
 221     roundBits = zSig & 0x7F;
 222     if ( 0xFD <= (bits16) zExp ) {
 223         if (    ( 0xFD < zExp )
 224              || (    ( zExp == 0xFD )
 225                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 226            ) {
 227             roundData->exception |= float_flag_overflow | float_flag_inexact;
 228             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 229         }
 230         if ( zExp < 0 ) {
 231             isTiny =
 232                    ( float_detect_tininess == float_tininess_before_rounding )
 233                 || ( zExp < -1 )
 234                 || ( zSig + roundIncrement < 0x80000000 );
 235             shift32RightJamming( zSig, - zExp, &zSig );
 236             zExp = 0;
 237             roundBits = zSig & 0x7F;
 238             if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
 239         }
 240     }
 241     if ( roundBits ) roundData->exception |= float_flag_inexact;
 242     zSig = ( zSig + roundIncrement )>>7;
 243     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 244     if ( zSig == 0 ) zExp = 0;
 245     return packFloat32( zSign, zExp, zSig );
 246
 247 }
 248
 249 /*
 250 -------------------------------------------------------------------------------
 251 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 252 and significand `zSig', and returns the proper single-precision floating-
 253 point value corresponding to the abstract input.  This routine is just like
 254 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
 255 any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 256 point exponent.
 257 -------------------------------------------------------------------------------
 258 */
 259 static float32
 260  normalizeRoundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
 261 {
 262     int8 shiftCount;
 263
 264     shiftCount = countLeadingZeros32( zSig ) - 1;
 265     return roundAndPackFloat32( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
 266
 267 }
 268
 269 /*
 270 -------------------------------------------------------------------------------
 271 Returns the fraction bits of the double-precision floating-point value `a'.
 272 -------------------------------------------------------------------------------
 273 */
 274 INLINE bits64 extractFloat64Frac( float64 a )
 275 {
 276
 277     return a & LIT64( 0x000FFFFFFFFFFFFF );
 278
 279 }
 280
 281 /*
 282 -------------------------------------------------------------------------------
 283 Returns the exponent bits of the double-precision floating-point value `a'.
 284 -------------------------------------------------------------------------------
 285 */
 286 INLINE int16 extractFloat64Exp( float64 a )
 287 {
 288
 289     return ( a>>52 ) & 0x7FF;
 290
 291 }
 292
 293 /*
 294 -------------------------------------------------------------------------------
 295 Returns the sign bit of the double-precision floating-point value `a'.
 296 -------------------------------------------------------------------------------
 297 */
 298
 299 /*
 300 -------------------------------------------------------------------------------
 301 Normalizes the subnormal double-precision floating-point value represented
 302 by the denormalized significand `aSig'.  The normalized exponent and
 303 significand are stored at the locations pointed to by `zExpPtr' and
 304 `zSigPtr', respectively.
 305 -------------------------------------------------------------------------------
 306 */
 307 static void
 308  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 309 {
 310     int8 shiftCount;
 311
 312     shiftCount = countLeadingZeros64( aSig ) - 11;
 313     *zSigPtr = aSig<<shiftCount;
 314     *zExpPtr = 1 - shiftCount;
 315
 316 }
 317
 318 /*
 319 -------------------------------------------------------------------------------
 320 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 321 double-precision floating-point value, returning the result.  After being
 322 shifted into the proper positions, the three fields are simply added
 323 together to form the result.  This means that any integer portion of `zSig'
 324 will be added into the exponent.  Since a properly normalized significand
 325 will have an integer portion equal to 1, the `zExp' input should be 1 less
 326 than the desired result exponent whenever `zSig' is a complete, normalized
 327 significand.
 328 -------------------------------------------------------------------------------
 329 */
 330 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 331 {
 332
 333     return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
 334
 335 }
 336
 337 /*
 338 -------------------------------------------------------------------------------
 339 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 340 and significand `zSig', and returns the proper double-precision floating-
 341 point value corresponding to the abstract input.  Ordinarily, the abstract
 342 value is simply rounded and packed into the double-precision format, with
 343 the inexact exception raised if the abstract input cannot be represented
 344 exactly.  If the abstract value is too large, however, the overflow and
 345 inexact exceptions are raised and an infinity or maximal finite value is
 346 returned.  If the abstract value is too small, the input value is rounded to
 347 a subnormal number, and the underflow and inexact exceptions are raised if
 348 the abstract input cannot be represented exactly as a subnormal double-
 349 precision floating-point number.
 350     The input significand `zSig' has its binary point between bits 62
 351 and 61, which is 10 bits to the left of the usual location.  This shifted
 352 significand must be normalized or smaller.  If `zSig' is not normalized,
 353 `zExp' must be 0; in that case, the result returned is a subnormal number,
 354 and it must not require rounding.  In the usual case that `zSig' is
 355 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 356 The handling of underflow and overflow follows the IEC/IEEE Standard for
 357 Binary Floating-point Arithmetic.
 358 -------------------------------------------------------------------------------
 359 */
 360 static float64 roundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
 361 {
 362     int8 roundingMode;
 363     flag roundNearestEven;
 364     int16 roundIncrement, roundBits;
 365     flag isTiny;
 366
 367     roundingMode = roundData->mode;
 368     roundNearestEven = ( roundingMode == float_round_nearest_even );
 369     roundIncrement = 0x200;
 370     if ( ! roundNearestEven ) {
 371         if ( roundingMode == float_round_to_zero ) {
 372             roundIncrement = 0;
 373         }
 374         else {
 375             roundIncrement = 0x3FF;
 376             if ( zSign ) {
 377                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 378             }
 379             else {
 380                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 381             }
 382         }
 383     }
 384     roundBits = zSig & 0x3FF;
 385     if ( 0x7FD <= (bits16) zExp ) {
 386         if (    ( 0x7FD < zExp )
 387              || (    ( zExp == 0x7FD )
 388                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 389            ) {
 390             //register int lr = __builtin_return_address(0);
 391             //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
 392             roundData->exception |= float_flag_overflow | float_flag_inexact;
 393             return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
 394         }
 395         if ( zExp < 0 ) {
 396             isTiny =
 397                    ( float_detect_tininess == float_tininess_before_rounding )
 398                 || ( zExp < -1 )
 399                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 400             shift64RightJamming( zSig, - zExp, &zSig );
 401             zExp = 0;
 402             roundBits = zSig & 0x3FF;
 403             if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
 404         }
 405     }
 406     if ( roundBits ) roundData->exception |= float_flag_inexact;
 407     zSig = ( zSig + roundIncrement )>>10;
 408     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 409     if ( zSig == 0 ) zExp = 0;
 410     return packFloat64( zSign, zExp, zSig );
 411
 412 }
 413
 414 /*
 415 -------------------------------------------------------------------------------
 416 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 417 and significand `zSig', and returns the proper double-precision floating-
 418 point value corresponding to the abstract input.  This routine is just like
 419 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
 420 any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 421 point exponent.
 422 -------------------------------------------------------------------------------
 423 */
 424 static float64
 425  normalizeRoundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
 426 {
 427     int8 shiftCount;
 428
 429     shiftCount = countLeadingZeros64( zSig ) - 1;
 430     return roundAndPackFloat64( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
 431
 432 }
 433
 434 #ifdef FLOATX80
 435
 436 /*
 437 -------------------------------------------------------------------------------
 438 Returns the fraction bits of the extended double-precision floating-point
 439 value `a'.
 440 -------------------------------------------------------------------------------
 441 */
 442 INLINE bits64 extractFloatx80Frac( floatx80 a )
 443 {
 444
 445     return a.low;
 446
 447 }
 448
 449 /*
 450 -------------------------------------------------------------------------------
 451 Returns the exponent bits of the extended double-precision floating-point
 452 value `a'.
 453 -------------------------------------------------------------------------------
 454 */
 455 INLINE int32 extractFloatx80Exp( floatx80 a )
 456 {
 457
 458     return a.high & 0x7FFF;
 459
 460 }
 461
 462 /*
 463 -------------------------------------------------------------------------------
 464 Returns the sign bit of the extended double-precision floating-point value
 465 `a'.
 466 -------------------------------------------------------------------------------
 467 */
 468 INLINE flag extractFloatx80Sign( floatx80 a )
 469 {
 470
 471     return a.high>>15;
 472
 473 }
 474
 475 /*
 476 -------------------------------------------------------------------------------
 477 Normalizes the subnormal extended double-precision floating-point value
 478 represented by the denormalized significand `aSig'.  The normalized exponent
 479 and significand are stored at the locations pointed to by `zExpPtr' and
 480 `zSigPtr', respectively.
 481 -------------------------------------------------------------------------------
 482 */
 483 static void
 484  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 485 {
 486     int8 shiftCount;
 487
 488     shiftCount = countLeadingZeros64( aSig );
 489     *zSigPtr = aSig<<shiftCount;
 490     *zExpPtr = 1 - shiftCount;
 491
 492 }
 493
 494 /*
 495 -------------------------------------------------------------------------------
 496 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 497 extended double-precision floating-point value, returning the result.
 498 -------------------------------------------------------------------------------
 499 */
 500 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 501 {
 502     floatx80 z;
 503
 504     z.low = zSig;
 505     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 506     z.__padding = 0;
 507     return z;
 508
 509 }
 510
 511 /*
 512 -------------------------------------------------------------------------------
 513 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 514 and extended significand formed by the concatenation of `zSig0' and `zSig1',
 515 and returns the proper extended double-precision floating-point value
 516 corresponding to the abstract input.  Ordinarily, the abstract value is
 517 rounded and packed into the extended double-precision format, with the
 518 inexact exception raised if the abstract input cannot be represented
 519 exactly.  If the abstract value is too large, however, the overflow and
 520 inexact exceptions are raised and an infinity or maximal finite value is
 521 returned.  If the abstract value is too small, the input value is rounded to
 522 a subnormal number, and the underflow and inexact exceptions are raised if
 523 the abstract input cannot be represented exactly as a subnormal extended
 524 double-precision floating-point number.
 525     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 526 number of bits as single or double precision, respectively.  Otherwise, the
 527 result is rounded to the full precision of the extended double-precision
 528 format.
 529     The input significand must be normalized or smaller.  If the input
 530 significand is not normalized, `zExp' must be 0; in that case, the result
 531 returned is a subnormal number, and it must not require rounding.  The
 532 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 533 Floating-point Arithmetic.
 534 -------------------------------------------------------------------------------
 535 */
 536 static floatx80
 537  roundAndPackFloatx80(
 538      struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 539  )
 540 {
 541     int8 roundingMode, roundingPrecision;
 542     flag roundNearestEven, increment, isTiny;
 543     int64 roundIncrement, roundMask, roundBits;
 544
 545     roundingMode = roundData->mode;
 546     roundingPrecision = roundData->precision;
 547     roundNearestEven = ( roundingMode == float_round_nearest_even );
 548     if ( roundingPrecision == 80 ) goto precision80;
 549     if ( roundingPrecision == 64 ) {
 550         roundIncrement = LIT64( 0x0000000000000400 );
 551         roundMask = LIT64( 0x00000000000007FF );
 552     }
 553     else if ( roundingPrecision == 32 ) {
 554         roundIncrement = LIT64( 0x0000008000000000 );
 555         roundMask = LIT64( 0x000000FFFFFFFFFF );
 556     }
 557     else {
 558         goto precision80;
 559     }
 560     zSig0 |= ( zSig1 != 0 );
 561     if ( ! roundNearestEven ) {
 562         if ( roundingMode == float_round_to_zero ) {
 563             roundIncrement = 0;
 564         }
 565         else {
 566             roundIncrement = roundMask;
 567             if ( zSign ) {
 568                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 569             }
 570             else {
 571                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 572             }
 573         }
 574     }
 575     roundBits = zSig0 & roundMask;
 576     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 577         if (    ( 0x7FFE < zExp )
 578              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 579            ) {
 580             goto overflow;
 581         }
 582         if ( zExp <= 0 ) {
 583             isTiny =
 584                    ( float_detect_tininess == float_tininess_before_rounding )
 585                 || ( zExp < 0 )
 586                 || ( zSig0 <= zSig0 + roundIncrement );
 587             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 588             zExp = 0;
 589             roundBits = zSig0 & roundMask;
 590             if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
 591             if ( roundBits ) roundData->exception |= float_flag_inexact;
 592             zSig0 += roundIncrement;
 593             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 594             roundIncrement = roundMask + 1;
 595             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 596                 roundMask |= roundIncrement;
 597             }
 598             zSig0 &= ~ roundMask;
 599             return packFloatx80( zSign, zExp, zSig0 );
 600         }
 601     }
 602     if ( roundBits ) roundData->exception |= float_flag_inexact;
 603     zSig0 += roundIncrement;
 604     if ( zSig0 < roundIncrement ) {
 605         ++zExp;
 606         zSig0 = LIT64( 0x8000000000000000 );
 607     }
 608     roundIncrement = roundMask + 1;
 609     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 610         roundMask |= roundIncrement;
 611     }
 612     zSig0 &= ~ roundMask;
 613     if ( zSig0 == 0 ) zExp = 0;
 614     return packFloatx80( zSign, zExp, zSig0 );
 615  precision80:
 616     increment = ( (sbits64) zSig1 < 0 );
 617     if ( ! roundNearestEven ) {
 618         if ( roundingMode == float_round_to_zero ) {
 619             increment = 0;
 620         }
 621         else {
 622             if ( zSign ) {
 623                 increment = ( roundingMode == float_round_down ) && zSig1;
 624             }
 625             else {
 626                 increment = ( roundingMode == float_round_up ) && zSig1;
 627             }
 628         }
 629     }
 630     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 631         if (    ( 0x7FFE < zExp )
 632              || (    ( zExp == 0x7FFE )
 633                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 634                   && increment
 635                 )
 636            ) {
 637             roundMask = 0;
 638  overflow:
 639             roundData->exception |= float_flag_overflow | float_flag_inexact;
 640             if (    ( roundingMode == float_round_to_zero )
 641                  || ( zSign && ( roundingMode == float_round_up ) )
 642                  || ( ! zSign && ( roundingMode == float_round_down ) )
 643                ) {
 644                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 645             }
 646             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 647         }
 648         if ( zExp <= 0 ) {
 649             isTiny =
 650                    ( float_detect_tininess == float_tininess_before_rounding )
 651                 || ( zExp < 0 )
 652                 || ! increment
 653                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 654             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 655             zExp = 0;
 656             if ( isTiny && zSig1 ) roundData->exception |= float_flag_underflow;
 657             if ( zSig1 ) roundData->exception |= float_flag_inexact;
 658             if ( roundNearestEven ) {
 659                 increment = ( (sbits64) zSig1 < 0 );
 660             }
 661             else {
 662                 if ( zSign ) {
 663                     increment = ( roundingMode == float_round_down ) && zSig1;
 664                 }
 665                 else {
 666                     increment = ( roundingMode == float_round_up ) && zSig1;
 667                 }
 668             }
 669             if ( increment ) {
 670                 ++zSig0;
 671                 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
 672                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 673             }
 674             return packFloatx80( zSign, zExp, zSig0 );
 675         }
 676     }
 677     if ( zSig1 ) roundData->exception |= float_flag_inexact;
 678     if ( increment ) {
 679         ++zSig0;
 680         if ( zSig0 == 0 ) {
 681             ++zExp;
 682             zSig0 = LIT64( 0x8000000000000000 );
 683         }
 684         else {
 685             zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
 686         }
 687     }
 688     else {
 689         if ( zSig0 == 0 ) zExp = 0;
 690     }
 691
 692     return packFloatx80( zSign, zExp, zSig0 );
 693 }
 694
 695 /*
 696 -------------------------------------------------------------------------------
 697 Takes an abstract floating-point value having sign `zSign', exponent
 698 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 699 and returns the proper extended double-precision floating-point value
 700 corresponding to the abstract input.  This routine is just like
 701 `roundAndPackFloatx80' except that the input significand does not have to be
 702 normalized.
 703 -------------------------------------------------------------------------------
 704 */
 705 static floatx80
 706  normalizeRoundAndPackFloatx80(
 707      struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 708  )
 709 {
 710     int8 shiftCount;
 711
 712     if ( zSig0 == 0 ) {
 713         zSig0 = zSig1;
 714         zSig1 = 0;
 715         zExp -= 64;
 716     }
 717     shiftCount = countLeadingZeros64( zSig0 );
 718     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 719     zExp -= shiftCount;
 720     return
 721         roundAndPackFloatx80( roundData, zSign, zExp, zSig0, zSig1 );
 722
 723 }
 724
 725 #endif
 726
 727 /*
 728 -------------------------------------------------------------------------------
 729 Returns the result of converting the 32-bit two's complement integer `a' to
 730 the single-precision floating-point format.  The conversion is performed
 731 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
 732 -------------------------------------------------------------------------------
 733 */
 734 float32 int32_to_float32(struct roundingData *roundData, int32 a)
 735 {
 736     flag zSign;
 737
 738     if ( a == 0 ) return 0;
 739     if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
 740     zSign = ( a < 0 );
 741     return normalizeRoundAndPackFloat32( roundData, zSign, 0x9C, zSign ? - a : a );
 742
 743 }
 744
 745 /*
 746 -------------------------------------------------------------------------------
 747 Returns the result of converting the 32-bit two's complement integer `a' to
 748 the double-precision floating-point format.  The conversion is performed
 749 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
 750 -------------------------------------------------------------------------------
 751 */
 752 float64 int32_to_float64( int32 a )
 753 {
 754     flag aSign;
 755     uint32 absA;
 756     int8 shiftCount;
 757     bits64 zSig;
 758
 759     if ( a == 0 ) return 0;
 760     aSign = ( a < 0 );
 761     absA = aSign ? - a : a;
 762     shiftCount = countLeadingZeros32( absA ) + 21;
 763     zSig = absA;
 764     return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
 765
 766 }
 767
 768 #ifdef FLOATX80
 769
 770 /*
 771 -------------------------------------------------------------------------------
 772 Returns the result of converting the 32-bit two's complement integer `a'
 773 to the extended double-precision floating-point format.  The conversion
 774 is performed according to the IEC/IEEE Standard for Binary Floating-point
 775 Arithmetic.
 776 -------------------------------------------------------------------------------
 777 */
 778 floatx80 int32_to_floatx80( int32 a )
 779 {
 780     flag zSign;
 781     uint32 absA;
 782     int8 shiftCount;
 783     bits64 zSig;
 784
 785     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
 786     zSign = ( a < 0 );
 787     absA = zSign ? - a : a;
 788     shiftCount = countLeadingZeros32( absA ) + 32;
 789     zSig = absA;
 790     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
 791
 792 }
 793
 794 #endif
 795
 796 /*
 797 -------------------------------------------------------------------------------
 798 Returns the result of converting the single-precision floating-point value
 799 `a' to the 32-bit two's complement integer format.  The conversion is
 800 performed according to the IEC/IEEE Standard for Binary Floating-point
 801 Arithmetic---which means in particular that the conversion is rounded
 802 according to the current rounding mode.  If `a' is a NaN, the largest
 803 positive integer is returned.  Otherwise, if the conversion overflows, the
 804 largest integer with the same sign as `a' is returned.
 805 -------------------------------------------------------------------------------
 806 */
 807 int32 float32_to_int32( struct roundingData *roundData, float32 a )
 808 {
 809     flag aSign;
 810     int16 aExp, shiftCount;
 811     bits32 aSig;
 812     bits64 zSig;
 813
 814     aSig = extractFloat32Frac( a );
 815     aExp = extractFloat32Exp( a );
 816     aSign = extractFloat32Sign( a );
 817     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
 818     if ( aExp ) aSig |= 0x00800000;
 819     shiftCount = 0xAF - aExp;
 820     zSig = aSig;
 821     zSig <<= 32;
 822     if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
 823     return roundAndPackInt32( roundData, aSign, zSig );
 824
 825 }
 826
 827 /*
 828 -------------------------------------------------------------------------------
 829 Returns the result of converting the single-precision floating-point value
 830 `a' to the 32-bit two's complement integer format.  The conversion is
 831 performed according to the IEC/IEEE Standard for Binary Floating-point
 832 Arithmetic, except that the conversion is always rounded toward zero.  If
 833 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
 834 conversion overflows, the largest integer with the same sign as `a' is
 835 returned.
 836 -------------------------------------------------------------------------------
 837 */
 838 int32 float32_to_int32_round_to_zero( float32 a )
 839 {
 840     flag aSign;
 841     int16 aExp, shiftCount;
 842     bits32 aSig;
 843     int32 z;
 844
 845     aSig = extractFloat32Frac( a );
 846     aExp = extractFloat32Exp( a );
 847     aSign = extractFloat32Sign( a );
 848     shiftCount = aExp - 0x9E;
 849     if ( 0 <= shiftCount ) {
 850         if ( a == 0xCF000000 ) return 0x80000000;
 851         float_raise( float_flag_invalid );
 852         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
 853         return 0x80000000;
 854     }
 855     else if ( aExp <= 0x7E ) {
 856         if ( aExp | aSig ) float_raise( float_flag_inexact );
 857         return 0;
 858     }
 859     aSig = ( aSig | 0x00800000 )<<8;
 860     z = aSig>>( - shiftCount );
 861     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
 862         float_raise( float_flag_inexact );
 863     }
 864     return aSign ? - z : z;
 865
 866 }
 867
 868 /*
 869 -------------------------------------------------------------------------------
 870 Returns the result of converting the single-precision floating-point value
 871 `a' to the double-precision floating-point format.  The conversion is
 872 performed according to the IEC/IEEE Standard for Binary Floating-point
 873 Arithmetic.
 874 -------------------------------------------------------------------------------
 875 */
 876 float64 float32_to_float64( float32 a )
 877 {
 878     flag aSign;
 879     int16 aExp;
 880     bits32 aSig;
 881
 882     aSig = extractFloat32Frac( a );
 883     aExp = extractFloat32Exp( a );
 884     aSign = extractFloat32Sign( a );
 885     if ( aExp == 0xFF ) {
 886         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
 887         return packFloat64( aSign, 0x7FF, 0 );
 888     }
 889     if ( aExp == 0 ) {
 890         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
 891         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 892         --aExp;
 893     }
 894     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
 895
 896 }
 897
 898 #ifdef FLOATX80
 899
 900 /*
 901 -------------------------------------------------------------------------------
 902 Returns the result of converting the single-precision floating-point value
 903 `a' to the extended double-precision floating-point format.  The conversion
 904 is performed according to the IEC/IEEE Standard for Binary Floating-point
 905 Arithmetic.
 906 -------------------------------------------------------------------------------
 907 */
 908 floatx80 float32_to_floatx80( float32 a )
 909 {
 910     flag aSign;
 911     int16 aExp;
 912     bits32 aSig;
 913
 914     aSig = extractFloat32Frac( a );
 915     aExp = extractFloat32Exp( a );
 916     aSign = extractFloat32Sign( a );
 917     if ( aExp == 0xFF ) {
 918         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
 919         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 920     }
 921     if ( aExp == 0 ) {
 922         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
 923         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 924     }
 925     aSig |= 0x00800000;
 926     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
 927
 928 }
 929
 930 #endif
 931
 932 /*
 933 -------------------------------------------------------------------------------
 934 Rounds the single-precision floating-point value `a' to an integer, and
 935 returns the result as a single-precision floating-point value.  The
 936 operation is performed according to the IEC/IEEE Standard for Binary
 937 Floating-point Arithmetic.
 938 -------------------------------------------------------------------------------
 939 */
 940 float32 float32_round_to_int( struct roundingData *roundData, float32 a )
 941 {
 942     flag aSign;
 943     int16 aExp;
 944     bits32 lastBitMask, roundBitsMask;
 945     int8 roundingMode;
 946     float32 z;
 947
 948     aExp = extractFloat32Exp( a );
 949     if ( 0x96 <= aExp ) {
 950         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
 951             return propagateFloat32NaN( a, a );
 952         }
 953         return a;
 954     }
 955     roundingMode = roundData->mode;
 956     if ( aExp <= 0x7E ) {
 957         if ( (bits32) ( a<<1 ) == 0 ) return a;
 958         roundData->exception |= float_flag_inexact;
 959         aSign = extractFloat32Sign( a );
 960         switch ( roundingMode ) {
 961          case float_round_nearest_even:
 962             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
 963                 return packFloat32( aSign, 0x7F, 0 );
 964             }
 965             break;
 966          case float_round_down:
 967             return aSign ? 0xBF800000 : 0;
 968          case float_round_up:
 969             return aSign ? 0x80000000 : 0x3F800000;
 970         }
 971         return packFloat32( aSign, 0, 0 );
 972     }
 973     lastBitMask = 1;
 974     lastBitMask <<= 0x96 - aExp;
 975     roundBitsMask = lastBitMask - 1;
 976     z = a;
 977     if ( roundingMode == float_round_nearest_even ) {
 978         z += lastBitMask>>1;
 979         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
 980     }
 981     else if ( roundingMode != float_round_to_zero ) {
 982         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
 983             z += roundBitsMask;
 984         }
 985     }
 986     z &= ~ roundBitsMask;
 987     if ( z != a ) roundData->exception |= float_flag_inexact;
 988     return z;
 989
 990 }
 991
 992 /*
 993 -------------------------------------------------------------------------------
 994 Returns the result of adding the absolute values of the single-precision
 995 floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
 996 before being returned.  `zSign' is ignored if the result is a NaN.  The
 997 addition is performed according to the IEC/IEEE Standard for Binary
 998 Floating-point Arithmetic.
 999 -------------------------------------------------------------------------------
1000 */
1001 static float32 addFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1002 {
1003     int16 aExp, bExp, zExp;
1004     bits32 aSig, bSig, zSig;
1005     int16 expDiff;
1006
1007     aSig = extractFloat32Frac( a );
1008     aExp = extractFloat32Exp( a );
1009     bSig = extractFloat32Frac( b );
1010     bExp = extractFloat32Exp( b );
1011     expDiff = aExp - bExp;
1012     aSig <<= 6;
1013     bSig <<= 6;
1014     if ( 0 < expDiff ) {
1015         if ( aExp == 0xFF ) {
1016             if ( aSig ) return propagateFloat32NaN( a, b );
1017             return a;
1018         }
1019         if ( bExp == 0 ) {
1020             --expDiff;
1021         }
1022         else {
1023             bSig |= 0x20000000;
1024         }
1025         shift32RightJamming( bSig, expDiff, &bSig );
1026         zExp = aExp;
1027     }
1028     else if ( expDiff < 0 ) {
1029         if ( bExp == 0xFF ) {
1030             if ( bSig ) return propagateFloat32NaN( a, b );
1031             return packFloat32( zSign, 0xFF, 0 );
1032         }
1033         if ( aExp == 0 ) {
1034             ++expDiff;
1035         }
1036         else {
1037             aSig |= 0x20000000;
1038         }
1039         shift32RightJamming( aSig, - expDiff, &aSig );
1040         zExp = bExp;
1041     }
1042     else {
1043         if ( aExp == 0xFF ) {
1044             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1045             return a;
1046         }
1047         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1048         zSig = 0x40000000 + aSig + bSig;
1049         zExp = aExp;
1050         goto roundAndPack;
1051     }
1052     aSig |= 0x20000000;
1053     zSig = ( aSig + bSig )<<1;
1054     --zExp;
1055     if ( (sbits32) zSig < 0 ) {
1056         zSig = aSig + bSig;
1057         ++zExp;
1058     }
1059  roundAndPack:
1060     return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1061
1062 }
1063
1064 /*
1065 -------------------------------------------------------------------------------
1066 Returns the result of subtracting the absolute values of the single-
1067 precision floating-point values `a' and `b'.  If `zSign' is true, the
1068 difference is negated before being returned.  `zSign' is ignored if the
1069 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1070 Standard for Binary Floating-point Arithmetic.
1071 -------------------------------------------------------------------------------
1072 */
1073 static float32 subFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1074 {
1075     int16 aExp, bExp, zExp;
1076     bits32 aSig, bSig, zSig;
1077     int16 expDiff;
1078
1079     aSig = extractFloat32Frac( a );
1080     aExp = extractFloat32Exp( a );
1081     bSig = extractFloat32Frac( b );
1082     bExp = extractFloat32Exp( b );
1083     expDiff = aExp - bExp;
1084     aSig <<= 7;
1085     bSig <<= 7;
1086     if ( 0 < expDiff ) goto aExpBigger;
1087     if ( expDiff < 0 ) goto bExpBigger;
1088     if ( aExp == 0xFF ) {
1089         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1090         roundData->exception |= float_flag_invalid;
1091         return float32_default_nan;
1092     }
1093     if ( aExp == 0 ) {
1094         aExp = 1;
1095         bExp = 1;
1096     }
1097     if ( bSig < aSig ) goto aBigger;
1098     if ( aSig < bSig ) goto bBigger;
1099     return packFloat32( roundData->mode == float_round_down, 0, 0 );
1100  bExpBigger:
1101     if ( bExp == 0xFF ) {
1102         if ( bSig ) return propagateFloat32NaN( a, b );
1103         return packFloat32( zSign ^ 1, 0xFF, 0 );
1104     }
1105     if ( aExp == 0 ) {
1106         ++expDiff;
1107     }
1108     else {
1109         aSig |= 0x40000000;
1110     }
1111     shift32RightJamming( aSig, - expDiff, &aSig );
1112     bSig |= 0x40000000;
1113  bBigger:
1114     zSig = bSig - aSig;
1115     zExp = bExp;
1116     zSign ^= 1;
1117     goto normalizeRoundAndPack;
1118  aExpBigger:
1119     if ( aExp == 0xFF ) {
1120         if ( aSig ) return propagateFloat32NaN( a, b );
1121         return a;
1122     }
1123     if ( bExp == 0 ) {
1124         --expDiff;
1125     }
1126     else {
1127         bSig |= 0x40000000;
1128     }
1129     shift32RightJamming( bSig, expDiff, &bSig );
1130     aSig |= 0x40000000;
1131  aBigger:
1132     zSig = aSig - bSig;
1133     zExp = aExp;
1134  normalizeRoundAndPack:
1135     --zExp;
1136     return normalizeRoundAndPackFloat32( roundData, zSign, zExp, zSig );
1137
1138 }
1139
1140 /*
1141 -------------------------------------------------------------------------------
1142 Returns the result of adding the single-precision floating-point values `a'
1143 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1144 Binary Floating-point Arithmetic.
1145 -------------------------------------------------------------------------------
1146 */
1147 float32 float32_add( struct roundingData *roundData, float32 a, float32 b )
1148 {
1149     flag aSign, bSign;
1150
1151     aSign = extractFloat32Sign( a );
1152     bSign = extractFloat32Sign( b );
1153     if ( aSign == bSign ) {
1154         return addFloat32Sigs( roundData, a, b, aSign );
1155     }
1156     else {
1157         return subFloat32Sigs( roundData, a, b, aSign );
1158     }
1159
1160 }
1161
1162 /*
1163 -------------------------------------------------------------------------------
1164 Returns the result of subtracting the single-precision floating-point values
1165 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1166 for Binary Floating-point Arithmetic.
1167 -------------------------------------------------------------------------------
1168 */
1169 float32 float32_sub( struct roundingData *roundData, float32 a, float32 b )
1170 {
1171     flag aSign, bSign;
1172
1173     aSign = extractFloat32Sign( a );
1174     bSign = extractFloat32Sign( b );
1175     if ( aSign == bSign ) {
1176         return subFloat32Sigs( roundData, a, b, aSign );
1177     }
1178     else {
1179         return addFloat32Sigs( roundData, a, b, aSign );
1180     }
1181
1182 }
1183
1184 /*
1185 -------------------------------------------------------------------------------
1186 Returns the result of multiplying the single-precision floating-point values
1187 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1188 for Binary Floating-point Arithmetic.
1189 -------------------------------------------------------------------------------
1190 */
1191 float32 float32_mul( struct roundingData *roundData, float32 a, float32 b )
1192 {
1193     flag aSign, bSign, zSign;
1194     int16 aExp, bExp, zExp;
1195     bits32 aSig, bSig;
1196     bits64 zSig64;
1197     bits32 zSig;
1198
1199     aSig = extractFloat32Frac( a );
1200     aExp = extractFloat32Exp( a );
1201     aSign = extractFloat32Sign( a );
1202     bSig = extractFloat32Frac( b );
1203     bExp = extractFloat32Exp( b );
1204     bSign = extractFloat32Sign( b );
1205     zSign = aSign ^ bSign;
1206     if ( aExp == 0xFF ) {
1207         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1208             return propagateFloat32NaN( a, b );
1209         }
1210         if ( ( bExp | bSig ) == 0 ) {
1211             roundData->exception |= float_flag_invalid;
1212             return float32_default_nan;
1213         }
1214         return packFloat32( zSign, 0xFF, 0 );
1215     }
1216     if ( bExp == 0xFF ) {
1217         if ( bSig ) return propagateFloat32NaN( a, b );
1218         if ( ( aExp | aSig ) == 0 ) {
1219             roundData->exception |= float_flag_invalid;
1220             return float32_default_nan;
1221         }
1222         return packFloat32( zSign, 0xFF, 0 );
1223     }
1224     if ( aExp == 0 ) {
1225         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1226         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1227     }
1228     if ( bExp == 0 ) {
1229         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1230         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1231     }
1232     zExp = aExp + bExp - 0x7F;
1233     aSig = ( aSig | 0x00800000 )<<7;
1234     bSig = ( bSig | 0x00800000 )<<8;
1235     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1236     zSig = zSig64;
1237     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1238         zSig <<= 1;
1239         --zExp;
1240     }
1241     return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1242
1243 }
1244
1245 /*
1246 -------------------------------------------------------------------------------
1247 Returns the result of dividing the single-precision floating-point value `a'
1248 by the corresponding value `b'.  The operation is performed according to the
1249 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1250 -------------------------------------------------------------------------------
1251 */
1252 float32 float32_div( struct roundingData *roundData, float32 a, float32 b )
1253 {
1254     flag aSign, bSign, zSign;
1255     int16 aExp, bExp, zExp;
1256     bits32 aSig, bSig, zSig;
1257
1258     aSig = extractFloat32Frac( a );
1259     aExp = extractFloat32Exp( a );
1260     aSign = extractFloat32Sign( a );
1261     bSig = extractFloat32Frac( b );
1262     bExp = extractFloat32Exp( b );
1263     bSign = extractFloat32Sign( b );
1264     zSign = aSign ^ bSign;
1265     if ( aExp == 0xFF ) {
1266         if ( aSig ) return propagateFloat32NaN( a, b );
1267         if ( bExp == 0xFF ) {
1268             if ( bSig ) return propagateFloat32NaN( a, b );
1269             roundData->exception |= float_flag_invalid;
1270             return float32_default_nan;
1271         }
1272         return packFloat32( zSign, 0xFF, 0 );
1273     }
1274     if ( bExp == 0xFF ) {
1275         if ( bSig ) return propagateFloat32NaN( a, b );
1276         return packFloat32( zSign, 0, 0 );
1277     }
1278     if ( bExp == 0 ) {
1279         if ( bSig == 0 ) {
1280             if ( ( aExp | aSig ) == 0 ) {
1281                 roundData->exception |= float_flag_invalid;
1282                 return float32_default_nan;
1283             }
1284             roundData->exception |= float_flag_divbyzero;
1285             return packFloat32( zSign, 0xFF, 0 );
1286         }
1287         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1288     }
1289     if ( aExp == 0 ) {
1290         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1291         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1292     }
1293     zExp = aExp - bExp + 0x7D;
1294     aSig = ( aSig | 0x00800000 )<<7;
1295     bSig = ( bSig | 0x00800000 )<<8;
1296     if ( bSig <= ( aSig + aSig ) ) {
1297         aSig >>= 1;
1298         ++zExp;
1299     }
1300     {
1301         bits64 tmp = ( (bits64) aSig )<<32;
1302         do_div( tmp, bSig );
1303         zSig = tmp;
1304     }
1305     if ( ( zSig & 0x3F ) == 0 ) {
1306         zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1307     }
1308     return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1309
1310 }
1311
1312 /*
1313 -------------------------------------------------------------------------------
1314 Returns the remainder of the single-precision floating-point value `a'
1315 with respect to the corresponding value `b'.  The operation is performed
1316 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1317 -------------------------------------------------------------------------------
1318 */
1319 float32 float32_rem( struct roundingData *roundData, float32 a, float32 b )
1320 {
1321     flag aSign, bSign, zSign;
1322     int16 aExp, bExp, expDiff;
1323     bits32 aSig, bSig;
1324     bits32 q;
1325     bits64 aSig64, bSig64, q64;
1326     bits32 alternateASig;
1327     sbits32 sigMean;
1328
1329     aSig = extractFloat32Frac( a );
1330     aExp = extractFloat32Exp( a );
1331     aSign = extractFloat32Sign( a );
1332     bSig = extractFloat32Frac( b );
1333     bExp = extractFloat32Exp( b );
1334     bSign = extractFloat32Sign( b );
1335     if ( aExp == 0xFF ) {
1336         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1337             return propagateFloat32NaN( a, b );
1338         }
1339         roundData->exception |= float_flag_invalid;
1340         return float32_default_nan;
1341     }
1342     if ( bExp == 0xFF ) {
1343         if ( bSig ) return propagateFloat32NaN( a, b );
1344         return a;
1345     }
1346     if ( bExp == 0 ) {
1347         if ( bSig == 0 ) {
1348             roundData->exception |= float_flag_invalid;
1349             return float32_default_nan;
1350         }
1351         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1352     }
1353     if ( aExp == 0 ) {
1354         if ( aSig == 0 ) return a;
1355         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1356     }
1357     expDiff = aExp - bExp;
1358     aSig |= 0x00800000;
1359     bSig |= 0x00800000;
1360     if ( expDiff < 32 ) {
1361         aSig <<= 8;
1362         bSig <<= 8;
1363         if ( expDiff < 0 ) {
1364             if ( expDiff < -1 ) return a;
1365             aSig >>= 1;
1366         }
1367         q = ( bSig <= aSig );
1368         if ( q ) aSig -= bSig;
1369         if ( 0 < expDiff ) {
1370             bits64 tmp = ( (bits64) aSig )<<32;
1371             do_div( tmp, bSig );
1372             q = tmp;
1373             q >>= 32 - expDiff;
1374             bSig >>= 2;
1375             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1376         }
1377         else {
1378             aSig >>= 2;
1379             bSig >>= 2;
1380         }
1381     }
1382     else {
1383         if ( bSig <= aSig ) aSig -= bSig;
1384         aSig64 = ( (bits64) aSig )<<40;
1385         bSig64 = ( (bits64) bSig )<<40;
1386         expDiff -= 64;
1387         while ( 0 < expDiff ) {
1388             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1389             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1390             aSig64 = - ( ( bSig * q64 )<<38 );
1391             expDiff -= 62;
1392         }
1393         expDiff += 64;
1394         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1395         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1396         q = q64>>( 64 - expDiff );
1397         bSig <<= 6;
1398         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1399     }
1400     do {
1401         alternateASig = aSig;
1402         ++q;
1403         aSig -= bSig;
1404     } while ( 0 <= (sbits32) aSig );
1405     sigMean = aSig + alternateASig;
1406     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1407         aSig = alternateASig;
1408     }
1409     zSign = ( (sbits32) aSig < 0 );
1410     if ( zSign ) aSig = - aSig;
1411     return normalizeRoundAndPackFloat32( roundData, aSign ^ zSign, bExp, aSig );
1412
1413 }
1414
1415 /*
1416 -------------------------------------------------------------------------------
1417 Returns the square root of the single-precision floating-point value `a'.
1418 The operation is performed according to the IEC/IEEE Standard for Binary
1419 Floating-point Arithmetic.
1420 -------------------------------------------------------------------------------
1421 */
1422 float32 float32_sqrt( struct roundingData *roundData, float32 a )
1423 {
1424     flag aSign;
1425     int16 aExp, zExp;
1426     bits32 aSig, zSig;
1427     bits64 rem, term;
1428
1429     aSig = extractFloat32Frac( a );
1430     aExp = extractFloat32Exp( a );
1431     aSign = extractFloat32Sign( a );
1432     if ( aExp == 0xFF ) {
1433         if ( aSig ) return propagateFloat32NaN( a, 0 );
1434         if ( ! aSign ) return a;
1435         roundData->exception |= float_flag_invalid;
1436         return float32_default_nan;
1437     }
1438     if ( aSign ) {
1439         if ( ( aExp | aSig ) == 0 ) return a;
1440         roundData->exception |= float_flag_invalid;
1441         return float32_default_nan;
1442     }
1443     if ( aExp == 0 ) {
1444         if ( aSig == 0 ) return 0;
1445         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1446     }
1447     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1448     aSig = ( aSig | 0x00800000 )<<8;
1449     zSig = estimateSqrt32( aExp, aSig ) + 2;
1450     if ( ( zSig & 0x7F ) <= 5 ) {
1451         if ( zSig < 2 ) {
1452             zSig = 0xFFFFFFFF;
1453         }
1454         else {
1455             aSig >>= aExp & 1;
1456             term = ( (bits64) zSig ) * zSig;
1457             rem = ( ( (bits64) aSig )<<32 ) - term;
1458             while ( (sbits64) rem < 0 ) {
1459                 --zSig;
1460                 rem += ( ( (bits64) zSig )<<1 ) | 1;
1461             }
1462             zSig |= ( rem != 0 );
1463         }
1464     }
1465     shift32RightJamming( zSig, 1, &zSig );
1466     return roundAndPackFloat32( roundData, 0, zExp, zSig );
1467
1468 }
1469
1470 /*
1471 -------------------------------------------------------------------------------
1472 Returns 1 if the single-precision floating-point value `a' is equal to the
1473 corresponding value `b', and 0 otherwise.  The comparison is performed
1474 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1475 -------------------------------------------------------------------------------
1476 */
1477 flag float32_eq( float32 a, float32 b )
1478 {
1479
1480     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1481          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1482        ) {
1483         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1484             float_raise( float_flag_invalid );
1485         }
1486         return 0;
1487     }
1488     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1489
1490 }
1491
1492 /*
1493 -------------------------------------------------------------------------------
1494 Returns 1 if the single-precision floating-point value `a' is less than or
1495 equal to the corresponding value `b', and 0 otherwise.  The comparison is
1496 performed according to the IEC/IEEE Standard for Binary Floating-point
1497 Arithmetic.
1498 -------------------------------------------------------------------------------
1499 */
1500 flag float32_le( float32 a, float32 b )
1501 {
1502     flag aSign, bSign;
1503
1504     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1505          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1506        ) {
1507         float_raise( float_flag_invalid );
1508         return 0;
1509     }
1510     aSign = extractFloat32Sign( a );
1511     bSign = extractFloat32Sign( b );
1512     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1513     return ( a == b ) || ( aSign ^ ( a < b ) );
1514
1515 }
1516
1517 /*
1518 -------------------------------------------------------------------------------
1519 Returns 1 if the single-precision floating-point value `a' is less than
1520 the corresponding value `b', and 0 otherwise.  The comparison is performed
1521 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1522 -------------------------------------------------------------------------------
1523 */
1524 flag float32_lt( float32 a, float32 b )
1525 {
1526     flag aSign, bSign;
1527
1528     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1529          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1530        ) {
1531         float_raise( float_flag_invalid );
1532         return 0;
1533     }
1534     aSign = extractFloat32Sign( a );
1535     bSign = extractFloat32Sign( b );
1536     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1537     return ( a != b ) && ( aSign ^ ( a < b ) );
1538
1539 }
1540
1541 /*
1542 -------------------------------------------------------------------------------
1543 Returns 1 if the single-precision floating-point value `a' is equal to the
1544 corresponding value `b', and 0 otherwise.  The invalid exception is raised
1545 if either operand is a NaN.  Otherwise, the comparison is performed
1546 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1547 -------------------------------------------------------------------------------
1548 */
1549 flag float32_eq_signaling( float32 a, float32 b )
1550 {
1551
1552     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1553          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1554        ) {
1555         float_raise( float_flag_invalid );
1556         return 0;
1557     }
1558     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1559
1560 }
1561
1562 /*
1563 -------------------------------------------------------------------------------
1564 Returns 1 if the single-precision floating-point value `a' is less than or
1565 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1566 cause an exception.  Otherwise, the comparison is performed according to the
1567 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1568 -------------------------------------------------------------------------------
1569 */
1570 flag float32_le_quiet( float32 a, float32 b )
1571 {
1572     flag aSign, bSign;
1573     //int16 aExp, bExp;
1574
1575     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1576          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1577        ) {
1578         /* Do nothing, even if NaN as we're quiet */
1579         return 0;
1580     }
1581     aSign = extractFloat32Sign( a );
1582     bSign = extractFloat32Sign( b );
1583     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1584     return ( a == b ) || ( aSign ^ ( a < b ) );
1585
1586 }
1587
1588 /*
1589 -------------------------------------------------------------------------------
1590 Returns 1 if the single-precision floating-point value `a' is less than
1591 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1592 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1593 Standard for Binary Floating-point Arithmetic.
1594 -------------------------------------------------------------------------------
1595 */
1596 flag float32_lt_quiet( float32 a, float32 b )
1597 {
1598     flag aSign, bSign;
1599
1600     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1601          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1602        ) {
1603         /* Do nothing, even if NaN as we're quiet */
1604         return 0;
1605     }
1606     aSign = extractFloat32Sign( a );
1607     bSign = extractFloat32Sign( b );
1608     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1609     return ( a != b ) && ( aSign ^ ( a < b ) );
1610
1611 }
1612
1613 /*
1614 -------------------------------------------------------------------------------
1615 Returns the result of converting the double-precision floating-point value
1616 `a' to the 32-bit two's complement integer format.  The conversion is
1617 performed according to the IEC/IEEE Standard for Binary Floating-point
1618 Arithmetic---which means in particular that the conversion is rounded
1619 according to the current rounding mode.  If `a' is a NaN, the largest
1620 positive integer is returned.  Otherwise, if the conversion overflows, the
1621 largest integer with the same sign as `a' is returned.
1622 -------------------------------------------------------------------------------
1623 */
1624 int32 float64_to_int32( struct roundingData *roundData, float64 a )
1625 {
1626     flag aSign;
1627     int16 aExp, shiftCount;
1628     bits64 aSig;
1629
1630     aSig = extractFloat64Frac( a );
1631     aExp = extractFloat64Exp( a );
1632     aSign = extractFloat64Sign( a );
1633     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1634     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1635     shiftCount = 0x42C - aExp;
1636     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1637     return roundAndPackInt32( roundData, aSign, aSig );
1638
1639 }
1640
1641 /*
1642 -------------------------------------------------------------------------------
1643 Returns the result of converting the double-precision floating-point value
1644 `a' to the 32-bit two's complement integer format.  The conversion is
1645 performed according to the IEC/IEEE Standard for Binary Floating-point
1646 Arithmetic, except that the conversion is always rounded toward zero.  If
1647 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1648 conversion overflows, the largest integer with the same sign as `a' is
1649 returned.
1650 -------------------------------------------------------------------------------
1651 */
1652 int32 float64_to_int32_round_to_zero( float64 a )
1653 {
1654     flag aSign;
1655     int16 aExp, shiftCount;
1656     bits64 aSig, savedASig;
1657     int32 z;
1658
1659     aSig = extractFloat64Frac( a );
1660     aExp = extractFloat64Exp( a );
1661     aSign = extractFloat64Sign( a );
1662     shiftCount = 0x433 - aExp;
1663     if ( shiftCount < 21 ) {
1664         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1665         goto invalid;
1666     }
1667     else if ( 52 < shiftCount ) {
1668         if ( aExp || aSig ) float_raise( float_flag_inexact );
1669         return 0;
1670     }
1671     aSig |= LIT64( 0x0010000000000000 );
1672     savedASig = aSig;
1673     aSig >>= shiftCount;
1674     z = aSig;
1675     if ( aSign ) z = - z;
1676     if ( ( z < 0 ) ^ aSign ) {
1677  invalid:
1678         float_raise( float_flag_invalid );
1679         return aSign ? 0x80000000 : 0x7FFFFFFF;
1680     }
1681     if ( ( aSig<<shiftCount ) != savedASig ) {
1682         float_raise( float_flag_inexact );
1683     }
1684     return z;
1685
1686 }
1687
1688 /*
1689 -------------------------------------------------------------------------------
1690 Returns the result of converting the double-precision floating-point value
1691 `a' to the 32-bit two's complement unsigned integer format.  The conversion
1692 is performed according to the IEC/IEEE Standard for Binary Floating-point
1693 Arithmetic---which means in particular that the conversion is rounded
1694 according to the current rounding mode.  If `a' is a NaN, the largest
1695 positive integer is returned.  Otherwise, if the conversion overflows, the
1696 largest positive integer is returned.
1697 -------------------------------------------------------------------------------
1698 */
1699 int32 float64_to_uint32( struct roundingData *roundData, float64 a )
1700 {
1701     flag aSign;
1702     int16 aExp, shiftCount;
1703     bits64 aSig;
1704
1705     aSig = extractFloat64Frac( a );
1706     aExp = extractFloat64Exp( a );
1707     aSign = 0; //extractFloat64Sign( a );
1708     //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1709     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1710     shiftCount = 0x42C - aExp;
1711     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1712     return roundAndPackInt32( roundData, aSign, aSig );
1713 }
1714
1715 /*
1716 -------------------------------------------------------------------------------
1717 Returns the result of converting the double-precision floating-point value
1718 `a' to the 32-bit two's complement integer format.  The conversion is
1719 performed according to the IEC/IEEE Standard for Binary Floating-point
1720 Arithmetic, except that the conversion is always rounded toward zero.  If
1721 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1722 conversion overflows, the largest positive integer is returned.
1723 -------------------------------------------------------------------------------
1724 */
1725 int32 float64_to_uint32_round_to_zero( float64 a )
1726 {
1727     flag aSign;
1728     int16 aExp, shiftCount;
1729     bits64 aSig, savedASig;
1730     int32 z;
1731
1732     aSig = extractFloat64Frac( a );
1733     aExp = extractFloat64Exp( a );
1734     aSign = extractFloat64Sign( a );
1735     shiftCount = 0x433 - aExp;
1736     if ( shiftCount < 21 ) {
1737         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1738         goto invalid;
1739     }
1740     else if ( 52 < shiftCount ) {
1741         if ( aExp || aSig ) float_raise( float_flag_inexact );
1742         return 0;
1743     }
1744     aSig |= LIT64( 0x0010000000000000 );
1745     savedASig = aSig;
1746     aSig >>= shiftCount;
1747     z = aSig;
1748     if ( aSign ) z = - z;
1749     if ( ( z < 0 ) ^ aSign ) {
1750  invalid:
1751         float_raise( float_flag_invalid );
1752         return aSign ? 0x80000000 : 0x7FFFFFFF;
1753     }
1754     if ( ( aSig<<shiftCount ) != savedASig ) {
1755         float_raise( float_flag_inexact );
1756     }
1757     return z;
1758 }
1759
1760 /*
1761 -------------------------------------------------------------------------------
1762 Returns the result of converting the double-precision floating-point value
1763 `a' to the single-precision floating-point format.  The conversion is
1764 performed according to the IEC/IEEE Standard for Binary Floating-point
1765 Arithmetic.
1766 -------------------------------------------------------------------------------
1767 */
1768 float32 float64_to_float32( struct roundingData *roundData, float64 a )
1769 {
1770     flag aSign;
1771     int16 aExp;
1772     bits64 aSig;
1773     bits32 zSig;
1774
1775     aSig = extractFloat64Frac( a );
1776     aExp = extractFloat64Exp( a );
1777     aSign = extractFloat64Sign( a );
1778     if ( aExp == 0x7FF ) {
1779         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1780         return packFloat32( aSign, 0xFF, 0 );
1781     }
1782     shift64RightJamming( aSig, 22, &aSig );
1783     zSig = aSig;
1784     if ( aExp || zSig ) {
1785         zSig |= 0x40000000;
1786         aExp -= 0x381;
1787     }
1788     return roundAndPackFloat32( roundData, aSign, aExp, zSig );
1789
1790 }
1791
1792 #ifdef FLOATX80
1793
1794 /*
1795 -------------------------------------------------------------------------------
1796 Returns the result of converting the double-precision floating-point value
1797 `a' to the extended double-precision floating-point format.  The conversion
1798 is performed according to the IEC/IEEE Standard for Binary Floating-point
1799 Arithmetic.
1800 -------------------------------------------------------------------------------
1801 */
1802 floatx80 float64_to_floatx80( float64 a )
1803 {
1804     flag aSign;
1805     int16 aExp;
1806     bits64 aSig;
1807
1808     aSig = extractFloat64Frac( a );
1809     aExp = extractFloat64Exp( a );
1810     aSign = extractFloat64Sign( a );
1811     if ( aExp == 0x7FF ) {
1812         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1813         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1814     }
1815     if ( aExp == 0 ) {
1816         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1817         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1818     }
1819     return
1820         packFloatx80(
1821             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1822
1823 }
1824
1825 #endif
1826
1827 /*
1828 -------------------------------------------------------------------------------
1829 Rounds the double-precision floating-point value `a' to an integer, and
1830 returns the result as a double-precision floating-point value.  The
1831 operation is performed according to the IEC/IEEE Standard for Binary
1832 Floating-point Arithmetic.
1833 -------------------------------------------------------------------------------
1834 */
1835 float64 float64_round_to_int( struct roundingData *roundData, float64 a )
1836 {
1837     flag aSign;
1838     int16 aExp;
1839     bits64 lastBitMask, roundBitsMask;
1840     int8 roundingMode;
1841     float64 z;
1842
1843     aExp = extractFloat64Exp( a );
1844     if ( 0x433 <= aExp ) {
1845         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1846             return propagateFloat64NaN( a, a );
1847         }
1848         return a;
1849     }
1850     if ( aExp <= 0x3FE ) {
1851         if ( (bits64) ( a<<1 ) == 0 ) return a;
1852         roundData->exception |= float_flag_inexact;
1853         aSign = extractFloat64Sign( a );
1854         switch ( roundData->mode ) {
1855          case float_round_nearest_even:
1856             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1857                 return packFloat64( aSign, 0x3FF, 0 );
1858             }
1859             break;
1860          case float_round_down:
1861             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1862          case float_round_up:
1863             return
1864             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1865         }
1866         return packFloat64( aSign, 0, 0 );
1867     }
1868     lastBitMask = 1;
1869     lastBitMask <<= 0x433 - aExp;
1870     roundBitsMask = lastBitMask - 1;
1871     z = a;
1872     roundingMode = roundData->mode;
1873     if ( roundingMode == float_round_nearest_even ) {
1874         z += lastBitMask>>1;
1875         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1876     }
1877     else if ( roundingMode != float_round_to_zero ) {
1878         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1879             z += roundBitsMask;
1880         }
1881     }
1882     z &= ~ roundBitsMask;
1883     if ( z != a ) roundData->exception |= float_flag_inexact;
1884     return z;
1885
1886 }
1887
1888 /*
1889 -------------------------------------------------------------------------------
1890 Returns the result of adding the absolute values of the double-precision
1891 floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1892 before being returned.  `zSign' is ignored if the result is a NaN.  The
1893 addition is performed according to the IEC/IEEE Standard for Binary
1894 Floating-point Arithmetic.
1895 -------------------------------------------------------------------------------
1896 */
1897 static float64 addFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1898 {
1899     int16 aExp, bExp, zExp;
1900     bits64 aSig, bSig, zSig;
1901     int16 expDiff;
1902
1903     aSig = extractFloat64Frac( a );
1904     aExp = extractFloat64Exp( a );
1905     bSig = extractFloat64Frac( b );
1906     bExp = extractFloat64Exp( b );
1907     expDiff = aExp - bExp;
1908     aSig <<= 9;
1909     bSig <<= 9;
1910     if ( 0 < expDiff ) {
1911         if ( aExp == 0x7FF ) {
1912             if ( aSig ) return propagateFloat64NaN( a, b );
1913             return a;
1914         }
1915         if ( bExp == 0 ) {
1916             --expDiff;
1917         }
1918         else {
1919             bSig |= LIT64( 0x2000000000000000 );
1920         }
1921         shift64RightJamming( bSig, expDiff, &bSig );
1922         zExp = aExp;
1923     }
1924     else if ( expDiff < 0 ) {
1925         if ( bExp == 0x7FF ) {
1926             if ( bSig ) return propagateFloat64NaN( a, b );
1927             return packFloat64( zSign, 0x7FF, 0 );
1928         }
1929         if ( aExp == 0 ) {
1930             ++expDiff;
1931         }
1932         else {
1933             aSig |= LIT64( 0x2000000000000000 );
1934         }
1935         shift64RightJamming( aSig, - expDiff, &aSig );
1936         zExp = bExp;
1937     }
1938     else {
1939         if ( aExp == 0x7FF ) {
1940             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1941             return a;
1942         }
1943         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1944         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1945         zExp = aExp;
1946         goto roundAndPack;
1947     }
1948     aSig |= LIT64( 0x2000000000000000 );
1949     zSig = ( aSig + bSig )<<1;
1950     --zExp;
1951     if ( (sbits64) zSig < 0 ) {
1952         zSig = aSig + bSig;
1953         ++zExp;
1954     }
1955  roundAndPack:
1956     return roundAndPackFloat64( roundData, zSign, zExp, zSig );
1957
1958 }
1959
1960 /*
1961 -------------------------------------------------------------------------------
1962 Returns the result of subtracting the absolute values of the double-
1963 precision floating-point values `a' and `b'.  If `zSign' is true, the
1964 difference is negated before being returned.  `zSign' is ignored if the
1965 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1966 Standard for Binary Floating-point Arithmetic.
1967 -------------------------------------------------------------------------------
1968 */
1969 static float64 subFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1970 {
1971     int16 aExp, bExp, zExp;
1972     bits64 aSig, bSig, zSig;
1973     int16 expDiff;
1974
1975     aSig = extractFloat64Frac( a );
1976     aExp = extractFloat64Exp( a );
1977     bSig = extractFloat64Frac( b );
1978     bExp = extractFloat64Exp( b );
1979     expDiff = aExp - bExp;
1980     aSig <<= 10;
1981     bSig <<= 10;
1982     if ( 0 < expDiff ) goto aExpBigger;
1983     if ( expDiff < 0 ) goto bExpBigger;
1984     if ( aExp == 0x7FF ) {
1985         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1986         roundData->exception |= float_flag_invalid;
1987         return float64_default_nan;
1988     }
1989     if ( aExp == 0 ) {
1990         aExp = 1;
1991         bExp = 1;
1992     }
1993     if ( bSig < aSig ) goto aBigger;
1994     if ( aSig < bSig ) goto bBigger;
1995     return packFloat64( roundData->mode == float_round_down, 0, 0 );
1996  bExpBigger:
1997     if ( bExp == 0x7FF ) {
1998         if ( bSig ) return propagateFloat64NaN( a, b );
1999         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2000     }
2001     if ( aExp == 0 ) {
2002         ++expDiff;
2003     }
2004     else {
2005         aSig |= LIT64( 0x4000000000000000 );
2006     }
2007     shift64RightJamming( aSig, - expDiff, &aSig );
2008     bSig |= LIT64( 0x4000000000000000 );
2009  bBigger:
2010     zSig = bSig - aSig;
2011     zExp = bExp;
2012     zSign ^= 1;
2013     goto normalizeRoundAndPack;
2014  aExpBigger:
2015     if ( aExp == 0x7FF ) {
2016         if ( aSig ) return propagateFloat64NaN( a, b );
2017         return a;
2018     }
2019     if ( bExp == 0 ) {
2020         --expDiff;
2021     }
2022     else {
2023         bSig |= LIT64( 0x4000000000000000 );
2024     }
2025     shift64RightJamming( bSig, expDiff, &bSig );
2026     aSig |= LIT64( 0x4000000000000000 );
2027  aBigger:
2028     zSig = aSig - bSig;
2029     zExp = aExp;
2030  normalizeRoundAndPack:
2031     --zExp;
2032     return normalizeRoundAndPackFloat64( roundData, zSign, zExp, zSig );
2033
2034 }
2035
2036 /*
2037 -------------------------------------------------------------------------------
2038 Returns the result of adding the double-precision floating-point values `a'
2039 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2040 Binary Floating-point Arithmetic.
2041 -------------------------------------------------------------------------------
2042 */
2043 float64 float64_add( struct roundingData *roundData, float64 a, float64 b )
2044 {
2045     flag aSign, bSign;
2046
2047     aSign = extractFloat64Sign( a );
2048     bSign = extractFloat64Sign( b );
2049     if ( aSign == bSign ) {
2050         return addFloat64Sigs( roundData, a, b, aSign );
2051     }
2052     else {
2053         return subFloat64Sigs( roundData, a, b, aSign );
2054     }
2055
2056 }
2057
2058 /*
2059 -------------------------------------------------------------------------------
2060 Returns the result of subtracting the double-precision floating-point values
2061 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2062 for Binary Floating-point Arithmetic.
2063 -------------------------------------------------------------------------------
2064 */
2065 float64 float64_sub( struct roundingData *roundData, float64 a, float64 b )
2066 {
2067     flag aSign, bSign;
2068
2069     aSign = extractFloat64Sign( a );
2070     bSign = extractFloat64Sign( b );
2071     if ( aSign == bSign ) {
2072         return subFloat64Sigs( roundData, a, b, aSign );
2073     }
2074     else {
2075         return addFloat64Sigs( roundData, a, b, aSign );
2076     }
2077
2078 }
2079
2080 /*
2081 -------------------------------------------------------------------------------
2082 Returns the result of multiplying the double-precision floating-point values
2083 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2084 for Binary Floating-point Arithmetic.
2085 -------------------------------------------------------------------------------
2086 */
2087 float64 float64_mul( struct roundingData *roundData, float64 a, float64 b )
2088 {
2089     flag aSign, bSign, zSign;
2090     int16 aExp, bExp, zExp;
2091     bits64 aSig, bSig, zSig0, zSig1;
2092
2093     aSig = extractFloat64Frac( a );
2094     aExp = extractFloat64Exp( a );
2095     aSign = extractFloat64Sign( a );
2096     bSig = extractFloat64Frac( b );
2097     bExp = extractFloat64Exp( b );
2098     bSign = extractFloat64Sign( b );
2099     zSign = aSign ^ bSign;
2100     if ( aExp == 0x7FF ) {
2101         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2102             return propagateFloat64NaN( a, b );
2103         }
2104         if ( ( bExp | bSig ) == 0 ) {
2105             roundData->exception |= float_flag_invalid;
2106             return float64_default_nan;
2107         }
2108         return packFloat64( zSign, 0x7FF, 0 );
2109     }
2110     if ( bExp == 0x7FF ) {
2111         if ( bSig ) return propagateFloat64NaN( a, b );
2112         if ( ( aExp | aSig ) == 0 ) {
2113             roundData->exception |= float_flag_invalid;
2114             return float64_default_nan;
2115         }
2116         return packFloat64( zSign, 0x7FF, 0 );
2117     }
2118     if ( aExp == 0 ) {
2119         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2120         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2121     }
2122     if ( bExp == 0 ) {
2123         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2124         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2125     }
2126     zExp = aExp + bExp - 0x3FF;
2127     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2128     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2129     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2130     zSig0 |= ( zSig1 != 0 );
2131     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2132         zSig0 <<= 1;
2133         --zExp;
2134     }
2135     return roundAndPackFloat64( roundData, zSign, zExp, zSig0 );
2136
2137 }
2138
2139 /*
2140 -------------------------------------------------------------------------------
2141 Returns the result of dividing the double-precision floating-point value `a'
2142 by the corresponding value `b'.  The operation is performed according to
2143 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2144 -------------------------------------------------------------------------------
2145 */
2146 float64 float64_div( struct roundingData *roundData, float64 a, float64 b )
2147 {
2148     flag aSign, bSign, zSign;
2149     int16 aExp, bExp, zExp;
2150     bits64 aSig, bSig, zSig;
2151     bits64 rem0, rem1;
2152     bits64 term0, term1;
2153
2154     aSig = extractFloat64Frac( a );
2155     aExp = extractFloat64Exp( a );
2156     aSign = extractFloat64Sign( a );
2157     bSig = extractFloat64Frac( b );
2158     bExp = extractFloat64Exp( b );
2159     bSign = extractFloat64Sign( b );
2160     zSign = aSign ^ bSign;
2161     if ( aExp == 0x7FF ) {
2162         if ( aSig ) return propagateFloat64NaN( a, b );
2163         if ( bExp == 0x7FF ) {
2164             if ( bSig ) return propagateFloat64NaN( a, b );
2165             roundData->exception |= float_flag_invalid;
2166             return float64_default_nan;
2167         }
2168         return packFloat64( zSign, 0x7FF, 0 );
2169     }
2170     if ( bExp == 0x7FF ) {
2171         if ( bSig ) return propagateFloat64NaN( a, b );
2172         return packFloat64( zSign, 0, 0 );
2173     }
2174     if ( bExp == 0 ) {
2175         if ( bSig == 0 ) {
2176             if ( ( aExp | aSig ) == 0 ) {
2177                 roundData->exception |= float_flag_invalid;
2178                 return float64_default_nan;
2179             }
2180             roundData->exception |= float_flag_divbyzero;
2181             return packFloat64( zSign, 0x7FF, 0 );
2182         }
2183         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2184     }
2185     if ( aExp == 0 ) {
2186         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2187         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2188     }
2189     zExp = aExp - bExp + 0x3FD;
2190     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2191     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2192     if ( bSig <= ( aSig + aSig ) ) {
2193         aSig >>= 1;
2194         ++zExp;
2195     }
2196     zSig = estimateDiv128To64( aSig, 0, bSig );
2197     if ( ( zSig & 0x1FF ) <= 2 ) {
2198         mul64To128( bSig, zSig, &term0, &term1 );
2199         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2200         while ( (sbits64) rem0 < 0 ) {
2201             --zSig;
2202             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2203         }
2204         zSig |= ( rem1 != 0 );
2205     }
2206     return roundAndPackFloat64( roundData, zSign, zExp, zSig );
2207
2208 }
2209
2210 /*
2211 -------------------------------------------------------------------------------
2212 Returns the remainder of the double-precision floating-point value `a'
2213 with respect to the corresponding value `b'.  The operation is performed
2214 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2215 -------------------------------------------------------------------------------
2216 */
2217 float64 float64_rem( struct roundingData *roundData, float64 a, float64 b )
2218 {
2219     flag aSign, bSign, zSign;
2220     int16 aExp, bExp, expDiff;
2221     bits64 aSig, bSig;
2222     bits64 q, alternateASig;
2223     sbits64 sigMean;
2224
2225     aSig = extractFloat64Frac( a );
2226     aExp = extractFloat64Exp( a );
2227     aSign = extractFloat64Sign( a );
2228     bSig = extractFloat64Frac( b );
2229     bExp = extractFloat64Exp( b );
2230     bSign = extractFloat64Sign( b );
2231     if ( aExp == 0x7FF ) {
2232         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2233             return propagateFloat64NaN( a, b );
2234         }
2235         roundData->exception |= float_flag_invalid;
2236         return float64_default_nan;
2237     }
2238     if ( bExp == 0x7FF ) {
2239         if ( bSig ) return propagateFloat64NaN( a, b );
2240         return a;
2241     }
2242     if ( bExp == 0 ) {
2243         if ( bSig == 0 ) {
2244             roundData->exception |= float_flag_invalid;
2245             return float64_default_nan;
2246         }
2247         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2248     }
2249     if ( aExp == 0 ) {
2250         if ( aSig == 0 ) return a;
2251         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2252     }
2253     expDiff = aExp - bExp;
2254     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2255     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2256     if ( expDiff < 0 ) {
2257         if ( expDiff < -1 ) return a;
2258         aSig >>= 1;
2259     }
2260     q = ( bSig <= aSig );
2261     if ( q ) aSig -= bSig;
2262     expDiff -= 64;
2263     while ( 0 < expDiff ) {
2264         q = estimateDiv128To64( aSig, 0, bSig );
2265         q = ( 2 < q ) ? q - 2 : 0;
2266         aSig = - ( ( bSig>>2 ) * q );
2267         expDiff -= 62;
2268     }
2269     expDiff += 64;
2270     if ( 0 < expDiff ) {
2271         q = estimateDiv128To64( aSig, 0, bSig );
2272         q = ( 2 < q ) ? q - 2 : 0;
2273         q >>= 64 - expDiff;
2274         bSig >>= 2;
2275         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2276     }
2277     else {
2278         aSig >>= 2;
2279         bSig >>= 2;
2280     }
2281     do {
2282         alternateASig = aSig;
2283         ++q;
2284         aSig -= bSig;
2285     } while ( 0 <= (sbits64) aSig );
2286     sigMean = aSig + alternateASig;
2287     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2288         aSig = alternateASig;
2289     }
2290     zSign = ( (sbits64) aSig < 0 );
2291     if ( zSign ) aSig = - aSig;
2292     return normalizeRoundAndPackFloat64( roundData, aSign ^ zSign, bExp, aSig );
2293
2294 }
2295
2296 /*
2297 -------------------------------------------------------------------------------
2298 Returns the square root of the double-precision floating-point value `a'.
2299 The operation is performed according to the IEC/IEEE Standard for Binary
2300 Floating-point Arithmetic.
2301 -------------------------------------------------------------------------------
2302 */
2303 float64 float64_sqrt( struct roundingData *roundData, float64 a )
2304 {
2305     flag aSign;
2306     int16 aExp, zExp;
2307     bits64 aSig, zSig;
2308     bits64 rem0, rem1, term0, term1; //, shiftedRem;
2309     //float64 z;
2310
2311     aSig = extractFloat64Frac( a );
2312     aExp = extractFloat64Exp( a );
2313     aSign = extractFloat64Sign( a );
2314     if ( aExp == 0x7FF ) {
2315         if ( aSig ) return propagateFloat64NaN( a, a );
2316         if ( ! aSign ) return a;
2317         roundData->exception |= float_flag_invalid;
2318         return float64_default_nan;
2319     }
2320     if ( aSign ) {
2321         if ( ( aExp | aSig ) == 0 ) return a;
2322         roundData->exception |= float_flag_invalid;
2323         return float64_default_nan;
2324     }
2325     if ( aExp == 0 ) {
2326         if ( aSig == 0 ) return 0;
2327         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2328     }
2329     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2330     aSig |= LIT64( 0x0010000000000000 );
2331     zSig = estimateSqrt32( aExp, aSig>>21 );
2332     zSig <<= 31;
2333     aSig <<= 9 - ( aExp & 1 );
2334     zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2335     if ( ( zSig & 0x3FF ) <= 5 ) {
2336         if ( zSig < 2 ) {
2337             zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2338         }
2339         else {
2340             aSig <<= 2;
2341             mul64To128( zSig, zSig, &term0, &term1 );
2342             sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2343             while ( (sbits64) rem0 < 0 ) {
2344                 --zSig;
2345                 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2346                 term1 |= 1;
2347                 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2348             }
2349             zSig |= ( ( rem0 | rem1 ) != 0 );
2350         }
2351     }
2352     shift64RightJamming( zSig, 1, &zSig );
2353     return roundAndPackFloat64( roundData, 0, zExp, zSig );
2354
2355 }
2356
2357 /*
2358 -------------------------------------------------------------------------------
2359 Returns 1 if the double-precision floating-point value `a' is equal to the
2360 corresponding value `b', and 0 otherwise.  The comparison is performed
2361 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2362 -------------------------------------------------------------------------------
2363 */
2364 flag float64_eq( float64 a, float64 b )
2365 {
2366
2367     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2368          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2369        ) {
2370         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2371             float_raise( float_flag_invalid );
2372         }
2373         return 0;
2374     }
2375     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2376
2377 }
2378
2379 /*
2380 -------------------------------------------------------------------------------
2381 Returns 1 if the double-precision floating-point value `a' is less than or
2382 equal to the corresponding value `b', and 0 otherwise.  The comparison is
2383 performed according to the IEC/IEEE Standard for Binary Floating-point
2384 Arithmetic.
2385 -------------------------------------------------------------------------------
2386 */
2387 flag float64_le( float64 a, float64 b )
2388 {
2389     flag aSign, bSign;
2390
2391     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2392          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2393        ) {
2394         float_raise( float_flag_invalid );
2395         return 0;
2396     }
2397     aSign = extractFloat64Sign( a );
2398     bSign = extractFloat64Sign( b );
2399     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2400     return ( a == b ) || ( aSign ^ ( a < b ) );
2401
2402 }
2403
2404 /*
2405 -------------------------------------------------------------------------------
2406 Returns 1 if the double-precision floating-point value `a' is less than
2407 the corresponding value `b', and 0 otherwise.  The comparison is performed
2408 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2409 -------------------------------------------------------------------------------
2410 */
2411 flag float64_lt( float64 a, float64 b )
2412 {
2413     flag aSign, bSign;
2414
2415     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2416          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2417        ) {
2418         float_raise( float_flag_invalid );
2419         return 0;
2420     }
2421     aSign = extractFloat64Sign( a );
2422     bSign = extractFloat64Sign( b );
2423     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2424     return ( a != b ) && ( aSign ^ ( a < b ) );
2425
2426 }
2427
2428 /*
2429 -------------------------------------------------------------------------------
2430 Returns 1 if the double-precision floating-point value `a' is equal to the
2431 corresponding value `b', and 0 otherwise.  The invalid exception is raised
2432 if either operand is a NaN.  Otherwise, the comparison is performed
2433 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2434 -------------------------------------------------------------------------------
2435 */
2436 flag float64_eq_signaling( float64 a, float64 b )
2437 {
2438
2439     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2440          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2441        ) {
2442         float_raise( float_flag_invalid );
2443         return 0;
2444     }
2445     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2446
2447 }
2448
2449 /*
2450 -------------------------------------------------------------------------------
2451 Returns 1 if the double-precision floating-point value `a' is less than or
2452 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2453 cause an exception.  Otherwise, the comparison is performed according to the
2454 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2455 -------------------------------------------------------------------------------
2456 */
2457 flag float64_le_quiet( float64 a, float64 b )
2458 {
2459     flag aSign, bSign;
2460     //int16 aExp, bExp;
2461
2462     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2463          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2464        ) {
2465         /* Do nothing, even if NaN as we're quiet */
2466         return 0;
2467     }
2468     aSign = extractFloat64Sign( a );
2469     bSign = extractFloat64Sign( b );
2470     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2471     return ( a == b ) || ( aSign ^ ( a < b ) );
2472
2473 }
2474
2475 /*
2476 -------------------------------------------------------------------------------
2477 Returns 1 if the double-precision floating-point value `a' is less than
2478 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2479 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2480 Standard for Binary Floating-point Arithmetic.
2481 -------------------------------------------------------------------------------
2482 */
2483 flag float64_lt_quiet( float64 a, float64 b )
2484 {
2485     flag aSign, bSign;
2486
2487     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2488          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2489        ) {
2490         /* Do nothing, even if NaN as we're quiet */
2491         return 0;
2492     }
2493     aSign = extractFloat64Sign( a );
2494     bSign = extractFloat64Sign( b );
2495     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2496     return ( a != b ) && ( aSign ^ ( a < b ) );
2497
2498 }
2499
2500 #ifdef FLOATX80
2501
2502 /*
2503 -------------------------------------------------------------------------------
2504 Returns the result of converting the extended double-precision floating-
2505 point value `a' to the 32-bit two's complement integer format.  The
2506 conversion is performed according to the IEC/IEEE Standard for Binary
2507 Floating-point Arithmetic---which means in particular that the conversion
2508 is rounded according to the current rounding mode.  If `a' is a NaN, the
2509 largest positive integer is returned.  Otherwise, if the conversion
2510 overflows, the largest integer with the same sign as `a' is returned.
2511 -------------------------------------------------------------------------------
2512 */
2513 int32 floatx80_to_int32( struct roundingData *roundData, floatx80 a )
2514 {
2515     flag aSign;
2516     int32 aExp, shiftCount;
2517     bits64 aSig;
2518
2519     aSig = extractFloatx80Frac( a );
2520     aExp = extractFloatx80Exp( a );
2521     aSign = extractFloatx80Sign( a );
2522     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2523     shiftCount = 0x4037 - aExp;
2524     if ( shiftCount <= 0 ) shiftCount = 1;
2525     shift64RightJamming( aSig, shiftCount, &aSig );
2526     return roundAndPackInt32( roundData, aSign, aSig );
2527
2528 }
2529
2530 /*
2531 -------------------------------------------------------------------------------
2532 Returns the result of converting the extended double-precision floating-
2533 point value `a' to the 32-bit two's complement integer format.  The
2534 conversion is performed according to the IEC/IEEE Standard for Binary
2535 Floating-point Arithmetic, except that the conversion is always rounded
2536 toward zero.  If `a' is a NaN, the largest positive integer is returned.
2537 Otherwise, if the conversion overflows, the largest integer with the same
2538 sign as `a' is returned.
2539 -------------------------------------------------------------------------------
2540 */
2541 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2542 {
2543     flag aSign;
2544     int32 aExp, shiftCount;
2545     bits64 aSig, savedASig;
2546     int32 z;
2547
2548     aSig = extractFloatx80Frac( a );
2549     aExp = extractFloatx80Exp( a );
2550     aSign = extractFloatx80Sign( a );
2551     shiftCount = 0x403E - aExp;
2552     if ( shiftCount < 32 ) {
2553         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2554         goto invalid;
2555     }
2556     else if ( 63 < shiftCount ) {
2557         if ( aExp || aSig ) float_raise( float_flag_inexact );
2558         return 0;
2559     }
2560     savedASig = aSig;
2561     aSig >>= shiftCount;
2562     z = aSig;
2563     if ( aSign ) z = - z;
2564     if ( ( z < 0 ) ^ aSign ) {
2565  invalid:
2566         float_raise( float_flag_invalid );
2567         return aSign ? 0x80000000 : 0x7FFFFFFF;
2568     }
2569     if ( ( aSig<<shiftCount ) != savedASig ) {
2570         float_raise( float_flag_inexact );
2571     }
2572     return z;
2573
2574 }
2575
2576 /*
2577 -------------------------------------------------------------------------------
2578 Returns the result of converting the extended double-precision floating-
2579 point value `a' to the single-precision floating-point format.  The
2580 conversion is performed according to the IEC/IEEE Standard for Binary
2581 Floating-point Arithmetic.
2582 -------------------------------------------------------------------------------
2583 */
2584 float32 floatx80_to_float32( struct roundingData *roundData, floatx80 a )
2585 {
2586     flag aSign;
2587     int32 aExp;
2588     bits64 aSig;
2589
2590     aSig = extractFloatx80Frac( a );
2591     aExp = extractFloatx80Exp( a );
2592     aSign = extractFloatx80Sign( a );
2593     if ( aExp == 0x7FFF ) {
2594         if ( (bits64) ( aSig<<1 ) ) {
2595             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2596         }
2597         return packFloat32( aSign, 0xFF, 0 );
2598     }
2599     shift64RightJamming( aSig, 33, &aSig );
2600     if ( aExp || aSig ) aExp -= 0x3F81;
2601     return roundAndPackFloat32( roundData, aSign, aExp, aSig );
2602
2603 }
2604
2605 /*
2606 -------------------------------------------------------------------------------
2607 Returns the result of converting the extended double-precision floating-
2608 point value `a' to the double-precision floating-point format.  The
2609 conversion is performed according to the IEC/IEEE Standard for Binary
2610 Floating-point Arithmetic.
2611 -------------------------------------------------------------------------------
2612 */
2613 float64 floatx80_to_float64( struct roundingData *roundData, floatx80 a )
2614 {
2615     flag aSign;
2616     int32 aExp;
2617     bits64 aSig, zSig;
2618
2619     aSig = extractFloatx80Frac( a );
2620     aExp = extractFloatx80Exp( a );
2621     aSign = extractFloatx80Sign( a );
2622     if ( aExp == 0x7FFF ) {
2623         if ( (bits64) ( aSig<<1 ) ) {
2624             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2625         }
2626         return packFloat64( aSign, 0x7FF, 0 );
2627     }
2628     shift64RightJamming( aSig, 1, &zSig );
2629     if ( aExp || aSig ) aExp -= 0x3C01;
2630     return roundAndPackFloat64( roundData, aSign, aExp, zSig );
2631
2632 }
2633
2634 /*
2635 -------------------------------------------------------------------------------
2636 Rounds the extended double-precision floating-point value `a' to an integer,
2637 and returns the result as an extended quadruple-precision floating-point
2638 value.  The operation is performed according to the IEC/IEEE Standard for
2639 Binary Floating-point Arithmetic.
2640 -------------------------------------------------------------------------------
2641 */
2642 floatx80 floatx80_round_to_int( struct roundingData *roundData, floatx80 a )
2643 {
2644     flag aSign;
2645     int32 aExp;
2646     bits64 lastBitMask, roundBitsMask;
2647     int8 roundingMode;
2648     floatx80 z;
2649
2650     aExp = extractFloatx80Exp( a );
2651     if ( 0x403E <= aExp ) {
2652         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2653             return propagateFloatx80NaN( a, a );
2654         }
2655         return a;
2656     }
2657     if ( aExp <= 0x3FFE ) {
2658         if (    ( aExp == 0 )
2659              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2660             return a;
2661         }
2662         roundData->exception |= float_flag_inexact;
2663         aSign = extractFloatx80Sign( a );
2664         switch ( roundData->mode ) {
2665          case float_round_nearest_even:
2666             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2667                ) {
2668                 return
2669                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2670             }
2671             break;
2672          case float_round_down:
2673             return
2674                   aSign ?
2675                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2676                 : packFloatx80( 0, 0, 0 );
2677          case float_round_up:
2678             return
2679                   aSign ? packFloatx80( 1, 0, 0 )
2680                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2681         }
2682         return packFloatx80( aSign, 0, 0 );
2683     }
2684     lastBitMask = 1;
2685     lastBitMask <<= 0x403E - aExp;
2686     roundBitsMask = lastBitMask - 1;
2687     z = a;
2688     roundingMode = roundData->mode;
2689     if ( roundingMode == float_round_nearest_even ) {
2690         z.low += lastBitMask>>1;
2691         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2692     }
2693     else if ( roundingMode != float_round_to_zero ) {
2694         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2695             z.low += roundBitsMask;
2696         }
2697     }
2698     z.low &= ~ roundBitsMask;
2699     if ( z.low == 0 ) {
2700         ++z.high;
2701         z.low = LIT64( 0x8000000000000000 );
2702     }
2703     if ( z.low != a.low ) roundData->exception |= float_flag_inexact;
2704     return z;
2705
2706 }
2707
2708 /*
2709 -------------------------------------------------------------------------------
2710 Returns the result of adding the absolute values of the extended double-
2711 precision floating-point values `a' and `b'.  If `zSign' is true, the sum is
2712 negated before being returned.  `zSign' is ignored if the result is a NaN.
2713 The addition is performed according to the IEC/IEEE Standard for Binary
2714 Floating-point Arithmetic.
2715 -------------------------------------------------------------------------------
2716 */
2717 static floatx80 addFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2718 {
2719     int32 aExp, bExp, zExp;
2720     bits64 aSig, bSig, zSig0, zSig1;
2721     int32 expDiff;
2722
2723     aSig = extractFloatx80Frac( a );
2724     aExp = extractFloatx80Exp( a );
2725     bSig = extractFloatx80Frac( b );
2726     bExp = extractFloatx80Exp( b );
2727     expDiff = aExp - bExp;
2728     if ( 0 < expDiff ) {
2729         if ( aExp == 0x7FFF ) {
2730             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2731             return a;
2732         }
2733         if ( bExp == 0 ) --expDiff;
2734         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2735         zExp = aExp;
2736     }
2737     else if ( expDiff < 0 ) {
2738         if ( bExp == 0x7FFF ) {
2739             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2740             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2741         }
2742         if ( aExp == 0 ) ++expDiff;
2743         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2744         zExp = bExp;
2745     }
2746     else {
2747         if ( aExp == 0x7FFF ) {
2748             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2749                 return propagateFloatx80NaN( a, b );
2750             }
2751             return a;
2752         }
2753         zSig1 = 0;
2754         zSig0 = aSig + bSig;
2755         if ( aExp == 0 ) {
2756             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2757             goto roundAndPack;
2758         }
2759         zExp = aExp;
2760         goto shiftRight1;
2761     }
2762
2763     zSig0 = aSig + bSig;
2764
2765     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2766  shiftRight1:
2767     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2768     zSig0 |= LIT64( 0x8000000000000000 );
2769     ++zExp;
2770  roundAndPack:
2771     return
2772         roundAndPackFloatx80(
2773             roundData, zSign, zExp, zSig0, zSig1 );
2774
2775 }
2776
2777 /*
2778 -------------------------------------------------------------------------------
2779 Returns the result of subtracting the absolute values of the extended
2780 double-precision floating-point values `a' and `b'.  If `zSign' is true,
2781 the difference is negated before being returned.  `zSign' is ignored if the
2782 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2783 Standard for Binary Floating-point Arithmetic.
2784 -------------------------------------------------------------------------------
2785 */
2786 static floatx80 subFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2787 {
2788     int32 aExp, bExp, zExp;
2789     bits64 aSig, bSig, zSig0, zSig1;
2790     int32 expDiff;
2791     floatx80 z;
2792
2793     aSig = extractFloatx80Frac( a );
2794     aExp = extractFloatx80Exp( a );
2795     bSig = extractFloatx80Frac( b );
2796     bExp = extractFloatx80Exp( b );
2797     expDiff = aExp - bExp;
2798     if ( 0 < expDiff ) goto aExpBigger;
2799     if ( expDiff < 0 ) goto bExpBigger;
2800     if ( aExp == 0x7FFF ) {
2801         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2802             return propagateFloatx80NaN( a, b );
2803         }
2804         roundData->exception |= float_flag_invalid;
2805         z.low = floatx80_default_nan_low;
2806         z.high = floatx80_default_nan_high;
2807         z.__padding = 0;
2808         return z;
2809     }
2810     if ( aExp == 0 ) {
2811         aExp = 1;
2812         bExp = 1;
2813     }
2814     zSig1 = 0;
2815     if ( bSig < aSig ) goto aBigger;
2816     if ( aSig < bSig ) goto bBigger;
2817     return packFloatx80( roundData->mode == float_round_down, 0, 0 );
2818  bExpBigger:
2819     if ( bExp == 0x7FFF ) {
2820         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2821         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2822     }
2823     if ( aExp == 0 ) ++expDiff;
2824     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2825  bBigger:
2826     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2827     zExp = bExp;
2828     zSign ^= 1;
2829     goto normalizeRoundAndPack;
2830  aExpBigger:
2831     if ( aExp == 0x7FFF ) {
2832         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2833         return a;
2834     }
2835     if ( bExp == 0 ) --expDiff;
2836     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2837  aBigger:
2838     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2839     zExp = aExp;
2840  normalizeRoundAndPack:
2841     return
2842         normalizeRoundAndPackFloatx80(
2843             roundData, zSign, zExp, zSig0, zSig1 );
2844
2845 }
2846
2847 /*
2848 -------------------------------------------------------------------------------
2849 Returns the result of adding the extended double-precision floating-point
2850 values `a' and `b'.  The operation is performed according to the IEC/IEEE
2851 Standard for Binary Floating-point Arithmetic.
2852 -------------------------------------------------------------------------------
2853 */
2854 floatx80 floatx80_add( struct roundingData *roundData, floatx80 a, floatx80 b )
2855 {
2856     flag aSign, bSign;
2857
2858     aSign = extractFloatx80Sign( a );
2859     bSign = extractFloatx80Sign( b );
2860     if ( aSign == bSign ) {
2861         return addFloatx80Sigs( roundData, a, b, aSign );
2862     }
2863     else {
2864         return subFloatx80Sigs( roundData, a, b, aSign );
2865     }
2866
2867 }
2868
2869 /*
2870 -------------------------------------------------------------------------------
2871 Returns the result of subtracting the extended double-precision floating-
2872 point values `a' and `b'.  The operation is performed according to the
2873 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2874 -------------------------------------------------------------------------------
2875 */
2876 floatx80 floatx80_sub( struct roundingData *roundData, floatx80 a, floatx80 b )
2877 {
2878     flag aSign, bSign;
2879
2880     aSign = extractFloatx80Sign( a );
2881     bSign = extractFloatx80Sign( b );
2882     if ( aSign == bSign ) {
2883         return subFloatx80Sigs( roundData, a, b, aSign );
2884     }
2885     else {
2886         return addFloatx80Sigs( roundData, a, b, aSign );
2887     }
2888
2889 }
2890
2891 /*
2892 -------------------------------------------------------------------------------
2893 Returns the result of multiplying the extended double-precision floating-
2894 point values `a' and `b'.  The operation is performed according to the
2895 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2896 -------------------------------------------------------------------------------
2897 */
2898 floatx80 floatx80_mul( struct roundingData *roundData, floatx80 a, floatx80 b )
2899 {
2900     flag aSign, bSign, zSign;
2901     int32 aExp, bExp, zExp;
2902     bits64 aSig, bSig, zSig0, zSig1;
2903     floatx80 z;
2904
2905     aSig = extractFloatx80Frac( a );
2906     aExp = extractFloatx80Exp( a );
2907     aSign = extractFloatx80Sign( a );
2908     bSig = extractFloatx80Frac( b );
2909     bExp = extractFloatx80Exp( b );
2910     bSign = extractFloatx80Sign( b );
2911     zSign = aSign ^ bSign;
2912     if ( aExp == 0x7FFF ) {
2913         if (    (bits64) ( aSig<<1 )
2914              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2915             return propagateFloatx80NaN( a, b );
2916         }
2917         if ( ( bExp | bSig ) == 0 ) goto invalid;
2918         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2919     }
2920     if ( bExp == 0x7FFF ) {
2921         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2922         if ( ( aExp | aSig ) == 0 ) {
2923  invalid:
2924             roundData->exception |= float_flag_invalid;
2925             z.low = floatx80_default_nan_low;
2926             z.high = floatx80_default_nan_high;
2927             z.__padding = 0;
2928             return z;
2929         }
2930         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2931     }
2932     if ( aExp == 0 ) {
2933         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2934         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2935     }
2936     if ( bExp == 0 ) {
2937         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2938         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2939     }
2940     zExp = aExp + bExp - 0x3FFE;
2941     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2942     if ( 0 < (sbits64) zSig0 ) {
2943         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2944         --zExp;
2945     }
2946     return
2947         roundAndPackFloatx80(
2948             roundData, zSign, zExp, zSig0, zSig1 );
2949
2950 }
2951
2952 /*
2953 -------------------------------------------------------------------------------
2954 Returns the result of dividing the extended double-precision floating-point
2955 value `a' by the corresponding value `b'.  The operation is performed
2956 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2957 -------------------------------------------------------------------------------
2958 */
2959 floatx80 floatx80_div( struct roundingData *roundData, floatx80 a, floatx80 b )
2960 {
2961     flag aSign, bSign, zSign;
2962     int32 aExp, bExp, zExp;
2963     bits64 aSig, bSig, zSig0, zSig1;
2964     bits64 rem0, rem1, rem2, term0, term1, term2;
2965     floatx80 z;
2966
2967     aSig = extractFloatx80Frac( a );
2968     aExp = extractFloatx80Exp( a );
2969     aSign = extractFloatx80Sign( a );
2970     bSig = extractFloatx80Frac( b );
2971     bExp = extractFloatx80Exp( b );
2972     bSign = extractFloatx80Sign( b );
2973     zSign = aSign ^ bSign;
2974     if ( aExp == 0x7FFF ) {
2975         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2976         if ( bExp == 0x7FFF ) {
2977             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2978             goto invalid;
2979         }
2980         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2981     }
2982     if ( bExp == 0x7FFF ) {
2983         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2984         return packFloatx80( zSign, 0, 0 );
2985     }
2986     if ( bExp == 0 ) {
2987         if ( bSig == 0 ) {
2988             if ( ( aExp | aSig ) == 0 ) {
2989  invalid:
2990                 roundData->exception |= float_flag_invalid;
2991                 z.low = floatx80_default_nan_low;
2992                 z.high = floatx80_default_nan_high;
2993                 z.__padding = 0;
2994                 return z;
2995             }
2996             roundData->exception |= float_flag_divbyzero;
2997             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2998         }
2999         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3000     }
3001     if ( aExp == 0 ) {
3002         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3003         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3004     }
3005     zExp = aExp - bExp + 0x3FFE;
3006     rem1 = 0;
3007     if ( bSig <= aSig ) {
3008         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3009         ++zExp;
3010     }
3011     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3012     mul64To128( bSig, zSig0, &term0, &term1 );
3013     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3014     while ( (sbits64) rem0 < 0 ) {
3015         --zSig0;
3016         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3017     }
3018     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3019     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3020         mul64To128( bSig, zSig1, &term1, &term2 );
3021         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3022         while ( (sbits64) rem1 < 0 ) {
3023             --zSig1;
3024             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3025         }
3026         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3027     }
3028     return
3029         roundAndPackFloatx80(
3030             roundData, zSign, zExp, zSig0, zSig1 );
3031
3032 }
3033
3034 /*
3035 -------------------------------------------------------------------------------
3036 Returns the remainder of the extended double-precision floating-point value
3037 `a' with respect to the corresponding value `b'.  The operation is performed
3038 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3039 -------------------------------------------------------------------------------
3040 */
3041 floatx80 floatx80_rem( struct roundingData *roundData, floatx80 a, floatx80 b )
3042 {
3043     flag aSign, bSign, zSign;
3044     int32 aExp, bExp, expDiff;
3045     bits64 aSig0, aSig1, bSig;
3046     bits64 q, term0, term1, alternateASig0, alternateASig1;
3047     floatx80 z;
3048
3049     aSig0 = extractFloatx80Frac( a );
3050     aExp = extractFloatx80Exp( a );
3051     aSign = extractFloatx80Sign( a );
3052     bSig = extractFloatx80Frac( b );
3053     bExp = extractFloatx80Exp( b );
3054     bSign = extractFloatx80Sign( b );
3055     if ( aExp == 0x7FFF ) {
3056         if (    (bits64) ( aSig0<<1 )
3057              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3058             return propagateFloatx80NaN( a, b );
3059         }
3060         goto invalid;
3061     }
3062     if ( bExp == 0x7FFF ) {
3063         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3064         return a;
3065     }
3066     if ( bExp == 0 ) {
3067         if ( bSig == 0 ) {
3068  invalid:
3069             roundData->exception |= float_flag_invalid;
3070             z.low = floatx80_default_nan_low;
3071             z.high = floatx80_default_nan_high;
3072             z.__padding = 0;
3073             return z;
3074         }
3075         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3076     }
3077     if ( aExp == 0 ) {
3078         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3079         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3080     }
3081     bSig |= LIT64( 0x8000000000000000 );
3082     zSign = aSign;
3083     expDiff = aExp - bExp;
3084     aSig1 = 0;
3085     if ( expDiff < 0 ) {
3086         if ( expDiff < -1 ) return a;
3087         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3088         expDiff = 0;
3089     }
3090     q = ( bSig <= aSig0 );
3091     if ( q ) aSig0 -= bSig;
3092     expDiff -= 64;
3093     while ( 0 < expDiff ) {
3094         q = estimateDiv128To64( aSig0, aSig1, bSig );
3095         q = ( 2 < q ) ? q - 2 : 0;
3096         mul64To128( bSig, q, &term0, &term1 );
3097         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3098         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3099         expDiff -= 62;
3100     }
3101     expDiff += 64;
3102     if ( 0 < expDiff ) {
3103         q = estimateDiv128To64( aSig0, aSig1, bSig );
3104         q = ( 2 < q ) ? q - 2 : 0;
3105         q >>= 64 - expDiff;
3106         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3107         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3108         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3109         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3110             ++q;
3111             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3112         }
3113     }
3114     else {
3115         term1 = 0;
3116         term0 = bSig;
3117     }
3118     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3119     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3120          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3121               && ( q & 1 ) )
3122        ) {
3123         aSig0 = alternateASig0;
3124         aSig1 = alternateASig1;
3125         zSign = ! zSign;
3126     }
3127
3128     return
3129         normalizeRoundAndPackFloatx80(
3130             roundData, zSign, bExp + expDiff, aSig0, aSig1 );
3131
3132 }
3133
3134 /*
3135 -------------------------------------------------------------------------------
3136 Returns the square root of the extended double-precision floating-point
3137 value `a'.  The operation is performed according to the IEC/IEEE Standard
3138 for Binary Floating-point Arithmetic.
3139 -------------------------------------------------------------------------------
3140 */
3141 floatx80 floatx80_sqrt( struct roundingData *roundData, floatx80 a )
3142 {
3143     flag aSign;
3144     int32 aExp, zExp;
3145     bits64 aSig0, aSig1, zSig0, zSig1;
3146     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3147     bits64 shiftedRem0, shiftedRem1;
3148     floatx80 z;
3149
3150     aSig0 = extractFloatx80Frac( a );
3151     aExp = extractFloatx80Exp( a );
3152     aSign = extractFloatx80Sign( a );
3153     if ( aExp == 0x7FFF ) {
3154         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3155         if ( ! aSign ) return a;
3156         goto invalid;
3157     }
3158     if ( aSign ) {
3159         if ( ( aExp | aSig0 ) == 0 ) return a;
3160  invalid:
3161         roundData->exception |= float_flag_invalid;
3162         z.low = floatx80_default_nan_low;
3163         z.high = floatx80_default_nan_high;
3164         z.__padding = 0;
3165         return z;
3166     }
3167     if ( aExp == 0 ) {
3168         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3169         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3170     }
3171     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3172     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3173     zSig0 <<= 31;
3174     aSig1 = 0;
3175     shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3176     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3177     if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3178     shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3179     mul64To128( zSig0, zSig0, &term0, &term1 );
3180     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3181     while ( (sbits64) rem0 < 0 ) {
3182         --zSig0;
3183         shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3184         term1 |= 1;
3185         add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3186     }
3187     shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3188     zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3189     if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3190         if ( zSig1 == 0 ) zSig1 = 1;
3191         mul64To128( zSig0, zSig1, &term1, &term2 );
3192         shortShift128Left( term1, term2, 1, &term1, &term2 );
3193         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3194         mul64To128( zSig1, zSig1, &term2, &term3 );
3195         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3196         while ( (sbits64) rem1 < 0 ) {
3197             --zSig1;
3198             shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3199             term3 |= 1;
3200             add192(
3201                 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3202         }
3203         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3204     }
3205     return
3206         roundAndPackFloatx80(
3207             roundData, 0, zExp, zSig0, zSig1 );
3208
3209 }
3210
3211 /*
3212 -------------------------------------------------------------------------------
3213 Returns 1 if the extended double-precision floating-point value `a' is
3214 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3215 performed according to the IEC/IEEE Standard for Binary Floating-point
3216 Arithmetic.
3217 -------------------------------------------------------------------------------
3218 */
3219 flag floatx80_eq( floatx80 a, floatx80 b )
3220 {
3221
3222     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3223               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3224          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3225               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3226        ) {
3227         if (    floatx80_is_signaling_nan( a )
3228              || floatx80_is_signaling_nan( b ) ) {
3229             float_raise( float_flag_invalid );
3230         }
3231         return 0;
3232     }
3233     return
3234            ( a.low == b.low )
3235         && (    ( a.high == b.high )
3236              || (    ( a.low == 0 )
3237                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3238            );
3239
3240 }
3241
3242 /*
3243 -------------------------------------------------------------------------------
3244 Returns 1 if the extended double-precision floating-point value `a' is
3245 less than or equal to the corresponding value `b', and 0 otherwise.  The
3246 comparison is performed according to the IEC/IEEE Standard for Binary
3247 Floating-point Arithmetic.
3248 -------------------------------------------------------------------------------
3249 */
3250 flag floatx80_le( floatx80 a, floatx80 b )
3251 {
3252     flag aSign, bSign;
3253
3254     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3255               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3256          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3257               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3258        ) {
3259         float_raise( float_flag_invalid );
3260         return 0;
3261     }
3262     aSign = extractFloatx80Sign( a );
3263     bSign = extractFloatx80Sign( b );
3264     if ( aSign != bSign ) {
3265         return
3266                aSign
3267             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3268                  == 0 );
3269     }
3270     return
3271           aSign ? le128( b.high, b.low, a.high, a.low )
3272         : le128( a.high, a.low, b.high, b.low );
3273
3274 }
3275
3276 /*
3277 -------------------------------------------------------------------------------
3278 Returns 1 if the extended double-precision floating-point value `a' is
3279 less than the corresponding value `b', and 0 otherwise.  The comparison
3280 is performed according to the IEC/IEEE Standard for Binary Floating-point
3281 Arithmetic.
3282 -------------------------------------------------------------------------------
3283 */
3284 flag floatx80_lt( floatx80 a, floatx80 b )
3285 {
3286     flag aSign, bSign;
3287
3288     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3289               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3290          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3291               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3292        ) {
3293         float_raise( float_flag_invalid );
3294         return 0;
3295     }
3296     aSign = extractFloatx80Sign( a );
3297     bSign = extractFloatx80Sign( b );
3298     if ( aSign != bSign ) {
3299         return
3300                aSign
3301             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3302                  != 0 );
3303     }
3304     return
3305           aSign ? lt128( b.high, b.low, a.high, a.low )
3306         : lt128( a.high, a.low, b.high, b.low );
3307
3308 }
3309
3310 /*
3311 -------------------------------------------------------------------------------
3312 Returns 1 if the extended double-precision floating-point value `a' is equal
3313 to the corresponding value `b', and 0 otherwise.  The invalid exception is
3314 raised if either operand is a NaN.  Otherwise, the comparison is performed
3315 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3316 -------------------------------------------------------------------------------
3317 */
3318 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3319 {
3320
3321     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3322               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3323          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3324               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3325        ) {
3326         float_raise( float_flag_invalid );
3327         return 0;
3328     }
3329     return
3330            ( a.low == b.low )
3331         && (    ( a.high == b.high )
3332              || (    ( a.low == 0 )
3333                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3334            );
3335
3336 }
3337
3338 /*
3339 -------------------------------------------------------------------------------
3340 Returns 1 if the extended double-precision floating-point value `a' is less
3341 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3342 do not cause an exception.  Otherwise, the comparison is performed according
3343 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3344 -------------------------------------------------------------------------------
3345 */
3346 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3347 {
3348     flag aSign, bSign;
3349
3350     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3351               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3352          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3353               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3354        ) {
3355         /* Do nothing, even if NaN as we're quiet */
3356         return 0;
3357     }
3358     aSign = extractFloatx80Sign( a );
3359     bSign = extractFloatx80Sign( b );
3360     if ( aSign != bSign ) {
3361         return
3362                aSign
3363             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3364                  == 0 );
3365     }
3366     return
3367           aSign ? le128( b.high, b.low, a.high, a.low )
3368         : le128( a.high, a.low, b.high, b.low );
3369
3370 }
3371
3372 /*
3373 -------------------------------------------------------------------------------
3374 Returns 1 if the extended double-precision floating-point value `a' is less
3375 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
3376 an exception.  Otherwise, the comparison is performed according to the
3377 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3378 -------------------------------------------------------------------------------
3379 */
3380 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3381 {
3382     flag aSign, bSign;
3383
3384     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3385               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3386          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3387               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3388        ) {
3389         /* Do nothing, even if NaN as we're quiet */
3390         return 0;
3391     }
3392     aSign = extractFloatx80Sign( a );
3393     bSign = extractFloatx80Sign( b );
3394     if ( aSign != bSign ) {
3395         return
3396                aSign
3397             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3398                  != 0 );
3399     }
3400     return
3401           aSign ? lt128( b.high, b.low, a.high, a.low )
3402         : lt128( a.high, a.low, b.high, b.low );
3403
3404 }
3405
3406 #endif