fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86
  87 #include "fpu/softfloat.h"
  88
  89 /* We only need stdlib for abort() */
  90
  91 /*----------------------------------------------------------------------------
  92 | Primitive arithmetic functions, including multi-word arithmetic, and
  93 | division and square root approximations.  (Can be specialized to target if
  94 | desired.)
  95 *----------------------------------------------------------------------------*/
  96 #include "softfloat-macros.h"
  97
  98 /*----------------------------------------------------------------------------
  99 | Functions and definitions to determine:  (1) whether tininess for underflow
 100 | is detected before or after rounding by default, (2) what (if anything)
 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 103 | are propagated from function inputs to output.  These details are target-
 104 | specific.
 105 *----------------------------------------------------------------------------*/
 106 #include "softfloat-specialize.h"
 107
 108 /*----------------------------------------------------------------------------
 109 | Returns the fraction bits of the half-precision floating-point value `a'.
 110 *----------------------------------------------------------------------------*/
 111
 112 static inline uint32_t extractFloat16Frac(float16 a)
 113 {
 114     return float16_val(a) & 0x3ff;
 115 }
 116
 117 /*----------------------------------------------------------------------------
 118 | Returns the exponent bits of the half-precision floating-point value `a'.
 119 *----------------------------------------------------------------------------*/
 120
 121 static inline int extractFloat16Exp(float16 a)
 122 {
 123     return (float16_val(a) >> 10) & 0x1f;
 124 }
 125
 126 /*----------------------------------------------------------------------------
  127 | Returns the sign bit of the half-precision floating-point value `a'.
 128 *----------------------------------------------------------------------------*/
 129
 130 static inline flag extractFloat16Sign(float16 a)
 131 {
 132     return float16_val(a)>>15;
 133 }
 134
 135 /*----------------------------------------------------------------------------
 136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 137 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 138 | input.  If `zSign' is 1, the input is negated before being converted to an
 139 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 140 | is simply rounded to an integer, with the inexact exception raised if the
 141 | input cannot be represented exactly as an integer.  However, if the fixed-
 142 | point input is too large, the invalid exception is raised and the largest
 143 | positive or negative integer is returned.
 144 *----------------------------------------------------------------------------*/
 145
 146 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
 147 {
 148     int8_t roundingMode;
 149     flag roundNearestEven;
 150     int8_t roundIncrement, roundBits;
 151     int32_t z;
 152
 153     roundingMode = status->float_rounding_mode;
 154     roundNearestEven = ( roundingMode == float_round_nearest_even );
 155     switch (roundingMode) {
 156     case float_round_nearest_even:
 157     case float_round_ties_away:
 158         roundIncrement = 0x40;
 159         break;
 160     case float_round_to_zero:
 161         roundIncrement = 0;
 162         break;
 163     case float_round_up:
 164         roundIncrement = zSign ? 0 : 0x7f;
 165         break;
 166     case float_round_down:
 167         roundIncrement = zSign ? 0x7f : 0;
 168         break;
 169     default:
 170         abort();
 171     }
 172     roundBits = absZ & 0x7F;
 173     absZ = ( absZ + roundIncrement )>>7;
 174     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 175     z = absZ;
 176     if ( zSign ) z = - z;
 177     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 178         float_raise(float_flag_invalid, status);
 179         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 180     }
 181     if (roundBits) {
 182         status->float_exception_flags |= float_flag_inexact;
 183     }
 184     return z;
 185
 186 }
 187
 188 /*----------------------------------------------------------------------------
 189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 190 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 191 | and returns the properly rounded 64-bit integer corresponding to the input.
 192 | If `zSign' is 1, the input is negated before being converted to an integer.
 193 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 194 | the inexact exception raised if the input cannot be represented exactly as
 195 | an integer.  However, if the fixed-point input is too large, the invalid
 196 | exception is raised and the largest positive or negative integer is
 197 | returned.
 198 *----------------------------------------------------------------------------*/
 199
 200 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
 201                                float_status *status)
 202 {
 203     int8_t roundingMode;
 204     flag roundNearestEven, increment;
 205     int64_t z;
 206
 207     roundingMode = status->float_rounding_mode;
 208     roundNearestEven = ( roundingMode == float_round_nearest_even );
 209     switch (roundingMode) {
 210     case float_round_nearest_even:
 211     case float_round_ties_away:
 212         increment = ((int64_t) absZ1 < 0);
 213         break;
 214     case float_round_to_zero:
 215         increment = 0;
 216         break;
 217     case float_round_up:
 218         increment = !zSign && absZ1;
 219         break;
 220     case float_round_down:
 221         increment = zSign && absZ1;
 222         break;
 223     default:
 224         abort();
 225     }
 226     if ( increment ) {
 227         ++absZ0;
 228         if ( absZ0 == 0 ) goto overflow;
 229         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 230     }
 231     z = absZ0;
 232     if ( zSign ) z = - z;
 233     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 234  overflow:
 235         float_raise(float_flag_invalid, status);
 236         return
 237               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 238             : LIT64( 0x7FFFFFFFFFFFFFFF );
 239     }
 240     if (absZ1) {
 241         status->float_exception_flags |= float_flag_inexact;
 242     }
 243     return z;
 244
 245 }
 246
 247 /*----------------------------------------------------------------------------
 248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 249 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 250 | and returns the properly rounded 64-bit unsigned integer corresponding to the
 251 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
 252 | with the inexact exception raised if the input cannot be represented exactly
 253 | as an integer.  However, if the fixed-point input is too large, the invalid
 254 | exception is raised and the largest unsigned integer is returned.
 255 *----------------------------------------------------------------------------*/
 256
 257 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
 258                                 uint64_t absZ1, float_status *status)
 259 {
 260     int8_t roundingMode;
 261     flag roundNearestEven, increment;
 262
 263     roundingMode = status->float_rounding_mode;
 264     roundNearestEven = (roundingMode == float_round_nearest_even);
 265     switch (roundingMode) {
 266     case float_round_nearest_even:
 267     case float_round_ties_away:
 268         increment = ((int64_t)absZ1 < 0);
 269         break;
 270     case float_round_to_zero:
 271         increment = 0;
 272         break;
 273     case float_round_up:
 274         increment = !zSign && absZ1;
 275         break;
 276     case float_round_down:
 277         increment = zSign && absZ1;
 278         break;
 279     default:
 280         abort();
 281     }
 282     if (increment) {
 283         ++absZ0;
 284         if (absZ0 == 0) {
 285             float_raise(float_flag_invalid, status);
 286             return LIT64(0xFFFFFFFFFFFFFFFF);
 287         }
 288         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
 289     }
 290
 291     if (zSign && absZ0) {
 292         float_raise(float_flag_invalid, status);
 293         return 0;
 294     }
 295
 296     if (absZ1) {
 297         status->float_exception_flags |= float_flag_inexact;
 298     }
 299     return absZ0;
 300 }
 301
 302 /*----------------------------------------------------------------------------
 303 | Returns the fraction bits of the single-precision floating-point value `a'.
 304 *----------------------------------------------------------------------------*/
 305
 306 static inline uint32_t extractFloat32Frac( float32 a )
 307 {
 308
 309     return float32_val(a) & 0x007FFFFF;
 310
 311 }
 312
 313 /*----------------------------------------------------------------------------
 314 | Returns the exponent bits of the single-precision floating-point value `a'.
 315 *----------------------------------------------------------------------------*/
 316
 317 static inline int extractFloat32Exp(float32 a)
 318 {
 319
 320     return ( float32_val(a)>>23 ) & 0xFF;
 321
 322 }
 323
 324 /*----------------------------------------------------------------------------
 325 | Returns the sign bit of the single-precision floating-point value `a'.
 326 *----------------------------------------------------------------------------*/
 327
 328 static inline flag extractFloat32Sign( float32 a )
 329 {
 330
 331     return float32_val(a)>>31;
 332
 333 }
 334
 335 /*----------------------------------------------------------------------------
 336 | If `a' is denormal and we are in flush-to-zero mode then set the
 337 | input-denormal exception and return zero. Otherwise just return the value.
 338 *----------------------------------------------------------------------------*/
 339 float32 float32_squash_input_denormal(float32 a, float_status *status)
 340 {
 341     if (status->flush_inputs_to_zero) {
 342         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 343             float_raise(float_flag_input_denormal, status);
 344             return make_float32(float32_val(a) & 0x80000000);
 345         }
 346     }
 347     return a;
 348 }
 349
 350 /*----------------------------------------------------------------------------
 351 | Normalizes the subnormal single-precision floating-point value represented
 352 | by the denormalized significand `aSig'.  The normalized exponent and
 353 | significand are stored at the locations pointed to by `zExpPtr' and
 354 | `zSigPtr', respectively.
 355 *----------------------------------------------------------------------------*/
 356
 357 static void
 358  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
 359 {
 360     int8_t shiftCount;
 361
 362     shiftCount = countLeadingZeros32( aSig ) - 8;
 363     *zSigPtr = aSig<<shiftCount;
 364     *zExpPtr = 1 - shiftCount;
 365
 366 }
 367
 368 /*----------------------------------------------------------------------------
 369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 370 | single-precision floating-point value, returning the result.  After being
 371 | shifted into the proper positions, the three fields are simply added
 372 | together to form the result.  This means that any integer portion of `zSig'
 373 | will be added into the exponent.  Since a properly normalized significand
 374 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 375 | than the desired result exponent whenever `zSig' is a complete, normalized
 376 | significand.
 377 *----------------------------------------------------------------------------*/
 378
 379 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
 380 {
 381
 382     return make_float32(
 383           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 384
 385 }
 386
 387 /*----------------------------------------------------------------------------
 388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 389 | and significand `zSig', and returns the proper single-precision floating-
 390 | point value corresponding to the abstract input.  Ordinarily, the abstract
 391 | value is simply rounded and packed into the single-precision format, with
 392 | the inexact exception raised if the abstract input cannot be represented
 393 | exactly.  However, if the abstract value is too large, the overflow and
 394 | inexact exceptions are raised and an infinity or maximal finite value is
 395 | returned.  If the abstract value is too small, the input value is rounded to
 396 | a subnormal number, and the underflow and inexact exceptions are raised if
 397 | the abstract input cannot be represented exactly as a subnormal single-
 398 | precision floating-point number.
 399 |     The input significand `zSig' has its binary point between bits 30
 400 | and 29, which is 7 bits to the left of the usual location.  This shifted
 401 | significand must be normalized or smaller.  If `zSig' is not normalized,
 402 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 403 | and it must not require rounding.  In the usual case that `zSig' is
 404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 405 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 406 | Binary Floating-Point Arithmetic.
 407 *----------------------------------------------------------------------------*/
 408
 409 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
 410                                    float_status *status)
 411 {
 412     int8_t roundingMode;
 413     flag roundNearestEven;
 414     int8_t roundIncrement, roundBits;
 415     flag isTiny;
 416
 417     roundingMode = status->float_rounding_mode;
 418     roundNearestEven = ( roundingMode == float_round_nearest_even );
 419     switch (roundingMode) {
 420     case float_round_nearest_even:
 421     case float_round_ties_away:
 422         roundIncrement = 0x40;
 423         break;
 424     case float_round_to_zero:
 425         roundIncrement = 0;
 426         break;
 427     case float_round_up:
 428         roundIncrement = zSign ? 0 : 0x7f;
 429         break;
 430     case float_round_down:
 431         roundIncrement = zSign ? 0x7f : 0;
 432         break;
 433     default:
 434         abort();
 435         break;
 436     }
 437     roundBits = zSig & 0x7F;
 438     if ( 0xFD <= (uint16_t) zExp ) {
 439         if (    ( 0xFD < zExp )
 440              || (    ( zExp == 0xFD )
 441                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 442            ) {
 443             float_raise(float_flag_overflow | float_flag_inexact, status);
 444             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 445         }
 446         if ( zExp < 0 ) {
 447             if (status->flush_to_zero) {
 448                 float_raise(float_flag_output_denormal, status);
 449                 return packFloat32(zSign, 0, 0);
 450             }
 451             isTiny =
 452                 (status->float_detect_tininess
 453                  == float_tininess_before_rounding)
 454                 || ( zExp < -1 )
 455                 || ( zSig + roundIncrement < 0x80000000 );
 456             shift32RightJamming( zSig, - zExp, &zSig );
 457             zExp = 0;
 458             roundBits = zSig & 0x7F;
 459             if (isTiny && roundBits) {
 460                 float_raise(float_flag_underflow, status);
 461             }
 462         }
 463     }
 464     if (roundBits) {
 465         status->float_exception_flags |= float_flag_inexact;
 466     }
 467     zSig = ( zSig + roundIncrement )>>7;
 468     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 469     if ( zSig == 0 ) zExp = 0;
 470     return packFloat32( zSign, zExp, zSig );
 471
 472 }
 473
 474 /*----------------------------------------------------------------------------
 475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 476 | and significand `zSig', and returns the proper single-precision floating-
 477 | point value corresponding to the abstract input.  This routine is just like
 478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 480 | floating-point exponent.
 481 *----------------------------------------------------------------------------*/
 482
 483 static float32
 484  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
 485                               float_status *status)
 486 {
 487     int8_t shiftCount;
 488
 489     shiftCount = countLeadingZeros32( zSig ) - 1;
 490     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
 491                                status);
 492
 493 }
 494
 495 /*----------------------------------------------------------------------------
 496 | Returns the fraction bits of the double-precision floating-point value `a'.
 497 *----------------------------------------------------------------------------*/
 498
 499 static inline uint64_t extractFloat64Frac( float64 a )
 500 {
 501
 502     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 503
 504 }
 505
 506 /*----------------------------------------------------------------------------
 507 | Returns the exponent bits of the double-precision floating-point value `a'.
 508 *----------------------------------------------------------------------------*/
 509
 510 static inline int extractFloat64Exp(float64 a)
 511 {
 512
 513     return ( float64_val(a)>>52 ) & 0x7FF;
 514
 515 }
 516
 517 /*----------------------------------------------------------------------------
 518 | Returns the sign bit of the double-precision floating-point value `a'.
 519 *----------------------------------------------------------------------------*/
 520
 521 static inline flag extractFloat64Sign( float64 a )
 522 {
 523
 524     return float64_val(a)>>63;
 525
 526 }
 527
 528 /*----------------------------------------------------------------------------
 529 | If `a' is denormal and we are in flush-to-zero mode then set the
 530 | input-denormal exception and return zero. Otherwise just return the value.
 531 *----------------------------------------------------------------------------*/
 532 float64 float64_squash_input_denormal(float64 a, float_status *status)
 533 {
 534     if (status->flush_inputs_to_zero) {
 535         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 536             float_raise(float_flag_input_denormal, status);
 537             return make_float64(float64_val(a) & (1ULL << 63));
 538         }
 539     }
 540     return a;
 541 }
 542
 543 /*----------------------------------------------------------------------------
 544 | Normalizes the subnormal double-precision floating-point value represented
 545 | by the denormalized significand `aSig'.  The normalized exponent and
 546 | significand are stored at the locations pointed to by `zExpPtr' and
 547 | `zSigPtr', respectively.
 548 *----------------------------------------------------------------------------*/
 549
 550 static void
 551  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
 552 {
 553     int8_t shiftCount;
 554
 555     shiftCount = countLeadingZeros64( aSig ) - 11;
 556     *zSigPtr = aSig<<shiftCount;
 557     *zExpPtr = 1 - shiftCount;
 558
 559 }
 560
 561 /*----------------------------------------------------------------------------
 562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 563 | double-precision floating-point value, returning the result.  After being
 564 | shifted into the proper positions, the three fields are simply added
 565 | together to form the result.  This means that any integer portion of `zSig'
 566 | will be added into the exponent.  Since a properly normalized significand
 567 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 568 | than the desired result exponent whenever `zSig' is a complete, normalized
 569 | significand.
 570 *----------------------------------------------------------------------------*/
 571
 572 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
 573 {
 574
 575     return make_float64(
 576         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 577
 578 }
 579
 580 /*----------------------------------------------------------------------------
 581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 582 | and significand `zSig', and returns the proper double-precision floating-
 583 | point value corresponding to the abstract input.  Ordinarily, the abstract
 584 | value is simply rounded and packed into the double-precision format, with
 585 | the inexact exception raised if the abstract input cannot be represented
 586 | exactly.  However, if the abstract value is too large, the overflow and
 587 | inexact exceptions are raised and an infinity or maximal finite value is
 588 | returned.  If the abstract value is too small, the input value is rounded to
 589 | a subnormal number, and the underflow and inexact exceptions are raised if
 590 | the abstract input cannot be represented exactly as a subnormal double-
 591 | precision floating-point number.
 592 |     The input significand `zSig' has its binary point between bits 62
 593 | and 61, which is 10 bits to the left of the usual location.  This shifted
 594 | significand must be normalized or smaller.  If `zSig' is not normalized,
 595 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 596 | and it must not require rounding.  In the usual case that `zSig' is
 597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 598 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 599 | Binary Floating-Point Arithmetic.
 600 *----------------------------------------------------------------------------*/
 601
 602 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
 603                                    float_status *status)
 604 {
 605     int8_t roundingMode;
 606     flag roundNearestEven;
 607     int roundIncrement, roundBits;
 608     flag isTiny;
 609
 610     roundingMode = status->float_rounding_mode;
 611     roundNearestEven = ( roundingMode == float_round_nearest_even );
 612     switch (roundingMode) {
 613     case float_round_nearest_even:
 614     case float_round_ties_away:
 615         roundIncrement = 0x200;
 616         break;
 617     case float_round_to_zero:
 618         roundIncrement = 0;
 619         break;
 620     case float_round_up:
 621         roundIncrement = zSign ? 0 : 0x3ff;
 622         break;
 623     case float_round_down:
 624         roundIncrement = zSign ? 0x3ff : 0;
 625         break;
 626     default:
 627         abort();
 628     }
 629     roundBits = zSig & 0x3FF;
 630     if ( 0x7FD <= (uint16_t) zExp ) {
 631         if (    ( 0x7FD < zExp )
 632              || (    ( zExp == 0x7FD )
 633                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 634            ) {
 635             float_raise(float_flag_overflow | float_flag_inexact, status);
 636             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 637         }
 638         if ( zExp < 0 ) {
 639             if (status->flush_to_zero) {
 640                 float_raise(float_flag_output_denormal, status);
 641                 return packFloat64(zSign, 0, 0);
 642             }
 643             isTiny =
 644                    (status->float_detect_tininess
 645                     == float_tininess_before_rounding)
 646                 || ( zExp < -1 )
 647                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 648             shift64RightJamming( zSig, - zExp, &zSig );
 649             zExp = 0;
 650             roundBits = zSig & 0x3FF;
 651             if (isTiny && roundBits) {
 652                 float_raise(float_flag_underflow, status);
 653             }
 654         }
 655     }
 656     if (roundBits) {
 657         status->float_exception_flags |= float_flag_inexact;
 658     }
 659     zSig = ( zSig + roundIncrement )>>10;
 660     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 661     if ( zSig == 0 ) zExp = 0;
 662     return packFloat64( zSign, zExp, zSig );
 663
 664 }
 665
 666 /*----------------------------------------------------------------------------
 667 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 668 | and significand `zSig', and returns the proper double-precision floating-
 669 | point value corresponding to the abstract input.  This routine is just like
 670 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 671 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 672 | floating-point exponent.
 673 *----------------------------------------------------------------------------*/
 674
 675 static float64
 676  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
 677                               float_status *status)
 678 {
 679     int8_t shiftCount;
 680
 681     shiftCount = countLeadingZeros64( zSig ) - 1;
 682     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
 683                                status);
 684
 685 }
 686
 687 /*----------------------------------------------------------------------------
 688 | Returns the fraction bits of the extended double-precision floating-point
 689 | value `a'.
 690 *----------------------------------------------------------------------------*/
 691
 692 static inline uint64_t extractFloatx80Frac( floatx80 a )
 693 {
 694
 695     return a.low;
 696
 697 }
 698
 699 /*----------------------------------------------------------------------------
 700 | Returns the exponent bits of the extended double-precision floating-point
 701 | value `a'.
 702 *----------------------------------------------------------------------------*/
 703
 704 static inline int32_t extractFloatx80Exp( floatx80 a )
 705 {
 706
 707     return a.high & 0x7FFF;
 708
 709 }
 710
 711 /*----------------------------------------------------------------------------
 712 | Returns the sign bit of the extended double-precision floating-point value
 713 | `a'.
 714 *----------------------------------------------------------------------------*/
 715
 716 static inline flag extractFloatx80Sign( floatx80 a )
 717 {
 718
 719     return a.high>>15;
 720
 721 }
 722
 723 /*----------------------------------------------------------------------------
 724 | Normalizes the subnormal extended double-precision floating-point value
 725 | represented by the denormalized significand `aSig'.  The normalized exponent
 726 | and significand are stored at the locations pointed to by `zExpPtr' and
 727 | `zSigPtr', respectively.
 728 *----------------------------------------------------------------------------*/
 729
 730 static void
 731  normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
 732 {
 733     int8_t shiftCount;
 734
 735     shiftCount = countLeadingZeros64( aSig );
 736     *zSigPtr = aSig<<shiftCount;
 737     *zExpPtr = 1 - shiftCount;
 738
 739 }
 740
 741 /*----------------------------------------------------------------------------
 742 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 743 | extended double-precision floating-point value, returning the result.
 744 *----------------------------------------------------------------------------*/
 745
 746 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
 747 {
 748     floatx80 z;
 749
 750     z.low = zSig;
 751     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 752     return z;
 753
 754 }
 755
 756 /*----------------------------------------------------------------------------
 757 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 758 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 759 | and returns the proper extended double-precision floating-point value
 760 | corresponding to the abstract input.  Ordinarily, the abstract value is
 761 | rounded and packed into the extended double-precision format, with the
 762 | inexact exception raised if the abstract input cannot be represented
 763 | exactly.  However, if the abstract value is too large, the overflow and
 764 | inexact exceptions are raised and an infinity or maximal finite value is
 765 | returned.  If the abstract value is too small, the input value is rounded to
 766 | a subnormal number, and the underflow and inexact exceptions are raised if
 767 | the abstract input cannot be represented exactly as a subnormal extended
 768 | double-precision floating-point number.
 769 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 770 | number of bits as single or double precision, respectively.  Otherwise, the
 771 | result is rounded to the full precision of the extended double-precision
 772 | format.
 773 |     The input significand must be normalized or smaller.  If the input
 774 | significand is not normalized, `zExp' must be 0; in that case, the result
 775 | returned is a subnormal number, and it must not require rounding.  The
 776 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 777 | Floating-Point Arithmetic.
 778 *----------------------------------------------------------------------------*/
 779
 780 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
 781                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
 782                                      float_status *status)
 783 {
 784     int8_t roundingMode;
 785     flag roundNearestEven, increment, isTiny;
 786     int64_t roundIncrement, roundMask, roundBits;
 787
 788     roundingMode = status->float_rounding_mode;
 789     roundNearestEven = ( roundingMode == float_round_nearest_even );
 790     if ( roundingPrecision == 80 ) goto precision80;
 791     if ( roundingPrecision == 64 ) {
 792         roundIncrement = LIT64( 0x0000000000000400 );
 793         roundMask = LIT64( 0x00000000000007FF );
 794     }
 795     else if ( roundingPrecision == 32 ) {
 796         roundIncrement = LIT64( 0x0000008000000000 );
 797         roundMask = LIT64( 0x000000FFFFFFFFFF );
 798     }
 799     else {
 800         goto precision80;
 801     }
 802     zSig0 |= ( zSig1 != 0 );
 803     switch (roundingMode) {
 804     case float_round_nearest_even:
 805     case float_round_ties_away:
 806         break;
 807     case float_round_to_zero:
 808         roundIncrement = 0;
 809         break;
 810     case float_round_up:
 811         roundIncrement = zSign ? 0 : roundMask;
 812         break;
 813     case float_round_down:
 814         roundIncrement = zSign ? roundMask : 0;
 815         break;
 816     default:
 817         abort();
 818     }
 819     roundBits = zSig0 & roundMask;
 820     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 821         if (    ( 0x7FFE < zExp )
 822              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 823            ) {
 824             goto overflow;
 825         }
 826         if ( zExp <= 0 ) {
 827             if (status->flush_to_zero) {
 828                 float_raise(float_flag_output_denormal, status);
 829                 return packFloatx80(zSign, 0, 0);
 830             }
 831             isTiny =
 832                    (status->float_detect_tininess
 833                     == float_tininess_before_rounding)
 834                 || ( zExp < 0 )
 835                 || ( zSig0 <= zSig0 + roundIncrement );
 836             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 837             zExp = 0;
 838             roundBits = zSig0 & roundMask;
 839             if (isTiny && roundBits) {
 840                 float_raise(float_flag_underflow, status);
 841             }
 842             if (roundBits) {
 843                 status->float_exception_flags |= float_flag_inexact;
 844             }
 845             zSig0 += roundIncrement;
 846             if ( (int64_t) zSig0 < 0 ) zExp = 1;
 847             roundIncrement = roundMask + 1;
 848             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 849                 roundMask |= roundIncrement;
 850             }
 851             zSig0 &= ~ roundMask;
 852             return packFloatx80( zSign, zExp, zSig0 );
 853         }
 854     }
 855     if (roundBits) {
 856         status->float_exception_flags |= float_flag_inexact;
 857     }
 858     zSig0 += roundIncrement;
 859     if ( zSig0 < roundIncrement ) {
 860         ++zExp;
 861         zSig0 = LIT64( 0x8000000000000000 );
 862     }
 863     roundIncrement = roundMask + 1;
 864     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 865         roundMask |= roundIncrement;
 866     }
 867     zSig0 &= ~ roundMask;
 868     if ( zSig0 == 0 ) zExp = 0;
 869     return packFloatx80( zSign, zExp, zSig0 );
 870  precision80:
 871     switch (roundingMode) {
 872     case float_round_nearest_even:
 873     case float_round_ties_away:
 874         increment = ((int64_t)zSig1 < 0);
 875         break;
 876     case float_round_to_zero:
 877         increment = 0;
 878         break;
 879     case float_round_up:
 880         increment = !zSign && zSig1;
 881         break;
 882     case float_round_down:
 883         increment = zSign && zSig1;
 884         break;
 885     default:
 886         abort();
 887     }
 888     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 889         if (    ( 0x7FFE < zExp )
 890              || (    ( zExp == 0x7FFE )
 891                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 892                   && increment
 893                 )
 894            ) {
 895             roundMask = 0;
 896  overflow:
 897             float_raise(float_flag_overflow | float_flag_inexact, status);
 898             if (    ( roundingMode == float_round_to_zero )
 899                  || ( zSign && ( roundingMode == float_round_up ) )
 900                  || ( ! zSign && ( roundingMode == float_round_down ) )
 901                ) {
 902                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 903             }
 904             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 905         }
 906         if ( zExp <= 0 ) {
 907             isTiny =
 908                    (status->float_detect_tininess
 909                     == float_tininess_before_rounding)
 910                 || ( zExp < 0 )
 911                 || ! increment
 912                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 913             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 914             zExp = 0;
 915             if (isTiny && zSig1) {
 916                 float_raise(float_flag_underflow, status);
 917             }
 918             if (zSig1) {
 919                 status->float_exception_flags |= float_flag_inexact;
 920             }
 921             switch (roundingMode) {
 922             case float_round_nearest_even:
 923             case float_round_ties_away:
 924                 increment = ((int64_t)zSig1 < 0);
 925                 break;
 926             case float_round_to_zero:
 927                 increment = 0;
 928                 break;
 929             case float_round_up:
 930                 increment = !zSign && zSig1;
 931                 break;
 932             case float_round_down:
 933                 increment = zSign && zSig1;
 934                 break;
 935             default:
 936                 abort();
 937             }
 938             if ( increment ) {
 939                 ++zSig0;
 940                 zSig0 &=
 941                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 942                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
 943             }
 944             return packFloatx80( zSign, zExp, zSig0 );
 945         }
 946     }
 947     if (zSig1) {
 948         status->float_exception_flags |= float_flag_inexact;
 949     }
 950     if ( increment ) {
 951         ++zSig0;
 952         if ( zSig0 == 0 ) {
 953             ++zExp;
 954             zSig0 = LIT64( 0x8000000000000000 );
 955         }
 956         else {
 957             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 958         }
 959     }
 960     else {
 961         if ( zSig0 == 0 ) zExp = 0;
 962     }
 963     return packFloatx80( zSign, zExp, zSig0 );
 964
 965 }
 966
 967 /*----------------------------------------------------------------------------
 968 | Takes an abstract floating-point value having sign `zSign', exponent
 969 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 970 | and returns the proper extended double-precision floating-point value
 971 | corresponding to the abstract input.  This routine is just like
 972 | `roundAndPackFloatx80' except that the input significand does not have to be
 973 | normalized.
 974 *----------------------------------------------------------------------------*/
 975
 976 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
 977                                               flag zSign, int32_t zExp,
 978                                               uint64_t zSig0, uint64_t zSig1,
 979                                               float_status *status)
 980 {
 981     int8_t shiftCount;
 982
 983     if ( zSig0 == 0 ) {
 984         zSig0 = zSig1;
 985         zSig1 = 0;
 986         zExp -= 64;
 987     }
 988     shiftCount = countLeadingZeros64( zSig0 );
 989     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 990     zExp -= shiftCount;
 991     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
 992                                 zSig0, zSig1, status);
 993
 994 }
 995
 996 /*----------------------------------------------------------------------------
 997 | Returns the least-significant 64 fraction bits of the quadruple-precision
 998 | floating-point value `a'.
 999 *----------------------------------------------------------------------------*/
1000
1001 static inline uint64_t extractFloat128Frac1( float128 a )
1002 {
1003
1004     return a.low;
1005
1006 }
1007
1008 /*----------------------------------------------------------------------------
1009 | Returns the most-significant 48 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1012
1013 static inline uint64_t extractFloat128Frac0( float128 a )
1014 {
1015
1016     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1017
1018 }
1019
1020 /*----------------------------------------------------------------------------
1021 | Returns the exponent bits of the quadruple-precision floating-point value
1022 | `a'.
1023 *----------------------------------------------------------------------------*/
1024
1025 static inline int32_t extractFloat128Exp( float128 a )
1026 {
1027
1028     return ( a.high>>48 ) & 0x7FFF;
1029
1030 }
1031
1032 /*----------------------------------------------------------------------------
1033 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1034 *----------------------------------------------------------------------------*/
1035
1036 static inline flag extractFloat128Sign( float128 a )
1037 {
1038
1039     return a.high>>63;
1040
1041 }
1042
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision significand formed by the
| concatenation of `aSig0' and `aSig1', moving its most significant set bit
| to bit 48 of the high word.  The high 49 bits of the normalized significand
| go to `*zSig0Ptr', the low 64 bits to `*zSig1Ptr', and the exponent that
| compensates for the shift to `*zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading bit already in the high word: a short left shift suffices. */
        shift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* Only the low word is populated; `shift' is relative to a 64-bit move. */
    shift = countLeadingZeros64(aSig1) - 15;
    *zExpPtr = -shift - 63;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* The top bits of aSig1 land in the high word, the rest stay low. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
}
1083
1084 /*----------------------------------------------------------------------------
1085 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1086 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1087 | floating-point value, returning the result.  After being shifted into the
1088 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1089 | added together to form the most significant 32 bits of the result.  This
1090 | means that any integer portion of `zSig0' will be added into the exponent.
1091 | Since a properly normalized significand will have an integer portion equal
1092 | to 1, the `zExp' input should be 1 less than the desired result exponent
1093 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1094 | significand.
1095 *----------------------------------------------------------------------------*/
1096
1097 static inline float128
1098  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1099 {
1100     float128 z;
1101
1102     z.low = zSig1;
1103     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1104     return z;
1105
1106 }
1107
1108 /*----------------------------------------------------------------------------
1109 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1110 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1111 | and `zSig2', and returns the proper quadruple-precision floating-point value
1112 | corresponding to the abstract input.  Ordinarily, the abstract value is
1113 | simply rounded and packed into the quadruple-precision format, with the
1114 | inexact exception raised if the abstract input cannot be represented
1115 | exactly.  However, if the abstract value is too large, the overflow and
1116 | inexact exceptions are raised and an infinity or maximal finite value is
1117 | returned.  If the abstract value is too small, the input value is rounded to
1118 | a subnormal number, and the underflow and inexact exceptions are raised if
1119 | the abstract input cannot be represented exactly as a subnormal quadruple-
1120 | precision floating-point number.
1121 |     The input significand must be normalized or smaller.  If the input
1122 | significand is not normalized, `zExp' must be 0; in that case, the result
1123 | returned is a subnormal number, and it must not require rounding.  In the
1124 | usual case that the input significand is normalized, `zExp' must be 1 less
1125 | than the ``true'' floating-point exponent.  The handling of underflow and
1126 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1128
1129 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1130                                      uint64_t zSig0, uint64_t zSig1,
1131                                      uint64_t zSig2, float_status *status)
1132 {
1133     int8_t roundingMode;
1134     flag roundNearestEven, increment, isTiny;
1135
1136     roundingMode = status->float_rounding_mode;
1137     roundNearestEven = ( roundingMode == float_round_nearest_even );
1138     switch (roundingMode) {
1139     case float_round_nearest_even:
1140     case float_round_ties_away:
1141         increment = ((int64_t)zSig2 < 0);
1142         break;
1143     case float_round_to_zero:
1144         increment = 0;
1145         break;
1146     case float_round_up:
1147         increment = !zSign && zSig2;
1148         break;
1149     case float_round_down:
1150         increment = zSign && zSig2;
1151         break;
1152     default:
1153         abort();
1154     }
1155     if ( 0x7FFD <= (uint32_t) zExp ) {
1156         if (    ( 0x7FFD < zExp )
1157              || (    ( zExp == 0x7FFD )
1158                   && eq128(
1159                          LIT64( 0x0001FFFFFFFFFFFF ),
1160                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1161                          zSig0,
1162                          zSig1
1163                      )
1164                   && increment
1165                 )
1166            ) {
1167             float_raise(float_flag_overflow | float_flag_inexact, status);
1168             if (    ( roundingMode == float_round_to_zero )
1169                  || ( zSign && ( roundingMode == float_round_up ) )
1170                  || ( ! zSign && ( roundingMode == float_round_down ) )
1171                ) {
1172                 return
1173                     packFloat128(
1174                         zSign,
1175                         0x7FFE,
1176                         LIT64( 0x0000FFFFFFFFFFFF ),
1177                         LIT64( 0xFFFFFFFFFFFFFFFF )
1178                     );
1179             }
1180             return packFloat128( zSign, 0x7FFF, 0, 0 );
1181         }
1182         if ( zExp < 0 ) {
1183             if (status->flush_to_zero) {
1184                 float_raise(float_flag_output_denormal, status);
1185                 return packFloat128(zSign, 0, 0, 0);
1186             }
1187             isTiny =
1188                    (status->float_detect_tininess
1189                     == float_tininess_before_rounding)
1190                 || ( zExp < -1 )
1191                 || ! increment
1192                 || lt128(
1193                        zSig0,
1194                        zSig1,
1195                        LIT64( 0x0001FFFFFFFFFFFF ),
1196                        LIT64( 0xFFFFFFFFFFFFFFFF )
1197                    );
1198             shift128ExtraRightJamming(
1199                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1200             zExp = 0;
1201             if (isTiny && zSig2) {
1202                 float_raise(float_flag_underflow, status);
1203             }
1204             switch (roundingMode) {
1205             case float_round_nearest_even:
1206             case float_round_ties_away:
1207                 increment = ((int64_t)zSig2 < 0);
1208                 break;
1209             case float_round_to_zero:
1210                 increment = 0;
1211                 break;
1212             case float_round_up:
1213                 increment = !zSign && zSig2;
1214                 break;
1215             case float_round_down:
1216                 increment = zSign && zSig2;
1217                 break;
1218             default:
1219                 abort();
1220             }
1221         }
1222     }
1223     if (zSig2) {
1224         status->float_exception_flags |= float_flag_inexact;
1225     }
1226     if ( increment ) {
1227         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1228         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1229     }
1230     else {
1231         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1232     }
1233     return packFloat128( zSign, zExp, zSig0, zSig1 );
1234
1235 }
1236
1237 /*----------------------------------------------------------------------------
1238 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1239 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1240 | returns the proper quadruple-precision floating-point value corresponding
1241 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1242 | except that the input significand has fewer bits and does not have to be
1243 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1244 | point exponent.
1245 *----------------------------------------------------------------------------*/
1246
1247 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1248                                               uint64_t zSig0, uint64_t zSig1,
1249                                               float_status *status)
1250 {
1251     int8_t shiftCount;
1252     uint64_t zSig2;
1253
1254     if ( zSig0 == 0 ) {
1255         zSig0 = zSig1;
1256         zSig1 = 0;
1257         zExp -= 64;
1258     }
1259     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1260     if ( 0 <= shiftCount ) {
1261         zSig2 = 0;
1262         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1263     }
1264     else {
1265         shift128ExtraRightJamming(
1266             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1267     }
1268     zExp -= shiftCount;
1269     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1270
1271 }
1272
1273 /*----------------------------------------------------------------------------
1274 | Returns the result of converting the 32-bit two's complement integer `a'
1275 | to the single-precision floating-point format.  The conversion is performed
1276 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1277 *----------------------------------------------------------------------------*/
1278
1279 float32 int32_to_float32(int32_t a, float_status *status)
1280 {
1281     flag zSign;
1282
1283     if ( a == 0 ) return float32_zero;
1284     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1285     zSign = ( a < 0 );
1286     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1287 }
1288
1289 /*----------------------------------------------------------------------------
1290 | Returns the result of converting the 32-bit two's complement integer `a'
1291 | to the double-precision floating-point format.  The conversion is performed
1292 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1293 *----------------------------------------------------------------------------*/
1294
1295 float64 int32_to_float64(int32_t a, float_status *status)
1296 {
1297     flag zSign;
1298     uint32_t absA;
1299     int8_t shiftCount;
1300     uint64_t zSig;
1301
1302     if ( a == 0 ) return float64_zero;
1303     zSign = ( a < 0 );
1304     absA = zSign ? - a : a;
1305     shiftCount = countLeadingZeros32( absA ) + 21;
1306     zSig = absA;
1307     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1308
1309 }
1310
1311 /*----------------------------------------------------------------------------
1312 | Returns the result of converting the 32-bit two's complement integer `a'
1313 | to the extended double-precision floating-point format.  The conversion
1314 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1315 | Arithmetic.
1316 *----------------------------------------------------------------------------*/
1317
1318 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1319 {
1320     flag zSign;
1321     uint32_t absA;
1322     int8_t shiftCount;
1323     uint64_t zSig;
1324
1325     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1326     zSign = ( a < 0 );
1327     absA = zSign ? - a : a;
1328     shiftCount = countLeadingZeros32( absA ) + 32;
1329     zSig = absA;
1330     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1331
1332 }
1333
1334 /*----------------------------------------------------------------------------
1335 | Returns the result of converting the 32-bit two's complement integer `a' to
1336 | the quadruple-precision floating-point format.  The conversion is performed
1337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1338 *----------------------------------------------------------------------------*/
1339
1340 float128 int32_to_float128(int32_t a, float_status *status)
1341 {
1342     flag zSign;
1343     uint32_t absA;
1344     int8_t shiftCount;
1345     uint64_t zSig0;
1346
1347     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1348     zSign = ( a < 0 );
1349     absA = zSign ? - a : a;
1350     shiftCount = countLeadingZeros32( absA ) + 17;
1351     zSig0 = absA;
1352     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1353
1354 }
1355
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the 64-bit two's complement integer `a'
1358 | to the single-precision floating-point format.  The conversion is performed
1359 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1360 *----------------------------------------------------------------------------*/
1361
1362 float32 int64_to_float32(int64_t a, float_status *status)
1363 {
1364     flag zSign;
1365     uint64_t absA;
1366     int8_t shiftCount;
1367
1368     if ( a == 0 ) return float32_zero;
1369     zSign = ( a < 0 );
1370     absA = zSign ? - a : a;
1371     shiftCount = countLeadingZeros64( absA ) - 40;
1372     if ( 0 <= shiftCount ) {
1373         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1374     }
1375     else {
1376         shiftCount += 7;
1377         if ( shiftCount < 0 ) {
1378             shift64RightJamming( absA, - shiftCount, &absA );
1379         }
1380         else {
1381             absA <<= shiftCount;
1382         }
1383         return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1384     }
1385
1386 }
1387
1388 /*----------------------------------------------------------------------------
1389 | Returns the result of converting the 64-bit two's complement integer `a'
1390 | to the double-precision floating-point format.  The conversion is performed
1391 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1392 *----------------------------------------------------------------------------*/
1393
1394 float64 int64_to_float64(int64_t a, float_status *status)
1395 {
1396     flag zSign;
1397
1398     if ( a == 0 ) return float64_zero;
1399     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1400         return packFloat64( 1, 0x43E, 0 );
1401     }
1402     zSign = ( a < 0 );
1403     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1404 }
1405
1406 /*----------------------------------------------------------------------------
1407 | Returns the result of converting the 64-bit two's complement integer `a'
1408 | to the extended double-precision floating-point format.  The conversion
1409 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1410 | Arithmetic.
1411 *----------------------------------------------------------------------------*/
1412
1413 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1414 {
1415     flag zSign;
1416     uint64_t absA;
1417     int8_t shiftCount;
1418
1419     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1420     zSign = ( a < 0 );
1421     absA = zSign ? - a : a;
1422     shiftCount = countLeadingZeros64( absA );
1423     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1424
1425 }
1426
1427 /*----------------------------------------------------------------------------
1428 | Returns the result of converting the 64-bit two's complement integer `a' to
1429 | the quadruple-precision floating-point format.  The conversion is performed
1430 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1431 *----------------------------------------------------------------------------*/
1432
1433 float128 int64_to_float128(int64_t a, float_status *status)
1434 {
1435     flag zSign;
1436     uint64_t absA;
1437     int8_t shiftCount;
1438     int32_t zExp;
1439     uint64_t zSig0, zSig1;
1440
1441     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1442     zSign = ( a < 0 );
1443     absA = zSign ? - a : a;
1444     shiftCount = countLeadingZeros64( absA ) + 49;
1445     zExp = 0x406E - shiftCount;
1446     if ( 64 <= shiftCount ) {
1447         zSig1 = 0;
1448         zSig0 = absA;
1449         shiftCount -= 64;
1450     }
1451     else {
1452         zSig1 = absA;
1453         zSig0 = 0;
1454     }
1455     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1456     return packFloat128( zSign, zExp, zSig0, zSig1 );
1457
1458 }
1459
1460 /*----------------------------------------------------------------------------
1461 | Returns the result of converting the 64-bit unsigned integer `a'
1462 | to the single-precision floating-point format.  The conversion is performed
1463 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1464 *----------------------------------------------------------------------------*/
1465
1466 float32 uint64_to_float32(uint64_t a, float_status *status)
1467 {
1468     int shiftcount;
1469
1470     if (a == 0) {
1471         return float32_zero;
1472     }
1473
1474     /* Determine (left) shift needed to put first set bit into bit posn 23
1475      * (since packFloat32() expects the binary point between bits 23 and 22);
1476      * this is the fast case for smallish numbers.
1477      */
1478     shiftcount = countLeadingZeros64(a) - 40;
1479     if (shiftcount >= 0) {
1480         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1481     }
1482     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1483      * expects the binary point between bits 30 and 29, hence the + 7.
1484      */
1485     shiftcount += 7;
1486     if (shiftcount < 0) {
1487         shift64RightJamming(a, -shiftcount, &a);
1488     } else {
1489         a <<= shiftcount;
1490     }
1491
1492     return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1493 }
1494
1495 /*----------------------------------------------------------------------------
1496 | Returns the result of converting the 64-bit unsigned integer `a'
1497 | to the double-precision floating-point format.  The conversion is performed
1498 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1499 *----------------------------------------------------------------------------*/
1500
1501 float64 uint64_to_float64(uint64_t a, float_status *status)
1502 {
1503     int exp = 0x43C;
1504     int shiftcount;
1505
1506     if (a == 0) {
1507         return float64_zero;
1508     }
1509
1510     shiftcount = countLeadingZeros64(a) - 1;
1511     if (shiftcount < 0) {
1512         shift64RightJamming(a, -shiftcount, &a);
1513     } else {
1514         a <<= shiftcount;
1515     }
1516     return roundAndPackFloat64(0, exp - shiftcount, a, status);
1517 }
1518
1519 /*----------------------------------------------------------------------------
1520 | Returns the result of converting the 64-bit unsigned integer `a'
1521 | to the quadruple-precision floating-point format.  The conversion is performed
1522 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1523 *----------------------------------------------------------------------------*/
1524
1525 float128 uint64_to_float128(uint64_t a, float_status *status)
1526 {
1527     if (a == 0) {
1528         return float128_zero;
1529     }
1530     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1531 }
1532
1533 /*----------------------------------------------------------------------------
1534 | Returns the result of converting the single-precision floating-point value
1535 | `a' to the 32-bit two's complement integer format.  The conversion is
1536 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1537 | Arithmetic---which means in particular that the conversion is rounded
1538 | according to the current rounding mode.  If `a' is a NaN, the largest
1539 | positive integer is returned.  Otherwise, if the conversion overflows, the
1540 | largest integer with the same sign as `a' is returned.
1541 *----------------------------------------------------------------------------*/
1542
1543 int32_t float32_to_int32(float32 a, float_status *status)
1544 {
1545     flag aSign;
1546     int aExp;
1547     int shiftCount;
1548     uint32_t aSig;
1549     uint64_t aSig64;
1550
1551     a = float32_squash_input_denormal(a, status);
1552     aSig = extractFloat32Frac( a );
1553     aExp = extractFloat32Exp( a );
1554     aSign = extractFloat32Sign( a );
1555     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1556     if ( aExp ) aSig |= 0x00800000;
1557     shiftCount = 0xAF - aExp;
1558     aSig64 = aSig;
1559     aSig64 <<= 32;
1560     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1561     return roundAndPackInt32(aSign, aSig64, status);
1562
1563 }
1564
1565 /*----------------------------------------------------------------------------
1566 | Returns the result of converting the single-precision floating-point value
1567 | `a' to the 32-bit two's complement integer format.  The conversion is
1568 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1569 | Arithmetic, except that the conversion is always rounded toward zero.
1570 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1571 | the conversion overflows, the largest integer with the same sign as `a' is
1572 | returned.
1573 *----------------------------------------------------------------------------*/
1574
1575 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1576 {
1577     flag aSign;
1578     int aExp;
1579     int shiftCount;
1580     uint32_t aSig;
1581     int32_t z;
1582     a = float32_squash_input_denormal(a, status);
1583
1584     aSig = extractFloat32Frac( a );
1585     aExp = extractFloat32Exp( a );
1586     aSign = extractFloat32Sign( a );
1587     shiftCount = aExp - 0x9E;
1588     if ( 0 <= shiftCount ) {
1589         if ( float32_val(a) != 0xCF000000 ) {
1590             float_raise(float_flag_invalid, status);
1591             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1592         }
1593         return (int32_t) 0x80000000;
1594     }
1595     else if ( aExp <= 0x7E ) {
1596         if (aExp | aSig) {
1597             status->float_exception_flags |= float_flag_inexact;
1598         }
1599         return 0;
1600     }
1601     aSig = ( aSig | 0x00800000 )<<8;
1602     z = aSig>>( - shiftCount );
1603     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1604         status->float_exception_flags |= float_flag_inexact;
1605     }
1606     if ( aSign ) z = - z;
1607     return z;
1608
1609 }
1610
1611 /*----------------------------------------------------------------------------
1612 | Returns the result of converting the single-precision floating-point value
1613 | `a' to the 16-bit two's complement integer format.  The conversion is
1614 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1615 | Arithmetic, except that the conversion is always rounded toward zero.
1616 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1617 | the conversion overflows, the largest integer with the same sign as `a' is
1618 | returned.
1619 *----------------------------------------------------------------------------*/
1620
1621 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1622 {
1623     flag aSign;
1624     int aExp;
1625     int shiftCount;
1626     uint32_t aSig;
1627     int32_t z;
1628
1629     aSig = extractFloat32Frac( a );
1630     aExp = extractFloat32Exp( a );
1631     aSign = extractFloat32Sign( a );
1632     shiftCount = aExp - 0x8E;
1633     if ( 0 <= shiftCount ) {
1634         if ( float32_val(a) != 0xC7000000 ) {
1635             float_raise(float_flag_invalid, status);
1636             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1637                 return 0x7FFF;
1638             }
1639         }
1640         return (int32_t) 0xffff8000;
1641     }
1642     else if ( aExp <= 0x7E ) {
1643         if ( aExp | aSig ) {
1644             status->float_exception_flags |= float_flag_inexact;
1645         }
1646         return 0;
1647     }
1648     shiftCount -= 0x10;
1649     aSig = ( aSig | 0x00800000 )<<8;
1650     z = aSig>>( - shiftCount );
1651     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1652         status->float_exception_flags |= float_flag_inexact;
1653     }
1654     if ( aSign ) {
1655         z = - z;
1656     }
1657     return z;
1658
1659 }
1660
1661 /*----------------------------------------------------------------------------
1662 | Returns the result of converting the single-precision floating-point value
1663 | `a' to the 64-bit two's complement integer format.  The conversion is
1664 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1665 | Arithmetic---which means in particular that the conversion is rounded
1666 | according to the current rounding mode.  If `a' is a NaN, the largest
1667 | positive integer is returned.  Otherwise, if the conversion overflows, the
1668 | largest integer with the same sign as `a' is returned.
1669 *----------------------------------------------------------------------------*/
1670
1671 int64_t float32_to_int64(float32 a, float_status *status)
1672 {
1673     flag aSign;
1674     int aExp;
1675     int shiftCount;
1676     uint32_t aSig;
1677     uint64_t aSig64, aSigExtra;
1678     a = float32_squash_input_denormal(a, status);
1679
1680     aSig = extractFloat32Frac( a );
1681     aExp = extractFloat32Exp( a );
1682     aSign = extractFloat32Sign( a );
1683     shiftCount = 0xBE - aExp;
1684     if ( shiftCount < 0 ) {
1685         float_raise(float_flag_invalid, status);
1686         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1687             return LIT64( 0x7FFFFFFFFFFFFFFF );
1688         }
1689         return (int64_t) LIT64( 0x8000000000000000 );
1690     }
1691     if ( aExp ) aSig |= 0x00800000;
1692     aSig64 = aSig;
1693     aSig64 <<= 40;
1694     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1695     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1696
1697 }
1698
1699 /*----------------------------------------------------------------------------
1700 | Returns the result of converting the single-precision floating-point value
1701 | `a' to the 64-bit unsigned integer format.  The conversion is
1702 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1703 | Arithmetic---which means in particular that the conversion is rounded
1704 | according to the current rounding mode.  If `a' is a NaN, the largest
1705 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1706 | largest unsigned integer is returned.  If the 'a' is negative, the result
1707 | is rounded and zero is returned; values that do not round to zero will
1708 | raise the inexact exception flag.
1709 *----------------------------------------------------------------------------*/
1710
1711 uint64_t float32_to_uint64(float32 a, float_status *status)
1712 {
1713     flag aSign;
1714     int aExp;
1715     int shiftCount;
1716     uint32_t aSig;
1717     uint64_t aSig64, aSigExtra;
1718     a = float32_squash_input_denormal(a, status);
1719
1720     aSig = extractFloat32Frac(a);
1721     aExp = extractFloat32Exp(a);
1722     aSign = extractFloat32Sign(a);
1723     if ((aSign) && (aExp > 126)) {
1724         float_raise(float_flag_invalid, status);
1725         if (float32_is_any_nan(a)) {
1726             return LIT64(0xFFFFFFFFFFFFFFFF);
1727         } else {
1728             return 0;
1729         }
1730     }
1731     shiftCount = 0xBE - aExp;
1732     if (aExp) {
1733         aSig |= 0x00800000;
1734     }
1735     if (shiftCount < 0) {
1736         float_raise(float_flag_invalid, status);
1737         return LIT64(0xFFFFFFFFFFFFFFFF);
1738     }
1739
1740     aSig64 = aSig;
1741     aSig64 <<= 40;
1742     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1743     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1744 }
1745
1746 /*----------------------------------------------------------------------------
1747 | Returns the result of converting the single-precision floating-point value
1748 | `a' to the 64-bit unsigned integer format.  The conversion is
1749 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1750 | Arithmetic, except that the conversion is always rounded toward zero.  If
1751 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1752 | conversion overflows, the largest unsigned integer is returned.  If the
1753 | 'a' is negative, the result is rounded and zero is returned; values that do
1754 | not round to zero will raise the inexact flag.
1755 *----------------------------------------------------------------------------*/
1756
1757 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1758 {
1759     signed char current_rounding_mode = status->float_rounding_mode;
1760     set_float_rounding_mode(float_round_to_zero, status);
1761     int64_t v = float32_to_uint64(a, status);
1762     set_float_rounding_mode(current_rounding_mode, status);
1763     return v;
1764 }
1765
1766 /*----------------------------------------------------------------------------
1767 | Returns the result of converting the single-precision floating-point value
1768 | `a' to the 64-bit two's complement integer format.  The conversion is
1769 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1770 | Arithmetic, except that the conversion is always rounded toward zero.  If
1771 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1772 | conversion overflows, the largest integer with the same sign as `a' is
1773 | returned.
1774 *----------------------------------------------------------------------------*/
1775
1776 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1777 {
1778     flag aSign;
1779     int aExp;
1780     int shiftCount;
1781     uint32_t aSig;
1782     uint64_t aSig64;
1783     int64_t z;
1784     a = float32_squash_input_denormal(a, status);
1785
1786     aSig = extractFloat32Frac( a );
1787     aExp = extractFloat32Exp( a );
1788     aSign = extractFloat32Sign( a );
1789     shiftCount = aExp - 0xBE;
1790     if ( 0 <= shiftCount ) {
1791         if ( float32_val(a) != 0xDF000000 ) {
1792             float_raise(float_flag_invalid, status);
1793             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1794                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1795             }
1796         }
1797         return (int64_t) LIT64( 0x8000000000000000 );
1798     }
1799     else if ( aExp <= 0x7E ) {
1800         if (aExp | aSig) {
1801             status->float_exception_flags |= float_flag_inexact;
1802         }
1803         return 0;
1804     }
1805     aSig64 = aSig | 0x00800000;
1806     aSig64 <<= 40;
1807     z = aSig64>>( - shiftCount );
1808     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1809         status->float_exception_flags |= float_flag_inexact;
1810     }
1811     if ( aSign ) z = - z;
1812     return z;
1813
1814 }
1815
1816 /*----------------------------------------------------------------------------
1817 | Returns the result of converting the single-precision floating-point value
1818 | `a' to the double-precision floating-point format.  The conversion is
1819 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1820 | Arithmetic.
1821 *----------------------------------------------------------------------------*/
1822
1823 float64 float32_to_float64(float32 a, float_status *status)
1824 {
1825     flag aSign;
1826     int aExp;
1827     uint32_t aSig;
1828     a = float32_squash_input_denormal(a, status);
1829
1830     aSig = extractFloat32Frac( a );
1831     aExp = extractFloat32Exp( a );
1832     aSign = extractFloat32Sign( a );
1833     if ( aExp == 0xFF ) {
1834         if (aSig) {
1835             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1836         }
1837         return packFloat64( aSign, 0x7FF, 0 );
1838     }
1839     if ( aExp == 0 ) {
1840         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1841         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1842         --aExp;
1843     }
1844     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1845
1846 }
1847
1848 /*----------------------------------------------------------------------------
1849 | Returns the result of converting the single-precision floating-point value
1850 | `a' to the extended double-precision floating-point format.  The conversion
1851 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1852 | Arithmetic.
1853 *----------------------------------------------------------------------------*/
1854
1855 floatx80 float32_to_floatx80(float32 a, float_status *status)
1856 {
1857     flag aSign;
1858     int aExp;
1859     uint32_t aSig;
1860
1861     a = float32_squash_input_denormal(a, status);
1862     aSig = extractFloat32Frac( a );
1863     aExp = extractFloat32Exp( a );
1864     aSign = extractFloat32Sign( a );
1865     if ( aExp == 0xFF ) {
1866         if (aSig) {
1867             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1868         }
1869         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1870     }
1871     if ( aExp == 0 ) {
1872         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1873         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1874     }
1875     aSig |= 0x00800000;
1876     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1877
1878 }
1879
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/
1886
1887 float128 float32_to_float128(float32 a, float_status *status)
1888 {
1889     flag aSign;
1890     int aExp;
1891     uint32_t aSig;
1892
1893     a = float32_squash_input_denormal(a, status);
1894     aSig = extractFloat32Frac( a );
1895     aExp = extractFloat32Exp( a );
1896     aSign = extractFloat32Sign( a );
1897     if ( aExp == 0xFF ) {
1898         if (aSig) {
1899             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1900         }
1901         return packFloat128( aSign, 0x7FFF, 0, 0 );
1902     }
1903     if ( aExp == 0 ) {
1904         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1905         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1906         --aExp;
1907     }
1908     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1909
1910 }
1911
1912 /*----------------------------------------------------------------------------
1913 | Rounds the single-precision floating-point value `a' to an integer, and
1914 | returns the result as a single-precision floating-point value.  The
1915 | operation is performed according to the IEC/IEEE Standard for Binary
1916 | Floating-Point Arithmetic.
1917 *----------------------------------------------------------------------------*/
1918
1919 float32 float32_round_to_int(float32 a, float_status *status)
1920 {
1921     flag aSign;
1922     int aExp;
1923     uint32_t lastBitMask, roundBitsMask;
1924     uint32_t z;
1925     a = float32_squash_input_denormal(a, status);
1926
1927     aExp = extractFloat32Exp( a );
1928     if ( 0x96 <= aExp ) {
1929         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1930             return propagateFloat32NaN(a, a, status);
1931         }
1932         return a;
1933     }
1934     if ( aExp <= 0x7E ) {
1935         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1936         status->float_exception_flags |= float_flag_inexact;
1937         aSign = extractFloat32Sign( a );
1938         switch (status->float_rounding_mode) {
1939          case float_round_nearest_even:
1940             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1941                 return packFloat32( aSign, 0x7F, 0 );
1942             }
1943             break;
1944         case float_round_ties_away:
1945             if (aExp == 0x7E) {
1946                 return packFloat32(aSign, 0x7F, 0);
1947             }
1948             break;
1949          case float_round_down:
1950             return make_float32(aSign ? 0xBF800000 : 0);
1951          case float_round_up:
1952             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1953         }
1954         return packFloat32( aSign, 0, 0 );
1955     }
1956     lastBitMask = 1;
1957     lastBitMask <<= 0x96 - aExp;
1958     roundBitsMask = lastBitMask - 1;
1959     z = float32_val(a);
1960     switch (status->float_rounding_mode) {
1961     case float_round_nearest_even:
1962         z += lastBitMask>>1;
1963         if ((z & roundBitsMask) == 0) {
1964             z &= ~lastBitMask;
1965         }
1966         break;
1967     case float_round_ties_away:
1968         z += lastBitMask >> 1;
1969         break;
1970     case float_round_to_zero:
1971         break;
1972     case float_round_up:
1973         if (!extractFloat32Sign(make_float32(z))) {
1974             z += roundBitsMask;
1975         }
1976         break;
1977     case float_round_down:
1978         if (extractFloat32Sign(make_float32(z))) {
1979             z += roundBitsMask;
1980         }
1981         break;
1982     default:
1983         abort();
1984     }
1985     z &= ~ roundBitsMask;
1986     if (z != float32_val(a)) {
1987         status->float_exception_flags |= float_flag_inexact;
1988     }
1989     return make_float32(z);
1990
1991 }
1992
1993 /*----------------------------------------------------------------------------
1994 | Returns the result of adding the absolute values of the single-precision
1995 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1996 | before being returned.  `zSign' is ignored if the result is a NaN.
1997 | The addition is performed according to the IEC/IEEE Standard for Binary
1998 | Floating-Point Arithmetic.
1999 *----------------------------------------------------------------------------*/
2000
2001 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
2002                               float_status *status)
2003 {
2004     int aExp, bExp, zExp;
2005     uint32_t aSig, bSig, zSig;
2006     int expDiff;
2007
2008     aSig = extractFloat32Frac( a );
2009     aExp = extractFloat32Exp( a );
2010     bSig = extractFloat32Frac( b );
2011     bExp = extractFloat32Exp( b );
2012     expDiff = aExp - bExp;
2013     aSig <<= 6;
2014     bSig <<= 6;
2015     if ( 0 < expDiff ) {
2016         if ( aExp == 0xFF ) {
2017             if (aSig) {
2018                 return propagateFloat32NaN(a, b, status);
2019             }
2020             return a;
2021         }
2022         if ( bExp == 0 ) {
2023             --expDiff;
2024         }
2025         else {
2026             bSig |= 0x20000000;
2027         }
2028         shift32RightJamming( bSig, expDiff, &bSig );
2029         zExp = aExp;
2030     }
2031     else if ( expDiff < 0 ) {
2032         if ( bExp == 0xFF ) {
2033             if (bSig) {
2034                 return propagateFloat32NaN(a, b, status);
2035             }
2036             return packFloat32( zSign, 0xFF, 0 );
2037         }
2038         if ( aExp == 0 ) {
2039             ++expDiff;
2040         }
2041         else {
2042             aSig |= 0x20000000;
2043         }
2044         shift32RightJamming( aSig, - expDiff, &aSig );
2045         zExp = bExp;
2046     }
2047     else {
2048         if ( aExp == 0xFF ) {
2049             if (aSig | bSig) {
2050                 return propagateFloat32NaN(a, b, status);
2051             }
2052             return a;
2053         }
2054         if ( aExp == 0 ) {
2055             if (status->flush_to_zero) {
2056                 if (aSig | bSig) {
2057                     float_raise(float_flag_output_denormal, status);
2058                 }
2059                 return packFloat32(zSign, 0, 0);
2060             }
2061             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2062         }
2063         zSig = 0x40000000 + aSig + bSig;
2064         zExp = aExp;
2065         goto roundAndPack;
2066     }
2067     aSig |= 0x20000000;
2068     zSig = ( aSig + bSig )<<1;
2069     --zExp;
2070     if ( (int32_t) zSig < 0 ) {
2071         zSig = aSig + bSig;
2072         ++zExp;
2073     }
2074  roundAndPack:
2075     return roundAndPackFloat32(zSign, zExp, zSig, status);
2076
2077 }
2078
2079 /*----------------------------------------------------------------------------
2080 | Returns the result of subtracting the absolute values of the single-
2081 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2082 | difference is negated before being returned.  `zSign' is ignored if the
2083 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2084 | Standard for Binary Floating-Point Arithmetic.
2085 *----------------------------------------------------------------------------*/
2086
2087 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2088                               float_status *status)
2089 {
2090     int aExp, bExp, zExp;
2091     uint32_t aSig, bSig, zSig;
2092     int expDiff;
2093
2094     aSig = extractFloat32Frac( a );
2095     aExp = extractFloat32Exp( a );
2096     bSig = extractFloat32Frac( b );
2097     bExp = extractFloat32Exp( b );
2098     expDiff = aExp - bExp;
2099     aSig <<= 7;
2100     bSig <<= 7;
2101     if ( 0 < expDiff ) goto aExpBigger;
2102     if ( expDiff < 0 ) goto bExpBigger;
2103     if ( aExp == 0xFF ) {
2104         if (aSig | bSig) {
2105             return propagateFloat32NaN(a, b, status);
2106         }
2107         float_raise(float_flag_invalid, status);
2108         return float32_default_nan(status);
2109     }
2110     if ( aExp == 0 ) {
2111         aExp = 1;
2112         bExp = 1;
2113     }
2114     if ( bSig < aSig ) goto aBigger;
2115     if ( aSig < bSig ) goto bBigger;
2116     return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2117  bExpBigger:
2118     if ( bExp == 0xFF ) {
2119         if (bSig) {
2120             return propagateFloat32NaN(a, b, status);
2121         }
2122         return packFloat32( zSign ^ 1, 0xFF, 0 );
2123     }
2124     if ( aExp == 0 ) {
2125         ++expDiff;
2126     }
2127     else {
2128         aSig |= 0x40000000;
2129     }
2130     shift32RightJamming( aSig, - expDiff, &aSig );
2131     bSig |= 0x40000000;
2132  bBigger:
2133     zSig = bSig - aSig;
2134     zExp = bExp;
2135     zSign ^= 1;
2136     goto normalizeRoundAndPack;
2137  aExpBigger:
2138     if ( aExp == 0xFF ) {
2139         if (aSig) {
2140             return propagateFloat32NaN(a, b, status);
2141         }
2142         return a;
2143     }
2144     if ( bExp == 0 ) {
2145         --expDiff;
2146     }
2147     else {
2148         bSig |= 0x40000000;
2149     }
2150     shift32RightJamming( bSig, expDiff, &bSig );
2151     aSig |= 0x40000000;
2152  aBigger:
2153     zSig = aSig - bSig;
2154     zExp = aExp;
2155  normalizeRoundAndPack:
2156     --zExp;
2157     return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2158
2159 }
2160
2161 /*----------------------------------------------------------------------------
2162 | Returns the result of adding the single-precision floating-point values `a'
2163 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2164 | Binary Floating-Point Arithmetic.
2165 *----------------------------------------------------------------------------*/
2166
2167 float32 float32_add(float32 a, float32 b, float_status *status)
2168 {
2169     flag aSign, bSign;
2170     a = float32_squash_input_denormal(a, status);
2171     b = float32_squash_input_denormal(b, status);
2172
2173     aSign = extractFloat32Sign( a );
2174     bSign = extractFloat32Sign( b );
2175     if ( aSign == bSign ) {
2176         return addFloat32Sigs(a, b, aSign, status);
2177     }
2178     else {
2179         return subFloat32Sigs(a, b, aSign, status);
2180     }
2181
2182 }
2183
2184 /*----------------------------------------------------------------------------
2185 | Returns the result of subtracting the single-precision floating-point values
2186 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2187 | for Binary Floating-Point Arithmetic.
2188 *----------------------------------------------------------------------------*/
2189
2190 float32 float32_sub(float32 a, float32 b, float_status *status)
2191 {
2192     flag aSign, bSign;
2193     a = float32_squash_input_denormal(a, status);
2194     b = float32_squash_input_denormal(b, status);
2195
2196     aSign = extractFloat32Sign( a );
2197     bSign = extractFloat32Sign( b );
2198     if ( aSign == bSign ) {
2199         return subFloat32Sigs(a, b, aSign, status);
2200     }
2201     else {
2202         return addFloat32Sigs(a, b, aSign, status);
2203     }
2204
2205 }
2206
2207 /*----------------------------------------------------------------------------
2208 | Returns the result of multiplying the single-precision floating-point values
2209 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2210 | for Binary Floating-Point Arithmetic.
2211 *----------------------------------------------------------------------------*/
2212
2213 float32 float32_mul(float32 a, float32 b, float_status *status)
2214 {
2215     flag aSign, bSign, zSign;
2216     int aExp, bExp, zExp;
2217     uint32_t aSig, bSig;
2218     uint64_t zSig64;
2219     uint32_t zSig;
2220
2221     a = float32_squash_input_denormal(a, status);
2222     b = float32_squash_input_denormal(b, status);
2223
2224     aSig = extractFloat32Frac( a );
2225     aExp = extractFloat32Exp( a );
2226     aSign = extractFloat32Sign( a );
2227     bSig = extractFloat32Frac( b );
2228     bExp = extractFloat32Exp( b );
2229     bSign = extractFloat32Sign( b );
2230     zSign = aSign ^ bSign;
2231     if ( aExp == 0xFF ) {
2232         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2233             return propagateFloat32NaN(a, b, status);
2234         }
2235         if ( ( bExp | bSig ) == 0 ) {
2236             float_raise(float_flag_invalid, status);
2237             return float32_default_nan(status);
2238         }
2239         return packFloat32( zSign, 0xFF, 0 );
2240     }
2241     if ( bExp == 0xFF ) {
2242         if (bSig) {
2243             return propagateFloat32NaN(a, b, status);
2244         }
2245         if ( ( aExp | aSig ) == 0 ) {
2246             float_raise(float_flag_invalid, status);
2247             return float32_default_nan(status);
2248         }
2249         return packFloat32( zSign, 0xFF, 0 );
2250     }
2251     if ( aExp == 0 ) {
2252         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2253         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2254     }
2255     if ( bExp == 0 ) {
2256         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2257         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2258     }
2259     zExp = aExp + bExp - 0x7F;
2260     aSig = ( aSig | 0x00800000 )<<7;
2261     bSig = ( bSig | 0x00800000 )<<8;
2262     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2263     zSig = zSig64;
2264     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2265         zSig <<= 1;
2266         --zExp;
2267     }
2268     return roundAndPackFloat32(zSign, zExp, zSig, status);
2269
2270 }
2271
2272 /*----------------------------------------------------------------------------
2273 | Returns the result of dividing the single-precision floating-point value `a'
2274 | by the corresponding value `b'.  The operation is performed according to the
2275 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2276 *----------------------------------------------------------------------------*/
2277
2278 float32 float32_div(float32 a, float32 b, float_status *status)
2279 {
2280     flag aSign, bSign, zSign;
2281     int aExp, bExp, zExp;
2282     uint32_t aSig, bSig, zSig;
2283     a = float32_squash_input_denormal(a, status);
2284     b = float32_squash_input_denormal(b, status);
2285
2286     aSig = extractFloat32Frac( a );
2287     aExp = extractFloat32Exp( a );
2288     aSign = extractFloat32Sign( a );
2289     bSig = extractFloat32Frac( b );
2290     bExp = extractFloat32Exp( b );
2291     bSign = extractFloat32Sign( b );
2292     zSign = aSign ^ bSign;
2293     if ( aExp == 0xFF ) {
2294         if (aSig) {
2295             return propagateFloat32NaN(a, b, status);
2296         }
2297         if ( bExp == 0xFF ) {
2298             if (bSig) {
2299                 return propagateFloat32NaN(a, b, status);
2300             }
2301             float_raise(float_flag_invalid, status);
2302             return float32_default_nan(status);
2303         }
2304         return packFloat32( zSign, 0xFF, 0 );
2305     }
2306     if ( bExp == 0xFF ) {
2307         if (bSig) {
2308             return propagateFloat32NaN(a, b, status);
2309         }
2310         return packFloat32( zSign, 0, 0 );
2311     }
2312     if ( bExp == 0 ) {
2313         if ( bSig == 0 ) {
2314             if ( ( aExp | aSig ) == 0 ) {
2315                 float_raise(float_flag_invalid, status);
2316                 return float32_default_nan(status);
2317             }
2318             float_raise(float_flag_divbyzero, status);
2319             return packFloat32( zSign, 0xFF, 0 );
2320         }
2321         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2322     }
2323     if ( aExp == 0 ) {
2324         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2325         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2326     }
2327     zExp = aExp - bExp + 0x7D;
2328     aSig = ( aSig | 0x00800000 )<<7;
2329     bSig = ( bSig | 0x00800000 )<<8;
2330     if ( bSig <= ( aSig + aSig ) ) {
2331         aSig >>= 1;
2332         ++zExp;
2333     }
2334     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2335     if ( ( zSig & 0x3F ) == 0 ) {
2336         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2337     }
2338     return roundAndPackFloat32(zSign, zExp, zSig, status);
2339
2340 }
2341
2342 /*----------------------------------------------------------------------------
2343 | Returns the remainder of the single-precision floating-point value `a'
2344 | with respect to the corresponding value `b'.  The operation is performed
2345 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2346 *----------------------------------------------------------------------------*/
2347
2348 float32 float32_rem(float32 a, float32 b, float_status *status)
2349 {
2350     flag aSign, zSign;
2351     int aExp, bExp, expDiff;
2352     uint32_t aSig, bSig;
2353     uint32_t q;
2354     uint64_t aSig64, bSig64, q64;
2355     uint32_t alternateASig;
2356     int32_t sigMean;
2357     a = float32_squash_input_denormal(a, status);
2358     b = float32_squash_input_denormal(b, status);
2359
2360     aSig = extractFloat32Frac( a );
2361     aExp = extractFloat32Exp( a );
2362     aSign = extractFloat32Sign( a );
2363     bSig = extractFloat32Frac( b );
2364     bExp = extractFloat32Exp( b );
2365     if ( aExp == 0xFF ) {
2366         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2367             return propagateFloat32NaN(a, b, status);
2368         }
2369         float_raise(float_flag_invalid, status);
2370         return float32_default_nan(status);
2371     }
2372     if ( bExp == 0xFF ) {
2373         if (bSig) {
2374             return propagateFloat32NaN(a, b, status);
2375         }
2376         return a;
2377     }
2378     if ( bExp == 0 ) {
2379         if ( bSig == 0 ) {
2380             float_raise(float_flag_invalid, status);
2381             return float32_default_nan(status);
2382         }
2383         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2384     }
2385     if ( aExp == 0 ) {
2386         if ( aSig == 0 ) return a;
2387         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2388     }
2389     expDiff = aExp - bExp;
2390     aSig |= 0x00800000;
2391     bSig |= 0x00800000;
2392     if ( expDiff < 32 ) {
2393         aSig <<= 8;
2394         bSig <<= 8;
2395         if ( expDiff < 0 ) {
2396             if ( expDiff < -1 ) return a;
2397             aSig >>= 1;
2398         }
2399         q = ( bSig <= aSig );
2400         if ( q ) aSig -= bSig;
2401         if ( 0 < expDiff ) {
2402             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2403             q >>= 32 - expDiff;
2404             bSig >>= 2;
2405             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2406         }
2407         else {
2408             aSig >>= 2;
2409             bSig >>= 2;
2410         }
2411     }
2412     else {
2413         if ( bSig <= aSig ) aSig -= bSig;
2414         aSig64 = ( (uint64_t) aSig )<<40;
2415         bSig64 = ( (uint64_t) bSig )<<40;
2416         expDiff -= 64;
2417         while ( 0 < expDiff ) {
2418             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2419             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2420             aSig64 = - ( ( bSig * q64 )<<38 );
2421             expDiff -= 62;
2422         }
2423         expDiff += 64;
2424         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2425         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2426         q = q64>>( 64 - expDiff );
2427         bSig <<= 6;
2428         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2429     }
2430     do {
2431         alternateASig = aSig;
2432         ++q;
2433         aSig -= bSig;
2434     } while ( 0 <= (int32_t) aSig );
2435     sigMean = aSig + alternateASig;
2436     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2437         aSig = alternateASig;
2438     }
2439     zSign = ( (int32_t) aSig < 0 );
2440     if ( zSign ) aSig = - aSig;
2441     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2442 }
2443
2444 /*----------------------------------------------------------------------------
2445 | Returns the result of multiplying the single-precision floating-point values
2446 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2447 | multiplication.  The operation is performed according to the IEC/IEEE
2448 | Standard for Binary Floating-Point Arithmetic 754-2008.
2449 | The flags argument allows the caller to select negation of the
2450 | addend, the intermediate product, or the final result. (The difference
2451 | between this and having the caller do a separate negation is that negating
2452 | externally will flip the sign bit on NaNs.)
2453 *----------------------------------------------------------------------------*/
2454
2455 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2456                        float_status *status)
2457 {
2458     flag aSign, bSign, cSign, zSign;
2459     int aExp, bExp, cExp, pExp, zExp, expDiff;
2460     uint32_t aSig, bSig, cSig;
2461     flag pInf, pZero, pSign;
2462     uint64_t pSig64, cSig64, zSig64;
2463     uint32_t pSig;
2464     int shiftcount;
2465     flag signflip, infzero;
2466
2467     a = float32_squash_input_denormal(a, status);
2468     b = float32_squash_input_denormal(b, status);
2469     c = float32_squash_input_denormal(c, status);
2470     aSig = extractFloat32Frac(a);
2471     aExp = extractFloat32Exp(a);
2472     aSign = extractFloat32Sign(a);
2473     bSig = extractFloat32Frac(b);
2474     bExp = extractFloat32Exp(b);
2475     bSign = extractFloat32Sign(b);
2476     cSig = extractFloat32Frac(c);
2477     cExp = extractFloat32Exp(c);
2478     cSign = extractFloat32Sign(c);
2479
2480     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2481                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2482
2483     /* It is implementation-defined whether the cases of (0,inf,qnan)
2484      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2485      * they return if they do), so we have to hand this information
2486      * off to the target-specific pick-a-NaN routine.
2487      */
2488     if (((aExp == 0xff) && aSig) ||
2489         ((bExp == 0xff) && bSig) ||
2490         ((cExp == 0xff) && cSig)) {
2491         return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2492     }
2493
2494     if (infzero) {
2495         float_raise(float_flag_invalid, status);
2496         return float32_default_nan(status);
2497     }
2498
2499     if (flags & float_muladd_negate_c) {
2500         cSign ^= 1;
2501     }
2502
2503     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2504
2505     /* Work out the sign and type of the product */
2506     pSign = aSign ^ bSign;
2507     if (flags & float_muladd_negate_product) {
2508         pSign ^= 1;
2509     }
2510     pInf = (aExp == 0xff) || (bExp == 0xff);
2511     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2512
2513     if (cExp == 0xff) {
2514         if (pInf && (pSign ^ cSign)) {
2515             /* addition of opposite-signed infinities => InvalidOperation */
2516             float_raise(float_flag_invalid, status);
2517             return float32_default_nan(status);
2518         }
2519         /* Otherwise generate an infinity of the same sign */
2520         return packFloat32(cSign ^ signflip, 0xff, 0);
2521     }
2522
2523     if (pInf) {
2524         return packFloat32(pSign ^ signflip, 0xff, 0);
2525     }
2526
2527     if (pZero) {
2528         if (cExp == 0) {
2529             if (cSig == 0) {
2530                 /* Adding two exact zeroes */
2531                 if (pSign == cSign) {
2532                     zSign = pSign;
2533                 } else if (status->float_rounding_mode == float_round_down) {
2534                     zSign = 1;
2535                 } else {
2536                     zSign = 0;
2537                 }
2538                 return packFloat32(zSign ^ signflip, 0, 0);
2539             }
2540             /* Exact zero plus a denorm */
2541             if (status->flush_to_zero) {
2542                 float_raise(float_flag_output_denormal, status);
2543                 return packFloat32(cSign ^ signflip, 0, 0);
2544             }
2545         }
2546         /* Zero plus something non-zero : just return the something */
2547         if (flags & float_muladd_halve_result) {
2548             if (cExp == 0) {
2549                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2550             }
2551             /* Subtract one to halve, and one again because roundAndPackFloat32
2552              * wants one less than the true exponent.
2553              */
2554             cExp -= 2;
2555             cSig = (cSig | 0x00800000) << 7;
2556             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2557         }
2558         return packFloat32(cSign ^ signflip, cExp, cSig);
2559     }
2560
2561     if (aExp == 0) {
2562         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2563     }
2564     if (bExp == 0) {
2565         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2566     }
2567
2568     /* Calculate the actual result a * b + c */
2569
2570     /* Multiply first; this is easy. */
2571     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2572      * because we want the true exponent, not the "one-less-than"
2573      * flavour that roundAndPackFloat32() takes.
2574      */
2575     pExp = aExp + bExp - 0x7e;
2576     aSig = (aSig | 0x00800000) << 7;
2577     bSig = (bSig | 0x00800000) << 8;
2578     pSig64 = (uint64_t)aSig * bSig;
2579     if ((int64_t)(pSig64 << 1) >= 0) {
2580         pSig64 <<= 1;
2581         pExp--;
2582     }
2583
2584     zSign = pSign ^ signflip;
2585
2586     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2587      * position 62.
2588      */
2589     if (cExp == 0) {
2590         if (!cSig) {
2591             /* Throw out the special case of c being an exact zero now */
2592             shift64RightJamming(pSig64, 32, &pSig64);
2593             pSig = pSig64;
2594             if (flags & float_muladd_halve_result) {
2595                 pExp--;
2596             }
2597             return roundAndPackFloat32(zSign, pExp - 1,
2598                                        pSig, status);
2599         }
2600         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2601     }
2602
2603     cSig64 = (uint64_t)cSig << (62 - 23);
2604     cSig64 |= LIT64(0x4000000000000000);
2605     expDiff = pExp - cExp;
2606
2607     if (pSign == cSign) {
2608         /* Addition */
2609         if (expDiff > 0) {
2610             /* scale c to match p */
2611             shift64RightJamming(cSig64, expDiff, &cSig64);
2612             zExp = pExp;
2613         } else if (expDiff < 0) {
2614             /* scale p to match c */
2615             shift64RightJamming(pSig64, -expDiff, &pSig64);
2616             zExp = cExp;
2617         } else {
2618             /* no scaling needed */
2619             zExp = cExp;
2620         }
2621         /* Add significands and make sure explicit bit ends up in posn 62 */
2622         zSig64 = pSig64 + cSig64;
2623         if ((int64_t)zSig64 < 0) {
2624             shift64RightJamming(zSig64, 1, &zSig64);
2625         } else {
2626             zExp--;
2627         }
2628     } else {
2629         /* Subtraction */
2630         if (expDiff > 0) {
2631             shift64RightJamming(cSig64, expDiff, &cSig64);
2632             zSig64 = pSig64 - cSig64;
2633             zExp = pExp;
2634         } else if (expDiff < 0) {
2635             shift64RightJamming(pSig64, -expDiff, &pSig64);
2636             zSig64 = cSig64 - pSig64;
2637             zExp = cExp;
2638             zSign ^= 1;
2639         } else {
2640             zExp = pExp;
2641             if (cSig64 < pSig64) {
2642                 zSig64 = pSig64 - cSig64;
2643             } else if (pSig64 < cSig64) {
2644                 zSig64 = cSig64 - pSig64;
2645                 zSign ^= 1;
2646             } else {
2647                 /* Exact zero */
2648                 zSign = signflip;
2649                 if (status->float_rounding_mode == float_round_down) {
2650                     zSign ^= 1;
2651                 }
2652                 return packFloat32(zSign, 0, 0);
2653             }
2654         }
2655         --zExp;
2656         /* Normalize to put the explicit bit back into bit 62. */
2657         shiftcount = countLeadingZeros64(zSig64) - 1;
2658         zSig64 <<= shiftcount;
2659         zExp -= shiftcount;
2660     }
2661     if (flags & float_muladd_halve_result) {
2662         zExp--;
2663     }
2664
2665     shift64RightJamming(zSig64, 32, &zSig64);
2666     return roundAndPackFloat32(zSign, zExp, zSig64, status);
2667 }
2668
2669
2670 /*----------------------------------------------------------------------------
2671 | Returns the square root of the single-precision floating-point value `a'.
2672 | The operation is performed according to the IEC/IEEE Standard for Binary
2673 | Floating-Point Arithmetic.
2674 *----------------------------------------------------------------------------*/
2675
2676 float32 float32_sqrt(float32 a, float_status *status)
2677 {
2678     flag aSign;
2679     int aExp, zExp;
2680     uint32_t aSig, zSig;
2681     uint64_t rem, term;
2682     a = float32_squash_input_denormal(a, status);
2683
2684     aSig = extractFloat32Frac( a );
2685     aExp = extractFloat32Exp( a );
2686     aSign = extractFloat32Sign( a );
2687     if ( aExp == 0xFF ) {
2688         if (aSig) {
2689             return propagateFloat32NaN(a, float32_zero, status);
2690         }
2691         if ( ! aSign ) return a;
2692         float_raise(float_flag_invalid, status);
2693         return float32_default_nan(status);
2694     }
2695     if ( aSign ) {
2696         if ( ( aExp | aSig ) == 0 ) return a;
2697         float_raise(float_flag_invalid, status);
2698         return float32_default_nan(status);
2699     }
2700     if ( aExp == 0 ) {
2701         if ( aSig == 0 ) return float32_zero;
2702         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2703     }
2704     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2705     aSig = ( aSig | 0x00800000 )<<8;
2706     zSig = estimateSqrt32( aExp, aSig ) + 2;
2707     if ( ( zSig & 0x7F ) <= 5 ) {
2708         if ( zSig < 2 ) {
2709             zSig = 0x7FFFFFFF;
2710             goto roundAndPack;
2711         }
2712         aSig >>= aExp & 1;
2713         term = ( (uint64_t) zSig ) * zSig;
2714         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2715         while ( (int64_t) rem < 0 ) {
2716             --zSig;
2717             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2718         }
2719         zSig |= ( rem != 0 );
2720     }
2721     shift32RightJamming( zSig, 1, &zSig );
2722  roundAndPack:
2723     return roundAndPackFloat32(0, zExp, zSig, status);
2724
2725 }
2726
2727 /*----------------------------------------------------------------------------
2728 | Returns the binary exponential of the single-precision floating-point value
2729 | `a'. The operation is performed according to the IEC/IEEE Standard for
2730 | Binary Floating-Point Arithmetic.
2731 |
2732 | Uses the following identities:
2733 |
2734 | 1. -------------------------------------------------------------------------
2735 |      x    x*ln(2)
2736 |     2  = e
2737 |
2738 | 2. -------------------------------------------------------------------------
2739 |                      2     3     4     5           n
2740 |      x        x     x     x     x     x           x
2741 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2742 |               1!    2!    3!    4!    5!          n!
2743 *----------------------------------------------------------------------------*/
2744
2745 static const float64 float32_exp2_coefficients[15] =
2746 {
2747     const_float64( 0x3ff0000000000000ll ), /*  1 */
2748     const_float64( 0x3fe0000000000000ll ), /*  2 */
2749     const_float64( 0x3fc5555555555555ll ), /*  3 */
2750     const_float64( 0x3fa5555555555555ll ), /*  4 */
2751     const_float64( 0x3f81111111111111ll ), /*  5 */
2752     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2753     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2754     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2755     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2756     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2757     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2758     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2759     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2760     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2761     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2762 };
2763
2764 float32 float32_exp2(float32 a, float_status *status)
2765 {
2766     flag aSign;
2767     int aExp;
2768     uint32_t aSig;
2769     float64 r, x, xn;
2770     int i;
2771     a = float32_squash_input_denormal(a, status);
2772
2773     aSig = extractFloat32Frac( a );
2774     aExp = extractFloat32Exp( a );
2775     aSign = extractFloat32Sign( a );
2776
2777     if ( aExp == 0xFF) {
2778         if (aSig) {
2779             return propagateFloat32NaN(a, float32_zero, status);
2780         }
2781         return (aSign) ? float32_zero : a;
2782     }
2783     if (aExp == 0) {
2784         if (aSig == 0) return float32_one;
2785     }
2786
2787     float_raise(float_flag_inexact, status);
2788
2789     /* ******************************* */
2790     /* using float64 for approximation */
2791     /* ******************************* */
2792     x = float32_to_float64(a, status);
2793     x = float64_mul(x, float64_ln2, status);
2794
2795     xn = x;
2796     r = float64_one;
2797     for (i = 0 ; i < 15 ; i++) {
2798         float64 f;
2799
2800         f = float64_mul(xn, float32_exp2_coefficients[i], status);
2801         r = float64_add(r, f, status);
2802
2803         xn = float64_mul(xn, x, status);
2804     }
2805
2806     return float64_to_float32(r, status);
2807 }
2808
2809 /*----------------------------------------------------------------------------
2810 | Returns the binary log of the single-precision floating-point value `a'.
2811 | The operation is performed according to the IEC/IEEE Standard for Binary
2812 | Floating-Point Arithmetic.
2813 *----------------------------------------------------------------------------*/
2814 float32 float32_log2(float32 a, float_status *status)
2815 {
2816     flag aSign, zSign;
2817     int aExp;
2818     uint32_t aSig, zSig, i;
2819
2820     a = float32_squash_input_denormal(a, status);
2821     aSig = extractFloat32Frac( a );
2822     aExp = extractFloat32Exp( a );
2823     aSign = extractFloat32Sign( a );
2824
2825     if ( aExp == 0 ) {
2826         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2827         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2828     }
2829     if ( aSign ) {
2830         float_raise(float_flag_invalid, status);
2831         return float32_default_nan(status);
2832     }
2833     if ( aExp == 0xFF ) {
2834         if (aSig) {
2835             return propagateFloat32NaN(a, float32_zero, status);
2836         }
2837         return a;
2838     }
2839
2840     aExp -= 0x7F;
2841     aSig |= 0x00800000;
2842     zSign = aExp < 0;
2843     zSig = aExp << 23;
2844
2845     for (i = 1 << 22; i > 0; i >>= 1) {
2846         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2847         if ( aSig & 0x01000000 ) {
2848             aSig >>= 1;
2849             zSig |= i;
2850         }
2851     }
2852
2853     if ( zSign )
2854         zSig = -zSig;
2855
2856     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2857 }
2858
2859 /*----------------------------------------------------------------------------
2860 | Returns 1 if the single-precision floating-point value `a' is equal to
2861 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2862 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2863 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864 *----------------------------------------------------------------------------*/
2865
2866 int float32_eq(float32 a, float32 b, float_status *status)
2867 {
2868     uint32_t av, bv;
2869     a = float32_squash_input_denormal(a, status);
2870     b = float32_squash_input_denormal(b, status);
2871
2872     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2873          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2874        ) {
2875         float_raise(float_flag_invalid, status);
2876         return 0;
2877     }
2878     av = float32_val(a);
2879     bv = float32_val(b);
2880     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2881 }
2882
2883 /*----------------------------------------------------------------------------
2884 | Returns 1 if the single-precision floating-point value `a' is less than
2885 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2886 | exception is raised if either operand is a NaN.  The comparison is performed
2887 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2888 *----------------------------------------------------------------------------*/
2889
2890 int float32_le(float32 a, float32 b, float_status *status)
2891 {
2892     flag aSign, bSign;
2893     uint32_t av, bv;
2894     a = float32_squash_input_denormal(a, status);
2895     b = float32_squash_input_denormal(b, status);
2896
2897     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2898          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2899        ) {
2900         float_raise(float_flag_invalid, status);
2901         return 0;
2902     }
2903     aSign = extractFloat32Sign( a );
2904     bSign = extractFloat32Sign( b );
2905     av = float32_val(a);
2906     bv = float32_val(b);
2907     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2908     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2909
2910 }
2911
2912 /*----------------------------------------------------------------------------
2913 | Returns 1 if the single-precision floating-point value `a' is less than
2914 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2915 | raised if either operand is a NaN.  The comparison is performed according
2916 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2917 *----------------------------------------------------------------------------*/
2918
2919 int float32_lt(float32 a, float32 b, float_status *status)
2920 {
2921     flag aSign, bSign;
2922     uint32_t av, bv;
2923     a = float32_squash_input_denormal(a, status);
2924     b = float32_squash_input_denormal(b, status);
2925
2926     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2927          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2928        ) {
2929         float_raise(float_flag_invalid, status);
2930         return 0;
2931     }
2932     aSign = extractFloat32Sign( a );
2933     bSign = extractFloat32Sign( b );
2934     av = float32_val(a);
2935     bv = float32_val(b);
2936     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2937     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2938
2939 }
2940
2941 /*----------------------------------------------------------------------------
2942 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2943 | be compared, and 0 otherwise.  The invalid exception is raised if either
2944 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2945 | Standard for Binary Floating-Point Arithmetic.
2946 *----------------------------------------------------------------------------*/
2947
2948 int float32_unordered(float32 a, float32 b, float_status *status)
2949 {
2950     a = float32_squash_input_denormal(a, status);
2951     b = float32_squash_input_denormal(b, status);
2952
2953     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2954          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2955        ) {
2956         float_raise(float_flag_invalid, status);
2957         return 1;
2958     }
2959     return 0;
2960 }
2961
2962 /*----------------------------------------------------------------------------
2963 | Returns 1 if the single-precision floating-point value `a' is equal to
2964 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2965 | exception.  The comparison is performed according to the IEC/IEEE Standard
2966 | for Binary Floating-Point Arithmetic.
2967 *----------------------------------------------------------------------------*/
2968
2969 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2970 {
2971     a = float32_squash_input_denormal(a, status);
2972     b = float32_squash_input_denormal(b, status);
2973
2974     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2975          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2976        ) {
2977         if (float32_is_signaling_nan(a, status)
2978          || float32_is_signaling_nan(b, status)) {
2979             float_raise(float_flag_invalid, status);
2980         }
2981         return 0;
2982     }
2983     return ( float32_val(a) == float32_val(b) ) ||
2984             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2985 }
2986
2987 /*----------------------------------------------------------------------------
2988 | Returns 1 if the single-precision floating-point value `a' is less than or
2989 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2990 | cause an exception.  Otherwise, the comparison is performed according to the
2991 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2992 *----------------------------------------------------------------------------*/
2993
2994 int float32_le_quiet(float32 a, float32 b, float_status *status)
2995 {
2996     flag aSign, bSign;
2997     uint32_t av, bv;
2998     a = float32_squash_input_denormal(a, status);
2999     b = float32_squash_input_denormal(b, status);
3000
3001     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3002          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3003        ) {
3004         if (float32_is_signaling_nan(a, status)
3005          || float32_is_signaling_nan(b, status)) {
3006             float_raise(float_flag_invalid, status);
3007         }
3008         return 0;
3009     }
3010     aSign = extractFloat32Sign( a );
3011     bSign = extractFloat32Sign( b );
3012     av = float32_val(a);
3013     bv = float32_val(b);
3014     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3015     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3016
3017 }
3018
3019 /*----------------------------------------------------------------------------
3020 | Returns 1 if the single-precision floating-point value `a' is less than
3021 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3022 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3023 | Standard for Binary Floating-Point Arithmetic.
3024 *----------------------------------------------------------------------------*/
3025
3026 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3027 {
3028     flag aSign, bSign;
3029     uint32_t av, bv;
3030     a = float32_squash_input_denormal(a, status);
3031     b = float32_squash_input_denormal(b, status);
3032
3033     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3034          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3035        ) {
3036         if (float32_is_signaling_nan(a, status)
3037          || float32_is_signaling_nan(b, status)) {
3038             float_raise(float_flag_invalid, status);
3039         }
3040         return 0;
3041     }
3042     aSign = extractFloat32Sign( a );
3043     bSign = extractFloat32Sign( b );
3044     av = float32_val(a);
3045     bv = float32_val(b);
3046     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3047     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3048
3049 }
3050
3051 /*----------------------------------------------------------------------------
3052 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3053 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3054 | comparison is performed according to the IEC/IEEE Standard for Binary
3055 | Floating-Point Arithmetic.
3056 *----------------------------------------------------------------------------*/
3057
3058 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3059 {
3060     a = float32_squash_input_denormal(a, status);
3061     b = float32_squash_input_denormal(b, status);
3062
3063     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3064          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3065        ) {
3066         if (float32_is_signaling_nan(a, status)
3067          || float32_is_signaling_nan(b, status)) {
3068             float_raise(float_flag_invalid, status);
3069         }
3070         return 1;
3071     }
3072     return 0;
3073 }
3074
3075 /*----------------------------------------------------------------------------
3076 | Returns the result of converting the double-precision floating-point value
3077 | `a' to the 32-bit two's complement integer format.  The conversion is
3078 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3079 | Arithmetic---which means in particular that the conversion is rounded
3080 | according to the current rounding mode.  If `a' is a NaN, the largest
3081 | positive integer is returned.  Otherwise, if the conversion overflows, the
3082 | largest integer with the same sign as `a' is returned.
3083 *----------------------------------------------------------------------------*/
3084
3085 int32_t float64_to_int32(float64 a, float_status *status)
3086 {
3087     flag aSign;
3088     int aExp;
3089     int shiftCount;
3090     uint64_t aSig;
3091     a = float64_squash_input_denormal(a, status);
3092
3093     aSig = extractFloat64Frac( a );
3094     aExp = extractFloat64Exp( a );
3095     aSign = extractFloat64Sign( a );
3096     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3097     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3098     shiftCount = 0x42C - aExp;
3099     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3100     return roundAndPackInt32(aSign, aSig, status);
3101
3102 }
3103
3104 /*----------------------------------------------------------------------------
3105 | Returns the result of converting the double-precision floating-point value
3106 | `a' to the 32-bit two's complement integer format.  The conversion is
3107 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3108 | Arithmetic, except that the conversion is always rounded toward zero.
3109 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3110 | the conversion overflows, the largest integer with the same sign as `a' is
3111 | returned.
3112 *----------------------------------------------------------------------------*/
3113
3114 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3115 {
3116     flag aSign;
3117     int aExp;
3118     int shiftCount;
3119     uint64_t aSig, savedASig;
3120     int32_t z;
3121     a = float64_squash_input_denormal(a, status);
3122
3123     aSig = extractFloat64Frac( a );
3124     aExp = extractFloat64Exp( a );
3125     aSign = extractFloat64Sign( a );
3126     if ( 0x41E < aExp ) {
3127         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3128         goto invalid;
3129     }
3130     else if ( aExp < 0x3FF ) {
3131         if (aExp || aSig) {
3132             status->float_exception_flags |= float_flag_inexact;
3133         }
3134         return 0;
3135     }
3136     aSig |= LIT64( 0x0010000000000000 );
3137     shiftCount = 0x433 - aExp;
3138     savedASig = aSig;
3139     aSig >>= shiftCount;
3140     z = aSig;
3141     if ( aSign ) z = - z;
3142     if ( ( z < 0 ) ^ aSign ) {
3143  invalid:
3144         float_raise(float_flag_invalid, status);
3145         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3146     }
3147     if ( ( aSig<<shiftCount ) != savedASig ) {
3148         status->float_exception_flags |= float_flag_inexact;
3149     }
3150     return z;
3151
3152 }
3153
3154 /*----------------------------------------------------------------------------
3155 | Returns the result of converting the double-precision floating-point value
3156 | `a' to the 16-bit two's complement integer format.  The conversion is
3157 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3158 | Arithmetic, except that the conversion is always rounded toward zero.
3159 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3160 | the conversion overflows, the largest integer with the same sign as `a' is
3161 | returned.
3162 *----------------------------------------------------------------------------*/
3163
3164 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3165 {
3166     flag aSign;
3167     int aExp;
3168     int shiftCount;
3169     uint64_t aSig, savedASig;
3170     int32_t z;
3171
3172     aSig = extractFloat64Frac( a );
3173     aExp = extractFloat64Exp( a );
3174     aSign = extractFloat64Sign( a );
3175     if ( 0x40E < aExp ) {
3176         if ( ( aExp == 0x7FF ) && aSig ) {
3177             aSign = 0;
3178         }
3179         goto invalid;
3180     }
3181     else if ( aExp < 0x3FF ) {
3182         if ( aExp || aSig ) {
3183             status->float_exception_flags |= float_flag_inexact;
3184         }
3185         return 0;
3186     }
3187     aSig |= LIT64( 0x0010000000000000 );
3188     shiftCount = 0x433 - aExp;
3189     savedASig = aSig;
3190     aSig >>= shiftCount;
3191     z = aSig;
3192     if ( aSign ) {
3193         z = - z;
3194     }
3195     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3196  invalid:
3197         float_raise(float_flag_invalid, status);
3198         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3199     }
3200     if ( ( aSig<<shiftCount ) != savedASig ) {
3201         status->float_exception_flags |= float_flag_inexact;
3202     }
3203     return z;
3204 }
3205
3206 /*----------------------------------------------------------------------------
3207 | Returns the result of converting the double-precision floating-point value
3208 | `a' to the 64-bit two's complement integer format.  The conversion is
3209 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3210 | Arithmetic---which means in particular that the conversion is rounded
3211 | according to the current rounding mode.  If `a' is a NaN, the largest
3212 | positive integer is returned.  Otherwise, if the conversion overflows, the
3213 | largest integer with the same sign as `a' is returned.
3214 *----------------------------------------------------------------------------*/
3215
3216 int64_t float64_to_int64(float64 a, float_status *status)
3217 {
3218     flag aSign;
3219     int aExp;
3220     int shiftCount;
3221     uint64_t aSig, aSigExtra;
3222     a = float64_squash_input_denormal(a, status);
3223
3224     aSig = extractFloat64Frac( a );
3225     aExp = extractFloat64Exp( a );
3226     aSign = extractFloat64Sign( a );
3227     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3228     shiftCount = 0x433 - aExp;
3229     if ( shiftCount <= 0 ) {
3230         if ( 0x43E < aExp ) {
3231             float_raise(float_flag_invalid, status);
3232             if (    ! aSign
3233                  || (    ( aExp == 0x7FF )
3234                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3235                ) {
3236                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3237             }
3238             return (int64_t) LIT64( 0x8000000000000000 );
3239         }
3240         aSigExtra = 0;
3241         aSig <<= - shiftCount;
3242     }
3243     else {
3244         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3245     }
3246     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3247
3248 }
3249
3250 /*----------------------------------------------------------------------------
3251 | Returns the result of converting the double-precision floating-point value
3252 | `a' to the 64-bit two's complement integer format.  The conversion is
3253 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3254 | Arithmetic, except that the conversion is always rounded toward zero.
3255 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3256 | the conversion overflows, the largest integer with the same sign as `a' is
3257 | returned.
3258 *----------------------------------------------------------------------------*/
3259
3260 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3261 {
3262     flag aSign;
3263     int aExp;
3264     int shiftCount;
3265     uint64_t aSig;
3266     int64_t z;
3267     a = float64_squash_input_denormal(a, status);
3268
3269     aSig = extractFloat64Frac( a );
3270     aExp = extractFloat64Exp( a );
3271     aSign = extractFloat64Sign( a );
3272     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3273     shiftCount = aExp - 0x433;
3274     if ( 0 <= shiftCount ) {
3275         if ( 0x43E <= aExp ) {
3276             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3277                 float_raise(float_flag_invalid, status);
3278                 if (    ! aSign
3279                      || (    ( aExp == 0x7FF )
3280                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3281                    ) {
3282                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3283                 }
3284             }
3285             return (int64_t) LIT64( 0x8000000000000000 );
3286         }
3287         z = aSig<<shiftCount;
3288     }
3289     else {
3290         if ( aExp < 0x3FE ) {
3291             if (aExp | aSig) {
3292                 status->float_exception_flags |= float_flag_inexact;
3293             }
3294             return 0;
3295         }
3296         z = aSig>>( - shiftCount );
3297         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3298             status->float_exception_flags |= float_flag_inexact;
3299         }
3300     }
3301     if ( aSign ) z = - z;
3302     return z;
3303
3304 }
3305
3306 /*----------------------------------------------------------------------------
3307 | Returns the result of converting the double-precision floating-point value
3308 | `a' to the single-precision floating-point format.  The conversion is
3309 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3310 | Arithmetic.
3311 *----------------------------------------------------------------------------*/
3312
3313 float32 float64_to_float32(float64 a, float_status *status)
3314 {
3315     flag aSign;
3316     int aExp;
3317     uint64_t aSig;
3318     uint32_t zSig;
3319     a = float64_squash_input_denormal(a, status);
3320
3321     aSig = extractFloat64Frac( a );
3322     aExp = extractFloat64Exp( a );
3323     aSign = extractFloat64Sign( a );
3324     if ( aExp == 0x7FF ) {
3325         if (aSig) {
3326             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3327         }
3328         return packFloat32( aSign, 0xFF, 0 );
3329     }
3330     shift64RightJamming( aSig, 22, &aSig );
3331     zSig = aSig;
3332     if ( aExp || zSig ) {
3333         zSig |= 0x40000000;
3334         aExp -= 0x381;
3335     }
3336     return roundAndPackFloat32(aSign, aExp, zSig, status);
3337
3338 }
3339
3340
3341 /*----------------------------------------------------------------------------
3342 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3343 | half-precision floating-point value, returning the result.  After being
3344 | shifted into the proper positions, the three fields are simply added
3345 | together to form the result.  This means that any integer portion of `zSig'
3346 | will be added into the exponent.  Since a properly normalized significand
3347 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3348 | than the desired result exponent whenever `zSig' is a complete, normalized
3349 | significand.
3350 *----------------------------------------------------------------------------*/
3351 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3352 {
3353     return make_float16(
3354         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3355 }
3356
3357 /*----------------------------------------------------------------------------
3358 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3359 | and significand `zSig', and returns the proper half-precision floating-
3360 | point value corresponding to the abstract input.  Ordinarily, the abstract
3361 | value is simply rounded and packed into the half-precision format, with
3362 | the inexact exception raised if the abstract input cannot be represented
3363 | exactly.  However, if the abstract value is too large, the overflow and
3364 | inexact exceptions are raised and an infinity or maximal finite value is
3365 | returned.  If the abstract value is too small, the input value is rounded to
3366 | a subnormal number, and the underflow and inexact exceptions are raised if
3367 | the abstract input cannot be represented exactly as a subnormal half-
3368 | precision floating-point number.
3369 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3370 | ARM-style "alternative representation", which omits the NaN and Inf
3371 | encodings in order to raise the maximum representable exponent by one.
3372 |     The input significand `zSig' has its binary point between bits 22
3373 | and 23, which is 13 bits to the left of the usual location.  This shifted
3374 | significand must be normalized or smaller.  If `zSig' is not normalized,
3375 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3376 | and it must not require rounding.  In the usual case that `zSig' is
3377 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3378 | Note the slightly odd position of the binary point in zSig compared with the
3379 | other roundAndPackFloat functions. This should probably be fixed if we
3380 | need to implement more float16 routines than just conversion.
3381 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3382 | Binary Floating-Point Arithmetic.
3383 *----------------------------------------------------------------------------*/
3384
static float16 roundAndPackFloat16(flag zSign, int zExp,
                                   uint32_t zSig, flag ieee,
                                   float_status *status)
{
    /* Largest zExp that can still be packed as a finite value; the
     * alternative (non-IEEE) format allows one more exponent step.
     */
    int maxexp = ieee ? 29 : 30;
    uint32_t mask;
    uint32_t increment;
    bool rounding_bumps_exp;
    bool is_tiny = false;

    /* Calculate the mask of bits of the mantissa which are not
     * representable in half-precision and will be lost.
     */
    if (zExp < 1) {
        /* Will be denormal in halfprec */
        mask = 0x00ffffff;
        if (zExp >= -11) {
            mask >>= 11 + zExp;
        }
    } else {
        /* Normal number in halfprec */
        mask = 0x00001fff;
    }

    /* Choose the value added to zSig before truncation so that dropping
     * the masked bits afterwards implements the selected rounding mode.
     */
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        increment = (mask + 1) >> 1;
        if ((zSig & mask) == increment) {
            /* Exact tie: add the half only when the lowest kept bit is
             * set, so the result ends up even.
             */
            increment = zSig & (increment << 1);
        }
        break;
    case float_round_ties_away:
        increment = (mask + 1) >> 1;
        break;
    case float_round_up:
        increment = zSign ? 0 : mask;
        break;
    case float_round_down:
        increment = zSign ? mask : 0;
        break;
    default: /* round_to_zero */
        increment = 0;
        break;
    }

    /* True when rounding carries out of bit 23, i.e. past the integer bit. */
    rounding_bumps_exp = (zSig + increment >= 0x01000000);

    if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
        if (ieee) {
            /* IEEE format: overflow produces a signed infinity. */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat16(zSign, 0x1f, 0);
        } else {
            /* Alternative format: raise invalid and return the all-ones
             * exponent/fraction encoding.
             */
            float_raise(float_flag_invalid, status);
            return packFloat16(zSign, 0x1f, 0x3ff);
        }
    }

    if (zExp < 0) {
        /* Note that flush-to-zero does not affect half-precision results */
        is_tiny =
            (status->float_detect_tininess == float_tininess_before_rounding)
            || (zExp < -1)
            || (!rounding_bumps_exp);
    }
    /* Flags are derived from the bits about to be discarded; underflow is
     * only signalled when the result is both tiny and inexact.
     */
    if (zSig & mask) {
        float_raise(float_flag_inexact, status);
        if (is_tiny) {
            float_raise(float_flag_underflow, status);
        }
    }

    zSig += increment;
    if (rounding_bumps_exp) {
        /* Renormalize after the carry. */
        zSig >>= 1;
        zExp++;
    }

    /* NOTE(review): this returns a signed zero for every input that is
     * still below zExp == -10 after rounding, including under
     * float_round_up / float_round_down where the smallest subnormal
     * would appear to be the expected result - confirm whether callers
     * rely on directed rounding here.
     */
    if (zExp < -10) {
        return packFloat16(zSign, 0, 0);
    }
    if (zExp < 0) {
        /* Subnormal result: shift the significand into place, exponent 0. */
        zSig >>= -zExp;
        zExp = 0;
    }
    /* Drop the 13 extra low-order bits of the wide significand. */
    return packFloat16(zSign, zExp, zSig >> 13);
}
3471
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t lshift;

    /* Shift so that the leading set bit of aSig ends up at bit 10, and
     * report the exponent that compensates for that shift.
     */
    lshift = countLeadingZeros32(aSig) - 21;
    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3479
3480 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3481    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3482
3483 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3484 {
3485     flag aSign;
3486     int aExp;
3487     uint32_t aSig;
3488
3489     aSign = extractFloat16Sign(a);
3490     aExp = extractFloat16Exp(a);
3491     aSig = extractFloat16Frac(a);
3492
3493     if (aExp == 0x1f && ieee) {
3494         if (aSig) {
3495             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3496         }
3497         return packFloat32(aSign, 0xff, 0);
3498     }
3499     if (aExp == 0) {
3500         if (aSig == 0) {
3501             return packFloat32(aSign, 0, 0);
3502         }
3503
3504         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3505         aExp--;
3506     }
3507     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3508 }
3509
3510 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3511 {
3512     flag aSign;
3513     int aExp;
3514     uint32_t aSig;
3515
3516     a = float32_squash_input_denormal(a, status);
3517
3518     aSig = extractFloat32Frac( a );
3519     aExp = extractFloat32Exp( a );
3520     aSign = extractFloat32Sign( a );
3521     if ( aExp == 0xFF ) {
3522         if (aSig) {
3523             /* Input is a NaN */
3524             if (!ieee) {
3525                 float_raise(float_flag_invalid, status);
3526                 return packFloat16(aSign, 0, 0);
3527             }
3528             return commonNaNToFloat16(
3529                 float32ToCommonNaN(a, status), status);
3530         }
3531         /* Infinity */
3532         if (!ieee) {
3533             float_raise(float_flag_invalid, status);
3534             return packFloat16(aSign, 0x1f, 0x3ff);
3535         }
3536         return packFloat16(aSign, 0x1f, 0);
3537     }
3538     if (aExp == 0 && aSig == 0) {
3539         return packFloat16(aSign, 0, 0);
3540     }
3541     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3542      * even if the input is denormal; however this is harmless because
3543      * the largest possible single-precision denormal is still smaller
3544      * than the smallest representable half-precision denormal, and so we
3545      * will end up ignoring aSig and returning via the "always return zero"
3546      * codepath.
3547      */
3548     aSig |= 0x00800000;
3549     aExp -= 0x71;
3550
3551     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3552 }
3553
3554 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3555 {
3556     flag aSign;
3557     int aExp;
3558     uint32_t aSig;
3559
3560     aSign = extractFloat16Sign(a);
3561     aExp = extractFloat16Exp(a);
3562     aSig = extractFloat16Frac(a);
3563
3564     if (aExp == 0x1f && ieee) {
3565         if (aSig) {
3566             return commonNaNToFloat64(
3567                 float16ToCommonNaN(a, status), status);
3568         }
3569         return packFloat64(aSign, 0x7ff, 0);
3570     }
3571     if (aExp == 0) {
3572         if (aSig == 0) {
3573             return packFloat64(aSign, 0, 0);
3574         }
3575
3576         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3577         aExp--;
3578     }
3579     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3580 }
3581
3582 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3583 {
3584     flag aSign;
3585     int aExp;
3586     uint64_t aSig;
3587     uint32_t zSig;
3588
3589     a = float64_squash_input_denormal(a, status);
3590
3591     aSig = extractFloat64Frac(a);
3592     aExp = extractFloat64Exp(a);
3593     aSign = extractFloat64Sign(a);
3594     if (aExp == 0x7FF) {
3595         if (aSig) {
3596             /* Input is a NaN */
3597             if (!ieee) {
3598                 float_raise(float_flag_invalid, status);
3599                 return packFloat16(aSign, 0, 0);
3600             }
3601             return commonNaNToFloat16(
3602                 float64ToCommonNaN(a, status), status);
3603         }
3604         /* Infinity */
3605         if (!ieee) {
3606             float_raise(float_flag_invalid, status);
3607             return packFloat16(aSign, 0x1f, 0x3ff);
3608         }
3609         return packFloat16(aSign, 0x1f, 0);
3610     }
3611     shift64RightJamming(aSig, 29, &aSig);
3612     zSig = aSig;
3613     if (aExp == 0 && zSig == 0) {
3614         return packFloat16(aSign, 0, 0);
3615     }
3616     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3617      * even if the input is denormal; however this is harmless because
3618      * the largest possible single-precision denormal is still smaller
3619      * than the smallest representable half-precision denormal, and so we
3620      * will end up ignoring aSig and returning via the "always return zero"
3621      * codepath.
3622      */
3623     zSig |= 0x00800000;
3624     aExp -= 0x3F1;
3625
3626     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3627 }
3628
3629 /*----------------------------------------------------------------------------
3630 | Returns the result of converting the double-precision floating-point value
3631 | `a' to the extended double-precision floating-point format.  The conversion
3632 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3633 | Arithmetic.
3634 *----------------------------------------------------------------------------*/
3635
3636 floatx80 float64_to_floatx80(float64 a, float_status *status)
3637 {
3638     flag aSign;
3639     int aExp;
3640     uint64_t aSig;
3641
3642     a = float64_squash_input_denormal(a, status);
3643     aSig = extractFloat64Frac( a );
3644     aExp = extractFloat64Exp( a );
3645     aSign = extractFloat64Sign( a );
3646     if ( aExp == 0x7FF ) {
3647         if (aSig) {
3648             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3649         }
3650         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3651     }
3652     if ( aExp == 0 ) {
3653         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3654         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3655     }
3656     return
3657         packFloatx80(
3658             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3659
3660 }
3661
3662 /*----------------------------------------------------------------------------
3663 | Returns the result of converting the double-precision floating-point value
3664 | `a' to the quadruple-precision floating-point format.  The conversion is
3665 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3666 | Arithmetic.
3667 *----------------------------------------------------------------------------*/
3668
3669 float128 float64_to_float128(float64 a, float_status *status)
3670 {
3671     flag aSign;
3672     int aExp;
3673     uint64_t aSig, zSig0, zSig1;
3674
3675     a = float64_squash_input_denormal(a, status);
3676     aSig = extractFloat64Frac( a );
3677     aExp = extractFloat64Exp( a );
3678     aSign = extractFloat64Sign( a );
3679     if ( aExp == 0x7FF ) {
3680         if (aSig) {
3681             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3682         }
3683         return packFloat128( aSign, 0x7FFF, 0, 0 );
3684     }
3685     if ( aExp == 0 ) {
3686         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3687         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3688         --aExp;
3689     }
3690     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3691     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3692
3693 }
3694
3695 /*----------------------------------------------------------------------------
3696 | Rounds the double-precision floating-point value `a' to an integer, and
3697 | returns the result as a double-precision floating-point value.  The
3698 | operation is performed according to the IEC/IEEE Standard for Binary
3699 | Floating-Point Arithmetic.
3700 *----------------------------------------------------------------------------*/
3701
float64 float64_round_to_int(float64 a, float_status *status)
{
    flag aSign;
    int aExp;
    uint64_t lastBitMask, roundBitsMask;
    uint64_t z;
    a = float64_squash_input_denormal(a, status);

    aExp = extractFloat64Exp( a );
    /* Exponent 0x433 and above: the value has no fraction bits left (or it
     * is Inf/NaN), so it is returned as is apart from NaN propagation.
     */
    if ( 0x433 <= aExp ) {
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
            return propagateFloat64NaN(a, a, status);
        }
        return a;
    }
    /* Magnitude below one: the result is 0 or 1 in magnitude, selected by
     * the rounding mode.
     */
    if ( aExp < 0x3FF ) {
        /* Shifting out the sign bit leaves zero only for +0 and -0. */
        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
        status->float_exception_flags |= float_flag_inexact;
        aSign = extractFloat64Sign( a );
        switch (status->float_rounding_mode) {
         case float_round_nearest_even:
            /* Strictly above one half rounds to one; exactly one half
             * (zero fraction) falls through to the signed zero below.
             */
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
                return packFloat64( aSign, 0x3FF, 0 );
            }
            break;
        case float_round_ties_away:
            /* One half and above rounds away from zero to one. */
            if (aExp == 0x3FE) {
                return packFloat64(aSign, 0x3ff, 0);
            }
            break;
         case float_round_down:
            /* Negative values go to -1.0, positive values to +0. */
            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
         case float_round_up:
            /* Negative values go to -0, positive values to +1.0. */
            return make_float64(
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
        }
        /* Remaining cases (including modes not listed above) give a zero
         * carrying the sign of the input.
         */
        return packFloat64( aSign, 0, 0 );
    }
    /* General case: work directly on the bit pattern.  lastBitMask selects
     * the lowest bit that is kept, roundBitsMask the bits below it.
     */
    lastBitMask = 1;
    lastBitMask <<= 0x433 - aExp;
    roundBitsMask = lastBitMask - 1;
    z = float64_val(a);
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        z += lastBitMask >> 1;
        /* No round bits left after adding one half means an exact tie:
         * clear the lowest kept bit so the result is even.
         */
        if ((z & roundBitsMask) == 0) {
            z &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        /* Only positive values grow in magnitude when rounding up. */
        if (!extractFloat64Sign(make_float64(z))) {
            z += roundBitsMask;
        }
        break;
    case float_round_down:
        /* Only negative values grow in magnitude when rounding down. */
        if (extractFloat64Sign(make_float64(z))) {
            z += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    /* Discard the bits below the lowest kept bit. */
    z &= ~ roundBitsMask;
    if (z != float64_val(a)) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return make_float64(z);

}
3776
3777 float64 float64_trunc_to_int(float64 a, float_status *status)
3778 {
3779     int oldmode;
3780     float64 res;
3781     oldmode = status->float_rounding_mode;
3782     status->float_rounding_mode = float_round_to_zero;
3783     res = float64_round_to_int(a, status);
3784     status->float_rounding_mode = oldmode;
3785     return res;
3786 }
3787
3788 /*----------------------------------------------------------------------------
3789 | Returns the result of adding the absolute values of the double-precision
3790 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3791 | before being returned.  `zSign' is ignored if the result is a NaN.
3792 | The addition is performed according to the IEC/IEEE Standard for Binary
3793 | Floating-Point Arithmetic.
3794 *----------------------------------------------------------------------------*/
3795
static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
                              float_status *status)
{
    int aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig;
    int expDiff;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    expDiff = aExp - bExp;
    /* Shift both fractions left by 9; the implicit leading one then
     * corresponds to bit 61 (0x2000000000000000).
     */
    aSig <<= 9;
    bSig <<= 9;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b to it. */
        if ( aExp == 0x7FF ) {
            if (aSig) {
                return propagateFloat64NaN(a, b, status);
            }
            /* a is an infinity and b is finite here. */
            return a;
        }
        if ( bExp == 0 ) {
            /* b has a zero exponent field (no implicit one): shift one
             * position less.
             */
            --expDiff;
        }
        else {
            bSig |= LIT64( 0x2000000000000000 );
        }
        shift64RightJamming( bSig, expDiff, &bSig );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a to it. */
        if ( bExp == 0x7FF ) {
            if (bSig) {
                return propagateFloat64NaN(a, b, status);
            }
            /* b is an infinity; the result takes the requested sign. */
            return packFloat64( zSign, 0x7FF, 0 );
        }
        if ( aExp == 0 ) {
            /* a has a zero exponent field (no implicit one): shift one
             * position less.
             */
            ++expDiff;
        }
        else {
            aSig |= LIT64( 0x2000000000000000 );
        }
        shift64RightJamming( aSig, - expDiff, &aSig );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0x7FF ) {
            if (aSig | bSig) {
                return propagateFloat64NaN(a, b, status);
            }
            return a;
        }
        if ( aExp == 0 ) {
            /* Both exponent fields are zero: with flush-to-zero enabled
             * the result is a signed zero (flagging output-denormal when
             * either fraction is non-zero); otherwise the raw fraction sum
             * is packed with a zero exponent, without rounding.
             */
            if (status->flush_to_zero) {
                if (aSig | bSig) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat64(zSign, 0, 0);
            }
            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
        }
        /* Both operands are normal: 0x4000000000000000 is the sum of the
         * two implicit ones (2 * bit 61).
         */
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
        zExp = aExp;
        goto roundAndPack;
    }
    /* Unequal exponents: add the implicit one of the larger operand, which
     * is held in aSig in both branches above (when b was larger, its
     * implicit one is the bit still missing from the sum).
     */
    aSig |= LIT64( 0x2000000000000000 );
    /* First assume the sum did not carry; if the top bit is then set, the
     * sum did carry, so use it unshifted with the exponent restored.
     */
    zSig = ( aSig + bSig )<<1;
    --zExp;
    if ( (int64_t) zSig < 0 ) {
        zSig = aSig + bSig;
        ++zExp;
    }
 roundAndPack:
    return roundAndPackFloat64(zSign, zExp, zSig, status);

}
3873
3874 /*----------------------------------------------------------------------------
3875 | Returns the result of subtracting the absolute values of the double-
3876 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3877 | difference is negated before being returned.  `zSign' is ignored if the
3878 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3879 | Standard for Binary Floating-Point Arithmetic.
3880 *----------------------------------------------------------------------------*/
3881
static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
                              float_status *status)
{
    int aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig;
    int expDiff;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    expDiff = aExp - bExp;
    /* Shift both fractions left by 10; the implicit leading one then
     * corresponds to bit 62 (0x4000000000000000).
     */
    aSig <<= 10;
    bSig <<= 10;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here until the bExpBigger label. */
    if ( aExp == 0x7FF ) {
        if (aSig | bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* Both operands are infinities: raise invalid and return the
         * default NaN.
         */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Both exponent fields are zero: continue with exponent 1 and no
         * implicit one added to either significand.
         */
        aExp = 1;
        bExp = 1;
    }
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Identical magnitudes: the result is a zero whose sign bit is set
     * only in round-down mode.
     */
    return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* b is an infinity: the result is an infinity of the opposite sign. */
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
    }
    if ( aExp == 0 ) {
        /* a has a zero exponent field (no implicit one): shift one
         * position less.
         */
        ++expDiff;
    }
    else {
        aSig |= LIT64( 0x4000000000000000 );
    }
    shift64RightJamming( aSig, - expDiff, &aSig );
    bSig |= LIT64( 0x4000000000000000 );
 bBigger:
    /* The larger magnitude is b, so the sign of the result is flipped. */
    zSig = bSig - aSig;
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* a is an infinity and b is finite here. */
        return a;
    }
    if ( bExp == 0 ) {
        /* b has a zero exponent field (no implicit one): shift one
         * position less.
         */
        --expDiff;
    }
    else {
        bSig |= LIT64( 0x4000000000000000 );
    }
    shift64RightJamming( bSig, expDiff, &bSig );
    aSig |= LIT64( 0x4000000000000000 );
 aBigger:
    zSig = aSig - bSig;
    zExp = aExp;
 normalizeRoundAndPack:
    /* The exponent is decremented by one before the difference is handed
     * to the normalizing round-and-pack helper.
     */
    --zExp;
    return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);

}
3955
3956 /*----------------------------------------------------------------------------
3957 | Returns the result of adding the double-precision floating-point values `a'
3958 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3959 | Binary Floating-Point Arithmetic.
3960 *----------------------------------------------------------------------------*/
3961
3962 float64 float64_add(float64 a, float64 b, float_status *status)
3963 {
3964     flag aSign, bSign;
3965     a = float64_squash_input_denormal(a, status);
3966     b = float64_squash_input_denormal(b, status);
3967
3968     aSign = extractFloat64Sign( a );
3969     bSign = extractFloat64Sign( b );
3970     if ( aSign == bSign ) {
3971         return addFloat64Sigs(a, b, aSign, status);
3972     }
3973     else {
3974         return subFloat64Sigs(a, b, aSign, status);
3975     }
3976
3977 }
3978
3979 /*----------------------------------------------------------------------------
3980 | Returns the result of subtracting the double-precision floating-point values
3981 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3982 | for Binary Floating-Point Arithmetic.
3983 *----------------------------------------------------------------------------*/
3984
3985 float64 float64_sub(float64 a, float64 b, float_status *status)
3986 {
3987     flag aSign, bSign;
3988     a = float64_squash_input_denormal(a, status);
3989     b = float64_squash_input_denormal(b, status);
3990
3991     aSign = extractFloat64Sign( a );
3992     bSign = extractFloat64Sign( b );
3993     if ( aSign == bSign ) {
3994         return subFloat64Sigs(a, b, aSign, status);
3995     }
3996     else {
3997         return addFloat64Sigs(a, b, aSign, status);
3998     }
3999
4000 }
4001
4002 /*----------------------------------------------------------------------------
4003 | Returns the result of multiplying the double-precision floating-point values
4004 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4005 | for Binary Floating-Point Arithmetic.
4006 *----------------------------------------------------------------------------*/
4007
4008 float64 float64_mul(float64 a, float64 b, float_status *status)
4009 {
4010     flag aSign, bSign, zSign;
4011     int aExp, bExp, zExp;
4012     uint64_t aSig, bSig, zSig0, zSig1;
4013
4014     a = float64_squash_input_denormal(a, status);
4015     b = float64_squash_input_denormal(b, status);
4016
4017     aSig = extractFloat64Frac( a );
4018     aExp = extractFloat64Exp( a );
4019     aSign = extractFloat64Sign( a );
4020     bSig = extractFloat64Frac( b );
4021     bExp = extractFloat64Exp( b );
4022     bSign = extractFloat64Sign( b );
4023     zSign = aSign ^ bSign;
4024     if ( aExp == 0x7FF ) {
4025         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4026             return propagateFloat64NaN(a, b, status);
4027         }
4028         if ( ( bExp | bSig ) == 0 ) {
4029             float_raise(float_flag_invalid, status);
4030             return float64_default_nan(status);
4031         }
4032         return packFloat64( zSign, 0x7FF, 0 );
4033     }
4034     if ( bExp == 0x7FF ) {
4035         if (bSig) {
4036             return propagateFloat64NaN(a, b, status);
4037         }
4038         if ( ( aExp | aSig ) == 0 ) {
4039             float_raise(float_flag_invalid, status);
4040             return float64_default_nan(status);
4041         }
4042         return packFloat64( zSign, 0x7FF, 0 );
4043     }
4044     if ( aExp == 0 ) {
4045         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4046         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4047     }
4048     if ( bExp == 0 ) {
4049         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4050         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4051     }
4052     zExp = aExp + bExp - 0x3FF;
4053     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4054     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4055     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4056     zSig0 |= ( zSig1 != 0 );
4057     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4058         zSig0 <<= 1;
4059         --zExp;
4060     }
4061     return roundAndPackFloat64(zSign, zExp, zSig0, status);
4062
4063 }
4064
4065 /*----------------------------------------------------------------------------
4066 | Returns the result of dividing the double-precision floating-point value `a'
4067 | by the corresponding value `b'.  The operation is performed according to
4068 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4069 *----------------------------------------------------------------------------*/
4070
4071 float64 float64_div(float64 a, float64 b, float_status *status)
4072 {
4073     flag aSign, bSign, zSign;
4074     int aExp, bExp, zExp;
4075     uint64_t aSig, bSig, zSig;
4076     uint64_t rem0, rem1;
4077     uint64_t term0, term1;
4078     a = float64_squash_input_denormal(a, status);
4079     b = float64_squash_input_denormal(b, status);
4080
4081     aSig = extractFloat64Frac( a );
4082     aExp = extractFloat64Exp( a );
4083     aSign = extractFloat64Sign( a );
4084     bSig = extractFloat64Frac( b );
4085     bExp = extractFloat64Exp( b );
4086     bSign = extractFloat64Sign( b );
4087     zSign = aSign ^ bSign;
4088     if ( aExp == 0x7FF ) {
4089         if (aSig) {
4090             return propagateFloat64NaN(a, b, status);
4091         }
4092         if ( bExp == 0x7FF ) {
4093             if (bSig) {
4094                 return propagateFloat64NaN(a, b, status);
4095             }
4096             float_raise(float_flag_invalid, status);
4097             return float64_default_nan(status);
4098         }
4099         return packFloat64( zSign, 0x7FF, 0 );
4100     }
4101     if ( bExp == 0x7FF ) {
4102         if (bSig) {
4103             return propagateFloat64NaN(a, b, status);
4104         }
4105         return packFloat64( zSign, 0, 0 );
4106     }
4107     if ( bExp == 0 ) {
4108         if ( bSig == 0 ) {
4109             if ( ( aExp | aSig ) == 0 ) {
4110                 float_raise(float_flag_invalid, status);
4111                 return float64_default_nan(status);
4112             }
4113             float_raise(float_flag_divbyzero, status);
4114             return packFloat64( zSign, 0x7FF, 0 );
4115         }
4116         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4117     }
4118     if ( aExp == 0 ) {
4119         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4120         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4121     }
4122     zExp = aExp - bExp + 0x3FD;
4123     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4124     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4125     if ( bSig <= ( aSig + aSig ) ) {
4126         aSig >>= 1;
4127         ++zExp;
4128     }
4129     zSig = estimateDiv128To64( aSig, 0, bSig );
4130     if ( ( zSig & 0x1FF ) <= 2 ) {
4131         mul64To128( bSig, zSig, &term0, &term1 );
4132         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4133         while ( (int64_t) rem0 < 0 ) {
4134             --zSig;
4135             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4136         }
4137         zSig |= ( rem1 != 0 );
4138     }
4139     return roundAndPackFloat64(zSign, zExp, zSig, status);
4140
4141 }
4142
4143 /*----------------------------------------------------------------------------
4144 | Returns the remainder of the double-precision floating-point value `a'
4145 | with respect to the corresponding value `b'.  The operation is performed
4146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4147 *----------------------------------------------------------------------------*/
4148
4149 float64 float64_rem(float64 a, float64 b, float_status *status)
4150 {
4151     flag aSign, zSign;
4152     int aExp, bExp, expDiff;
4153     uint64_t aSig, bSig;
4154     uint64_t q, alternateASig;
4155     int64_t sigMean;
4156
4157     a = float64_squash_input_denormal(a, status);
4158     b = float64_squash_input_denormal(b, status);
4159     aSig = extractFloat64Frac( a );
4160     aExp = extractFloat64Exp( a );
4161     aSign = extractFloat64Sign( a );
4162     bSig = extractFloat64Frac( b );
4163     bExp = extractFloat64Exp( b );
4164     if ( aExp == 0x7FF ) {
4165         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4166             return propagateFloat64NaN(a, b, status);
4167         }
4168         float_raise(float_flag_invalid, status);
4169         return float64_default_nan(status);
4170     }
4171     if ( bExp == 0x7FF ) {
4172         if (bSig) {
4173             return propagateFloat64NaN(a, b, status);
4174         }
4175         return a;
4176     }
4177     if ( bExp == 0 ) {
4178         if ( bSig == 0 ) {
4179             float_raise(float_flag_invalid, status);
4180             return float64_default_nan(status);
4181         }
4182         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4183     }
4184     if ( aExp == 0 ) {
4185         if ( aSig == 0 ) return a;
4186         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4187     }
4188     expDiff = aExp - bExp;
4189     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4190     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4191     if ( expDiff < 0 ) {
4192         if ( expDiff < -1 ) return a;
4193         aSig >>= 1;
4194     }
4195     q = ( bSig <= aSig );
4196     if ( q ) aSig -= bSig;
4197     expDiff -= 64;
4198     while ( 0 < expDiff ) {
4199         q = estimateDiv128To64( aSig, 0, bSig );
4200         q = ( 2 < q ) ? q - 2 : 0;
4201         aSig = - ( ( bSig>>2 ) * q );
4202         expDiff -= 62;
4203     }
4204     expDiff += 64;
4205     if ( 0 < expDiff ) {
4206         q = estimateDiv128To64( aSig, 0, bSig );
4207         q = ( 2 < q ) ? q - 2 : 0;
4208         q >>= 64 - expDiff;
4209         bSig >>= 2;
4210         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4211     }
4212     else {
4213         aSig >>= 2;
4214         bSig >>= 2;
4215     }
4216     do {
4217         alternateASig = aSig;
4218         ++q;
4219         aSig -= bSig;
4220     } while ( 0 <= (int64_t) aSig );
4221     sigMean = aSig + alternateASig;
4222     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4223         aSig = alternateASig;
4224     }
4225     zSign = ( (int64_t) aSig < 0 );
4226     if ( zSign ) aSig = - aSig;
4227     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4228
4229 }
4230
4231 /*----------------------------------------------------------------------------
4232 | Returns the result of multiplying the double-precision floating-point values
4233 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4234 | multiplication.  The operation is performed according to the IEC/IEEE
4235 | Standard for Binary Floating-Point Arithmetic 754-2008.
4236 | The flags argument allows the caller to select negation of the
4237 | addend, the intermediate product, or the final result. (The difference
4238 | between this and having the caller do a separate negation is that negating
4239 | externally will flip the sign bit on NaNs.)
4240 *----------------------------------------------------------------------------*/
4241
4242 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4243                        float_status *status)
4244 {
4245     flag aSign, bSign, cSign, zSign;
4246     int aExp, bExp, cExp, pExp, zExp, expDiff;
4247     uint64_t aSig, bSig, cSig;
4248     flag pInf, pZero, pSign;
4249     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4250     int shiftcount;
4251     flag signflip, infzero;
4252
4253     a = float64_squash_input_denormal(a, status);
4254     b = float64_squash_input_denormal(b, status);
4255     c = float64_squash_input_denormal(c, status);
4256     aSig = extractFloat64Frac(a);
4257     aExp = extractFloat64Exp(a);
4258     aSign = extractFloat64Sign(a);
4259     bSig = extractFloat64Frac(b);
4260     bExp = extractFloat64Exp(b);
4261     bSign = extractFloat64Sign(b);
4262     cSig = extractFloat64Frac(c);
4263     cExp = extractFloat64Exp(c);
4264     cSign = extractFloat64Sign(c);
4265
4266     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4267                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4268
4269     /* It is implementation-defined whether the cases of (0,inf,qnan)
4270      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4271      * they return if they do), so we have to hand this information
4272      * off to the target-specific pick-a-NaN routine.
4273      */
4274     if (((aExp == 0x7ff) && aSig) ||
4275         ((bExp == 0x7ff) && bSig) ||
4276         ((cExp == 0x7ff) && cSig)) {
4277         return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4278     }
4279
4280     if (infzero) {
4281         float_raise(float_flag_invalid, status);
4282         return float64_default_nan(status);
4283     }
4284
4285     if (flags & float_muladd_negate_c) {
4286         cSign ^= 1;
4287     }
4288
4289     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4290
4291     /* Work out the sign and type of the product */
4292     pSign = aSign ^ bSign;
4293     if (flags & float_muladd_negate_product) {
4294         pSign ^= 1;
4295     }
4296     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4297     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4298
4299     if (cExp == 0x7ff) {
4300         if (pInf && (pSign ^ cSign)) {
4301             /* addition of opposite-signed infinities => InvalidOperation */
4302             float_raise(float_flag_invalid, status);
4303             return float64_default_nan(status);
4304         }
4305         /* Otherwise generate an infinity of the same sign */
4306         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4307     }
4308
4309     if (pInf) {
4310         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4311     }
4312
4313     if (pZero) {
4314         if (cExp == 0) {
4315             if (cSig == 0) {
4316                 /* Adding two exact zeroes */
4317                 if (pSign == cSign) {
4318                     zSign = pSign;
4319                 } else if (status->float_rounding_mode == float_round_down) {
4320                     zSign = 1;
4321                 } else {
4322                     zSign = 0;
4323                 }
4324                 return packFloat64(zSign ^ signflip, 0, 0);
4325             }
4326             /* Exact zero plus a denorm */
4327             if (status->flush_to_zero) {
4328                 float_raise(float_flag_output_denormal, status);
4329                 return packFloat64(cSign ^ signflip, 0, 0);
4330             }
4331         }
4332         /* Zero plus something non-zero : just return the something */
4333         if (flags & float_muladd_halve_result) {
4334             if (cExp == 0) {
4335                 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4336             }
4337             /* Subtract one to halve, and one again because roundAndPackFloat64
4338              * wants one less than the true exponent.
4339              */
4340             cExp -= 2;
4341             cSig = (cSig | 0x0010000000000000ULL) << 10;
4342             return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4343         }
4344         return packFloat64(cSign ^ signflip, cExp, cSig);
4345     }
4346
4347     if (aExp == 0) {
4348         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4349     }
4350     if (bExp == 0) {
4351         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4352     }
4353
4354     /* Calculate the actual result a * b + c */
4355
4356     /* Multiply first; this is easy. */
4357     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4358      * because we want the true exponent, not the "one-less-than"
4359      * flavour that roundAndPackFloat64() takes.
4360      */
4361     pExp = aExp + bExp - 0x3fe;
4362     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4363     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4364     mul64To128(aSig, bSig, &pSig0, &pSig1);
4365     if ((int64_t)(pSig0 << 1) >= 0) {
4366         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4367         pExp--;
4368     }
4369
4370     zSign = pSign ^ signflip;
4371
4372     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4373      * bit in position 126.
4374      */
4375     if (cExp == 0) {
4376         if (!cSig) {
4377             /* Throw out the special case of c being an exact zero now */
4378             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4379             if (flags & float_muladd_halve_result) {
4380                 pExp--;
4381             }
4382             return roundAndPackFloat64(zSign, pExp - 1,
4383                                        pSig1, status);
4384         }
4385         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4386     }
4387
4388     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4389      * significand of the addend, with the explicit bit in position 126.
4390      */
4391     cSig0 = cSig << (126 - 64 - 52);
4392     cSig1 = 0;
4393     cSig0 |= LIT64(0x4000000000000000);
4394     expDiff = pExp - cExp;
4395
4396     if (pSign == cSign) {
4397         /* Addition */
4398         if (expDiff > 0) {
4399             /* scale c to match p */
4400             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4401             zExp = pExp;
4402         } else if (expDiff < 0) {
4403             /* scale p to match c */
4404             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4405             zExp = cExp;
4406         } else {
4407             /* no scaling needed */
4408             zExp = cExp;
4409         }
4410         /* Add significands and make sure explicit bit ends up in posn 126 */
4411         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4412         if ((int64_t)zSig0 < 0) {
4413             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4414         } else {
4415             zExp--;
4416         }
4417         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4418         if (flags & float_muladd_halve_result) {
4419             zExp--;
4420         }
4421         return roundAndPackFloat64(zSign, zExp, zSig1, status);
4422     } else {
4423         /* Subtraction */
4424         if (expDiff > 0) {
4425             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4426             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4427             zExp = pExp;
4428         } else if (expDiff < 0) {
4429             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4430             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4431             zExp = cExp;
4432             zSign ^= 1;
4433         } else {
4434             zExp = pExp;
4435             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4436                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4437             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4438                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4439                 zSign ^= 1;
4440             } else {
4441                 /* Exact zero */
4442                 zSign = signflip;
4443                 if (status->float_rounding_mode == float_round_down) {
4444                     zSign ^= 1;
4445                 }
4446                 return packFloat64(zSign, 0, 0);
4447             }
4448         }
4449         --zExp;
4450         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4451          * starting with the significand in a pair of uint64_t.
4452          */
4453         if (zSig0) {
4454             shiftcount = countLeadingZeros64(zSig0) - 1;
4455             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4456             if (zSig1) {
4457                 zSig0 |= 1;
4458             }
4459             zExp -= shiftcount;
4460         } else {
4461             shiftcount = countLeadingZeros64(zSig1);
4462             if (shiftcount == 0) {
4463                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4464                 zExp -= 63;
4465             } else {
4466                 shiftcount--;
4467                 zSig0 = zSig1 << shiftcount;
4468                 zExp -= (shiftcount + 64);
4469             }
4470         }
4471         if (flags & float_muladd_halve_result) {
4472             zExp--;
4473         }
4474         return roundAndPackFloat64(zSign, zExp, zSig0, status);
4475     }
4476 }
4477
4478 /*----------------------------------------------------------------------------
4479 | Returns the square root of the double-precision floating-point value `a'.
4480 | The operation is performed according to the IEC/IEEE Standard for Binary
4481 | Floating-Point Arithmetic.
4482 *----------------------------------------------------------------------------*/
4483
4484 float64 float64_sqrt(float64 a, float_status *status)
4485 {
4486     flag aSign;
4487     int aExp, zExp;
4488     uint64_t aSig, zSig, doubleZSig;
4489     uint64_t rem0, rem1, term0, term1;
4490     a = float64_squash_input_denormal(a, status);
4491
4492     aSig = extractFloat64Frac( a );
4493     aExp = extractFloat64Exp( a );
4494     aSign = extractFloat64Sign( a );
4495     if ( aExp == 0x7FF ) {
4496         if (aSig) {
4497             return propagateFloat64NaN(a, a, status);
4498         }
4499         if ( ! aSign ) return a;
4500         float_raise(float_flag_invalid, status);
4501         return float64_default_nan(status);
4502     }
4503     if ( aSign ) {
4504         if ( ( aExp | aSig ) == 0 ) return a;
4505         float_raise(float_flag_invalid, status);
4506         return float64_default_nan(status);
4507     }
4508     if ( aExp == 0 ) {
4509         if ( aSig == 0 ) return float64_zero;
4510         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4511     }
4512     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4513     aSig |= LIT64( 0x0010000000000000 );
4514     zSig = estimateSqrt32( aExp, aSig>>21 );
4515     aSig <<= 9 - ( aExp & 1 );
4516     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4517     if ( ( zSig & 0x1FF ) <= 5 ) {
4518         doubleZSig = zSig<<1;
4519         mul64To128( zSig, zSig, &term0, &term1 );
4520         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4521         while ( (int64_t) rem0 < 0 ) {
4522             --zSig;
4523             doubleZSig -= 2;
4524             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4525         }
4526         zSig |= ( ( rem0 | rem1 ) != 0 );
4527     }
4528     return roundAndPackFloat64(0, zExp, zSig, status);
4529
4530 }
4531
4532 /*----------------------------------------------------------------------------
4533 | Returns the binary log of the double-precision floating-point value `a'.
4534 | The operation is performed according to the IEC/IEEE Standard for Binary
4535 | Floating-Point Arithmetic.
4536 *----------------------------------------------------------------------------*/
4537 float64 float64_log2(float64 a, float_status *status)
4538 {
4539     flag aSign, zSign;
4540     int aExp;
4541     uint64_t aSig, aSig0, aSig1, zSig, i;
4542     a = float64_squash_input_denormal(a, status);
4543
4544     aSig = extractFloat64Frac( a );
4545     aExp = extractFloat64Exp( a );
4546     aSign = extractFloat64Sign( a );
4547
4548     if ( aExp == 0 ) {
4549         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4550         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4551     }
4552     if ( aSign ) {
4553         float_raise(float_flag_invalid, status);
4554         return float64_default_nan(status);
4555     }
4556     if ( aExp == 0x7FF ) {
4557         if (aSig) {
4558             return propagateFloat64NaN(a, float64_zero, status);
4559         }
4560         return a;
4561     }
4562
4563     aExp -= 0x3FF;
4564     aSig |= LIT64( 0x0010000000000000 );
4565     zSign = aExp < 0;
4566     zSig = (uint64_t)aExp << 52;
4567     for (i = 1LL << 51; i > 0; i >>= 1) {
4568         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4569         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4570         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4571             aSig >>= 1;
4572             zSig |= i;
4573         }
4574     }
4575
4576     if ( zSign )
4577         zSig = -zSig;
4578     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4579 }
4580
4581 /*----------------------------------------------------------------------------
4582 | Returns 1 if the double-precision floating-point value `a' is equal to the
4583 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4584 | if either operand is a NaN.  Otherwise, the comparison is performed
4585 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4586 *----------------------------------------------------------------------------*/
4587
4588 int float64_eq(float64 a, float64 b, float_status *status)
4589 {
4590     uint64_t av, bv;
4591     a = float64_squash_input_denormal(a, status);
4592     b = float64_squash_input_denormal(b, status);
4593
4594     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4595          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4596        ) {
4597         float_raise(float_flag_invalid, status);
4598         return 0;
4599     }
4600     av = float64_val(a);
4601     bv = float64_val(b);
4602     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4603
4604 }
4605
4606 /*----------------------------------------------------------------------------
4607 | Returns 1 if the double-precision floating-point value `a' is less than or
4608 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4609 | exception is raised if either operand is a NaN.  The comparison is performed
4610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4611 *----------------------------------------------------------------------------*/
4612
4613 int float64_le(float64 a, float64 b, float_status *status)
4614 {
4615     flag aSign, bSign;
4616     uint64_t av, bv;
4617     a = float64_squash_input_denormal(a, status);
4618     b = float64_squash_input_denormal(b, status);
4619
4620     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4621          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4622        ) {
4623         float_raise(float_flag_invalid, status);
4624         return 0;
4625     }
4626     aSign = extractFloat64Sign( a );
4627     bSign = extractFloat64Sign( b );
4628     av = float64_val(a);
4629     bv = float64_val(b);
4630     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4631     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4632
4633 }
4634
4635 /*----------------------------------------------------------------------------
4636 | Returns 1 if the double-precision floating-point value `a' is less than
4637 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4638 | raised if either operand is a NaN.  The comparison is performed according
4639 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4640 *----------------------------------------------------------------------------*/
4641
4642 int float64_lt(float64 a, float64 b, float_status *status)
4643 {
4644     flag aSign, bSign;
4645     uint64_t av, bv;
4646
4647     a = float64_squash_input_denormal(a, status);
4648     b = float64_squash_input_denormal(b, status);
4649     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4650          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4651        ) {
4652         float_raise(float_flag_invalid, status);
4653         return 0;
4654     }
4655     aSign = extractFloat64Sign( a );
4656     bSign = extractFloat64Sign( b );
4657     av = float64_val(a);
4658     bv = float64_val(b);
4659     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4660     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4661
4662 }
4663
4664 /*----------------------------------------------------------------------------
4665 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4666 | be compared, and 0 otherwise.  The invalid exception is raised if either
4667 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4668 | Standard for Binary Floating-Point Arithmetic.
4669 *----------------------------------------------------------------------------*/
4670
4671 int float64_unordered(float64 a, float64 b, float_status *status)
4672 {
4673     a = float64_squash_input_denormal(a, status);
4674     b = float64_squash_input_denormal(b, status);
4675
4676     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4677          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4678        ) {
4679         float_raise(float_flag_invalid, status);
4680         return 1;
4681     }
4682     return 0;
4683 }
4684
4685 /*----------------------------------------------------------------------------
4686 | Returns 1 if the double-precision floating-point value `a' is equal to the
4687 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4688 | exception.The comparison is performed according to the IEC/IEEE Standard
4689 | for Binary Floating-Point Arithmetic.
4690 *----------------------------------------------------------------------------*/
4691
4692 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4693 {
4694     uint64_t av, bv;
4695     a = float64_squash_input_denormal(a, status);
4696     b = float64_squash_input_denormal(b, status);
4697
4698     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4699          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4700        ) {
4701         if (float64_is_signaling_nan(a, status)
4702          || float64_is_signaling_nan(b, status)) {
4703             float_raise(float_flag_invalid, status);
4704         }
4705         return 0;
4706     }
4707     av = float64_val(a);
4708     bv = float64_val(b);
4709     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4710
4711 }
4712
4713 /*----------------------------------------------------------------------------
4714 | Returns 1 if the double-precision floating-point value `a' is less than or
4715 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4716 | cause an exception.  Otherwise, the comparison is performed according to the
4717 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4718 *----------------------------------------------------------------------------*/
4719
4720 int float64_le_quiet(float64 a, float64 b, float_status *status)
4721 {
4722     flag aSign, bSign;
4723     uint64_t av, bv;
4724     a = float64_squash_input_denormal(a, status);
4725     b = float64_squash_input_denormal(b, status);
4726
4727     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4728          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4729        ) {
4730         if (float64_is_signaling_nan(a, status)
4731          || float64_is_signaling_nan(b, status)) {
4732             float_raise(float_flag_invalid, status);
4733         }
4734         return 0;
4735     }
4736     aSign = extractFloat64Sign( a );
4737     bSign = extractFloat64Sign( b );
4738     av = float64_val(a);
4739     bv = float64_val(b);
4740     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4741     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4742
4743 }
4744
4745 /*----------------------------------------------------------------------------
4746 | Returns 1 if the double-precision floating-point value `a' is less than
4747 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4748 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4749 | Standard for Binary Floating-Point Arithmetic.
4750 *----------------------------------------------------------------------------*/
4751
4752 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4753 {
4754     flag aSign, bSign;
4755     uint64_t av, bv;
4756     a = float64_squash_input_denormal(a, status);
4757     b = float64_squash_input_denormal(b, status);
4758
4759     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4760          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4761        ) {
4762         if (float64_is_signaling_nan(a, status)
4763          || float64_is_signaling_nan(b, status)) {
4764             float_raise(float_flag_invalid, status);
4765         }
4766         return 0;
4767     }
4768     aSign = extractFloat64Sign( a );
4769     bSign = extractFloat64Sign( b );
4770     av = float64_val(a);
4771     bv = float64_val(b);
4772     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4773     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4774
4775 }
4776
4777 /*----------------------------------------------------------------------------
4778 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4779 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4780 | comparison is performed according to the IEC/IEEE Standard for Binary
4781 | Floating-Point Arithmetic.
4782 *----------------------------------------------------------------------------*/
4783
4784 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4785 {
4786     a = float64_squash_input_denormal(a, status);
4787     b = float64_squash_input_denormal(b, status);
4788
4789     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4790          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4791        ) {
4792         if (float64_is_signaling_nan(a, status)
4793          || float64_is_signaling_nan(b, status)) {
4794             float_raise(float_flag_invalid, status);
4795         }
4796         return 1;
4797     }
4798     return 0;
4799 }
4800
4801 /*----------------------------------------------------------------------------
4802 | Returns the result of converting the extended double-precision floating-
4803 | point value `a' to the 32-bit two's complement integer format.  The
4804 | conversion is performed according to the IEC/IEEE Standard for Binary
4805 | Floating-Point Arithmetic---which means in particular that the conversion
4806 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4807 | largest positive integer is returned.  Otherwise, if the conversion
4808 | overflows, the largest integer with the same sign as `a' is returned.
4809 *----------------------------------------------------------------------------*/
4810
4811 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4812 {
4813     flag aSign;
4814     int32_t aExp, shiftCount;
4815     uint64_t aSig;
4816
4817     aSig = extractFloatx80Frac( a );
4818     aExp = extractFloatx80Exp( a );
4819     aSign = extractFloatx80Sign( a );
4820     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4821     shiftCount = 0x4037 - aExp;
4822     if ( shiftCount <= 0 ) shiftCount = 1;
4823     shift64RightJamming( aSig, shiftCount, &aSig );
4824     return roundAndPackInt32(aSign, aSig, status);
4825
4826 }
4827
4828 /*----------------------------------------------------------------------------
4829 | Returns the result of converting the extended double-precision floating-
4830 | point value `a' to the 32-bit two's complement integer format.  The
4831 | conversion is performed according to the IEC/IEEE Standard for Binary
4832 | Floating-Point Arithmetic, except that the conversion is always rounded
4833 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4834 | Otherwise, if the conversion overflows, the largest integer with the same
4835 | sign as `a' is returned.
4836 *----------------------------------------------------------------------------*/
4837
4838 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4839 {
4840     flag aSign;
4841     int32_t aExp, shiftCount;
4842     uint64_t aSig, savedASig;
4843     int32_t z;
4844
4845     aSig = extractFloatx80Frac( a );
4846     aExp = extractFloatx80Exp( a );
4847     aSign = extractFloatx80Sign( a );
4848     if ( 0x401E < aExp ) {
4849         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4850         goto invalid;
4851     }
4852     else if ( aExp < 0x3FFF ) {
4853         if (aExp || aSig) {
4854             status->float_exception_flags |= float_flag_inexact;
4855         }
4856         return 0;
4857     }
4858     shiftCount = 0x403E - aExp;
4859     savedASig = aSig;
4860     aSig >>= shiftCount;
4861     z = aSig;
4862     if ( aSign ) z = - z;
4863     if ( ( z < 0 ) ^ aSign ) {
4864  invalid:
4865         float_raise(float_flag_invalid, status);
4866         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4867     }
4868     if ( ( aSig<<shiftCount ) != savedASig ) {
4869         status->float_exception_flags |= float_flag_inexact;
4870     }
4871     return z;
4872
4873 }
4874
4875 /*----------------------------------------------------------------------------
4876 | Returns the result of converting the extended double-precision floating-
4877 | point value `a' to the 64-bit two's complement integer format.  The
4878 | conversion is performed according to the IEC/IEEE Standard for Binary
4879 | Floating-Point Arithmetic---which means in particular that the conversion
4880 | is rounded according to the current rounding mode.  If `a' is a NaN,
4881 | the largest positive integer is returned.  Otherwise, if the conversion
4882 | overflows, the largest integer with the same sign as `a' is returned.
4883 *----------------------------------------------------------------------------*/
4884
4885 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4886 {
4887     flag aSign;
4888     int32_t aExp, shiftCount;
4889     uint64_t aSig, aSigExtra;
4890
4891     aSig = extractFloatx80Frac( a );
4892     aExp = extractFloatx80Exp( a );
4893     aSign = extractFloatx80Sign( a );
4894     shiftCount = 0x403E - aExp;
4895     if ( shiftCount <= 0 ) {
4896         if ( shiftCount ) {
4897             float_raise(float_flag_invalid, status);
4898             if (    ! aSign
4899                  || (    ( aExp == 0x7FFF )
4900                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4901                ) {
4902                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4903             }
4904             return (int64_t) LIT64( 0x8000000000000000 );
4905         }
4906         aSigExtra = 0;
4907     }
4908     else {
4909         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4910     }
4911     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4912
4913 }
4914
4915 /*----------------------------------------------------------------------------
4916 | Returns the result of converting the extended double-precision floating-
4917 | point value `a' to the 64-bit two's complement integer format.  The
4918 | conversion is performed according to the IEC/IEEE Standard for Binary
4919 | Floating-Point Arithmetic, except that the conversion is always rounded
4920 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4921 | Otherwise, if the conversion overflows, the largest integer with the same
4922 | sign as `a' is returned.
4923 *----------------------------------------------------------------------------*/
4924
4925 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4926 {
4927     flag aSign;
4928     int32_t aExp, shiftCount;
4929     uint64_t aSig;
4930     int64_t z;
4931
4932     aSig = extractFloatx80Frac( a );
4933     aExp = extractFloatx80Exp( a );
4934     aSign = extractFloatx80Sign( a );
4935     shiftCount = aExp - 0x403E;
4936     if ( 0 <= shiftCount ) {
4937         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4938         if ( ( a.high != 0xC03E ) || aSig ) {
4939             float_raise(float_flag_invalid, status);
4940             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4941                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4942             }
4943         }
4944         return (int64_t) LIT64( 0x8000000000000000 );
4945     }
4946     else if ( aExp < 0x3FFF ) {
4947         if (aExp | aSig) {
4948             status->float_exception_flags |= float_flag_inexact;
4949         }
4950         return 0;
4951     }
4952     z = aSig>>( - shiftCount );
4953     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4954         status->float_exception_flags |= float_flag_inexact;
4955     }
4956     if ( aSign ) z = - z;
4957     return z;
4958
4959 }
4960
4961 /*----------------------------------------------------------------------------
4962 | Returns the result of converting the extended double-precision floating-
4963 | point value `a' to the single-precision floating-point format.  The
4964 | conversion is performed according to the IEC/IEEE Standard for Binary
4965 | Floating-Point Arithmetic.
4966 *----------------------------------------------------------------------------*/
4967
4968 float32 floatx80_to_float32(floatx80 a, float_status *status)
4969 {
4970     flag aSign;
4971     int32_t aExp;
4972     uint64_t aSig;
4973
4974     aSig = extractFloatx80Frac( a );
4975     aExp = extractFloatx80Exp( a );
4976     aSign = extractFloatx80Sign( a );
4977     if ( aExp == 0x7FFF ) {
4978         if ( (uint64_t) ( aSig<<1 ) ) {
4979             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4980         }
4981         return packFloat32( aSign, 0xFF, 0 );
4982     }
4983     shift64RightJamming( aSig, 33, &aSig );
4984     if ( aExp || aSig ) aExp -= 0x3F81;
4985     return roundAndPackFloat32(aSign, aExp, aSig, status);
4986
4987 }
4988
4989 /*----------------------------------------------------------------------------
4990 | Returns the result of converting the extended double-precision floating-
4991 | point value `a' to the double-precision floating-point format.  The
4992 | conversion is performed according to the IEC/IEEE Standard for Binary
4993 | Floating-Point Arithmetic.
4994 *----------------------------------------------------------------------------*/
4995
4996 float64 floatx80_to_float64(floatx80 a, float_status *status)
4997 {
4998     flag aSign;
4999     int32_t aExp;
5000     uint64_t aSig, zSig;
5001
5002     aSig = extractFloatx80Frac( a );
5003     aExp = extractFloatx80Exp( a );
5004     aSign = extractFloatx80Sign( a );
5005     if ( aExp == 0x7FFF ) {
5006         if ( (uint64_t) ( aSig<<1 ) ) {
5007             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5008         }
5009         return packFloat64( aSign, 0x7FF, 0 );
5010     }
5011     shift64RightJamming( aSig, 1, &zSig );
5012     if ( aExp || aSig ) aExp -= 0x3C01;
5013     return roundAndPackFloat64(aSign, aExp, zSig, status);
5014
5015 }
5016
5017 /*----------------------------------------------------------------------------
5018 | Returns the result of converting the extended double-precision floating-
5019 | point value `a' to the quadruple-precision floating-point format.  The
5020 | conversion is performed according to the IEC/IEEE Standard for Binary
5021 | Floating-Point Arithmetic.
5022 *----------------------------------------------------------------------------*/
5023
5024 float128 floatx80_to_float128(floatx80 a, float_status *status)
5025 {
5026     flag aSign;
5027     int aExp;
5028     uint64_t aSig, zSig0, zSig1;
5029
5030     aSig = extractFloatx80Frac( a );
5031     aExp = extractFloatx80Exp( a );
5032     aSign = extractFloatx80Sign( a );
5033     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5034         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5035     }
5036     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5037     return packFloat128( aSign, aExp, zSig0, zSig1 );
5038
5039 }
5040
5041 /*----------------------------------------------------------------------------
5042 | Rounds the extended double-precision floating-point value `a' to an integer,
5043 | and returns the result as an extended quadruple-precision floating-point
5044 | value.  The operation is performed according to the IEC/IEEE Standard for
5045 | Binary Floating-Point Arithmetic.
5046 *----------------------------------------------------------------------------*/
5047
5048 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5049 {
5050     flag aSign;
5051     int32_t aExp;
5052     uint64_t lastBitMask, roundBitsMask;
5053     floatx80 z;
5054
5055     aExp = extractFloatx80Exp( a );
5056     if ( 0x403E <= aExp ) {
5057         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5058             return propagateFloatx80NaN(a, a, status);
5059         }
5060         return a;
5061     }
5062     if ( aExp < 0x3FFF ) {
5063         if (    ( aExp == 0 )
5064              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5065             return a;
5066         }
5067         status->float_exception_flags |= float_flag_inexact;
5068         aSign = extractFloatx80Sign( a );
5069         switch (status->float_rounding_mode) {
5070          case float_round_nearest_even:
5071             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5072                ) {
5073                 return
5074                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5075             }
5076             break;
5077         case float_round_ties_away:
5078             if (aExp == 0x3FFE) {
5079                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5080             }
5081             break;
5082          case float_round_down:
5083             return
5084                   aSign ?
5085                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5086                 : packFloatx80( 0, 0, 0 );
5087          case float_round_up:
5088             return
5089                   aSign ? packFloatx80( 1, 0, 0 )
5090                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5091         }
5092         return packFloatx80( aSign, 0, 0 );
5093     }
5094     lastBitMask = 1;
5095     lastBitMask <<= 0x403E - aExp;
5096     roundBitsMask = lastBitMask - 1;
5097     z = a;
5098     switch (status->float_rounding_mode) {
5099     case float_round_nearest_even:
5100         z.low += lastBitMask>>1;
5101         if ((z.low & roundBitsMask) == 0) {
5102             z.low &= ~lastBitMask;
5103         }
5104         break;
5105     case float_round_ties_away:
5106         z.low += lastBitMask >> 1;
5107         break;
5108     case float_round_to_zero:
5109         break;
5110     case float_round_up:
5111         if (!extractFloatx80Sign(z)) {
5112             z.low += roundBitsMask;
5113         }
5114         break;
5115     case float_round_down:
5116         if (extractFloatx80Sign(z)) {
5117             z.low += roundBitsMask;
5118         }
5119         break;
5120     default:
5121         abort();
5122     }
5123     z.low &= ~ roundBitsMask;
5124     if ( z.low == 0 ) {
5125         ++z.high;
5126         z.low = LIT64( 0x8000000000000000 );
5127     }
5128     if (z.low != a.low) {
5129         status->float_exception_flags |= float_flag_inexact;
5130     }
5131     return z;
5132
5133 }
5134
5135 /*----------------------------------------------------------------------------
5136 | Returns the result of adding the absolute values of the extended double-
5137 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5138 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5139 | The addition is performed according to the IEC/IEEE Standard for Binary
5140 | Floating-Point Arithmetic.
5141 *----------------------------------------------------------------------------*/
5142
5143 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5144                                 float_status *status)
5145 {
5146     int32_t aExp, bExp, zExp;
5147     uint64_t aSig, bSig, zSig0, zSig1;
5148     int32_t expDiff;
5149
5150     aSig = extractFloatx80Frac( a );
5151     aExp = extractFloatx80Exp( a );
5152     bSig = extractFloatx80Frac( b );
5153     bExp = extractFloatx80Exp( b );
5154     expDiff = aExp - bExp;
5155     if ( 0 < expDiff ) {
5156         if ( aExp == 0x7FFF ) {
5157             if ((uint64_t)(aSig << 1)) {
5158                 return propagateFloatx80NaN(a, b, status);
5159             }
5160             return a;
5161         }
5162         if ( bExp == 0 ) --expDiff;
5163         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5164         zExp = aExp;
5165     }
5166     else if ( expDiff < 0 ) {
5167         if ( bExp == 0x7FFF ) {
5168             if ((uint64_t)(bSig << 1)) {
5169                 return propagateFloatx80NaN(a, b, status);
5170             }
5171             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5172         }
5173         if ( aExp == 0 ) ++expDiff;
5174         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5175         zExp = bExp;
5176     }
5177     else {
5178         if ( aExp == 0x7FFF ) {
5179             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5180                 return propagateFloatx80NaN(a, b, status);
5181             }
5182             return a;
5183         }
5184         zSig1 = 0;
5185         zSig0 = aSig + bSig;
5186         if ( aExp == 0 ) {
5187             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5188             goto roundAndPack;
5189         }
5190         zExp = aExp;
5191         goto shiftRight1;
5192     }
5193     zSig0 = aSig + bSig;
5194     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5195  shiftRight1:
5196     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5197     zSig0 |= LIT64( 0x8000000000000000 );
5198     ++zExp;
5199  roundAndPack:
5200     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5201                                 zSign, zExp, zSig0, zSig1, status);
5202 }
5203
5204 /*----------------------------------------------------------------------------
5205 | Returns the result of subtracting the absolute values of the extended
5206 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5207 | difference is negated before being returned.  `zSign' is ignored if the
5208 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5209 | Standard for Binary Floating-Point Arithmetic.
5210 *----------------------------------------------------------------------------*/
5211
5212 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5213                                 float_status *status)
5214 {
5215     int32_t aExp, bExp, zExp;
5216     uint64_t aSig, bSig, zSig0, zSig1;
5217     int32_t expDiff;
5218
5219     aSig = extractFloatx80Frac( a );
5220     aExp = extractFloatx80Exp( a );
5221     bSig = extractFloatx80Frac( b );
5222     bExp = extractFloatx80Exp( b );
5223     expDiff = aExp - bExp;
5224     if ( 0 < expDiff ) goto aExpBigger;
5225     if ( expDiff < 0 ) goto bExpBigger;
5226     if ( aExp == 0x7FFF ) {
5227         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5228             return propagateFloatx80NaN(a, b, status);
5229         }
5230         float_raise(float_flag_invalid, status);
5231         return floatx80_default_nan(status);
5232     }
5233     if ( aExp == 0 ) {
5234         aExp = 1;
5235         bExp = 1;
5236     }
5237     zSig1 = 0;
5238     if ( bSig < aSig ) goto aBigger;
5239     if ( aSig < bSig ) goto bBigger;
5240     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5241  bExpBigger:
5242     if ( bExp == 0x7FFF ) {
5243         if ((uint64_t)(bSig << 1)) {
5244             return propagateFloatx80NaN(a, b, status);
5245         }
5246         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5247     }
5248     if ( aExp == 0 ) ++expDiff;
5249     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5250  bBigger:
5251     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5252     zExp = bExp;
5253     zSign ^= 1;
5254     goto normalizeRoundAndPack;
5255  aExpBigger:
5256     if ( aExp == 0x7FFF ) {
5257         if ((uint64_t)(aSig << 1)) {
5258             return propagateFloatx80NaN(a, b, status);
5259         }
5260         return a;
5261     }
5262     if ( bExp == 0 ) --expDiff;
5263     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5264  aBigger:
5265     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5266     zExp = aExp;
5267  normalizeRoundAndPack:
5268     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5269                                          zSign, zExp, zSig0, zSig1, status);
5270 }
5271
5272 /*----------------------------------------------------------------------------
5273 | Returns the result of adding the extended double-precision floating-point
5274 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5275 | Standard for Binary Floating-Point Arithmetic.
5276 *----------------------------------------------------------------------------*/
5277
5278 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5279 {
5280     flag aSign, bSign;
5281
5282     aSign = extractFloatx80Sign( a );
5283     bSign = extractFloatx80Sign( b );
5284     if ( aSign == bSign ) {
5285         return addFloatx80Sigs(a, b, aSign, status);
5286     }
5287     else {
5288         return subFloatx80Sigs(a, b, aSign, status);
5289     }
5290
5291 }
5292
5293 /*----------------------------------------------------------------------------
5294 | Returns the result of subtracting the extended double-precision floating-
5295 | point values `a' and `b'.  The operation is performed according to the
5296 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5297 *----------------------------------------------------------------------------*/
5298
5299 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5300 {
5301     flag aSign, bSign;
5302
5303     aSign = extractFloatx80Sign( a );
5304     bSign = extractFloatx80Sign( b );
5305     if ( aSign == bSign ) {
5306         return subFloatx80Sigs(a, b, aSign, status);
5307     }
5308     else {
5309         return addFloatx80Sigs(a, b, aSign, status);
5310     }
5311
5312 }
5313
5314 /*----------------------------------------------------------------------------
5315 | Returns the result of multiplying the extended double-precision floating-
5316 | point values `a' and `b'.  The operation is performed according to the
5317 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5318 *----------------------------------------------------------------------------*/
5319
5320 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5321 {
5322     flag aSign, bSign, zSign;
5323     int32_t aExp, bExp, zExp;
5324     uint64_t aSig, bSig, zSig0, zSig1;
5325
5326     aSig = extractFloatx80Frac( a );
5327     aExp = extractFloatx80Exp( a );
5328     aSign = extractFloatx80Sign( a );
5329     bSig = extractFloatx80Frac( b );
5330     bExp = extractFloatx80Exp( b );
5331     bSign = extractFloatx80Sign( b );
5332     zSign = aSign ^ bSign;
5333     if ( aExp == 0x7FFF ) {
5334         if (    (uint64_t) ( aSig<<1 )
5335              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5336             return propagateFloatx80NaN(a, b, status);
5337         }
5338         if ( ( bExp | bSig ) == 0 ) goto invalid;
5339         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5340     }
5341     if ( bExp == 0x7FFF ) {
5342         if ((uint64_t)(bSig << 1)) {
5343             return propagateFloatx80NaN(a, b, status);
5344         }
5345         if ( ( aExp | aSig ) == 0 ) {
5346  invalid:
5347             float_raise(float_flag_invalid, status);
5348             return floatx80_default_nan(status);
5349         }
5350         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5351     }
5352     if ( aExp == 0 ) {
5353         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5354         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5355     }
5356     if ( bExp == 0 ) {
5357         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5358         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5359     }
5360     zExp = aExp + bExp - 0x3FFE;
5361     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5362     if ( 0 < (int64_t) zSig0 ) {
5363         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5364         --zExp;
5365     }
5366     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5367                                 zSign, zExp, zSig0, zSig1, status);
5368 }
5369
5370 /*----------------------------------------------------------------------------
5371 | Returns the result of dividing the extended double-precision floating-point
5372 | value `a' by the corresponding value `b'.  The operation is performed
5373 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5374 *----------------------------------------------------------------------------*/
5375
5376 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5377 {
5378     flag aSign, bSign, zSign;
5379     int32_t aExp, bExp, zExp;
5380     uint64_t aSig, bSig, zSig0, zSig1;
5381     uint64_t rem0, rem1, rem2, term0, term1, term2;
5382
5383     aSig = extractFloatx80Frac( a );
5384     aExp = extractFloatx80Exp( a );
5385     aSign = extractFloatx80Sign( a );
5386     bSig = extractFloatx80Frac( b );
5387     bExp = extractFloatx80Exp( b );
5388     bSign = extractFloatx80Sign( b );
5389     zSign = aSign ^ bSign;
5390     if ( aExp == 0x7FFF ) {
5391         if ((uint64_t)(aSig << 1)) {
5392             return propagateFloatx80NaN(a, b, status);
5393         }
5394         if ( bExp == 0x7FFF ) {
5395             if ((uint64_t)(bSig << 1)) {
5396                 return propagateFloatx80NaN(a, b, status);
5397             }
5398             goto invalid;
5399         }
5400         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5401     }
5402     if ( bExp == 0x7FFF ) {
5403         if ((uint64_t)(bSig << 1)) {
5404             return propagateFloatx80NaN(a, b, status);
5405         }
5406         return packFloatx80( zSign, 0, 0 );
5407     }
5408     if ( bExp == 0 ) {
5409         if ( bSig == 0 ) {
5410             if ( ( aExp | aSig ) == 0 ) {
5411  invalid:
5412                 float_raise(float_flag_invalid, status);
5413                 return floatx80_default_nan(status);
5414             }
5415             float_raise(float_flag_divbyzero, status);
5416             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5417         }
5418         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5419     }
5420     if ( aExp == 0 ) {
5421         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5422         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5423     }
5424     zExp = aExp - bExp + 0x3FFE;
5425     rem1 = 0;
5426     if ( bSig <= aSig ) {
5427         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5428         ++zExp;
5429     }
5430     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5431     mul64To128( bSig, zSig0, &term0, &term1 );
5432     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5433     while ( (int64_t) rem0 < 0 ) {
5434         --zSig0;
5435         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5436     }
5437     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5438     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5439         mul64To128( bSig, zSig1, &term1, &term2 );
5440         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5441         while ( (int64_t) rem1 < 0 ) {
5442             --zSig1;
5443             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5444         }
5445         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5446     }
5447     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5448                                 zSign, zExp, zSig0, zSig1, status);
5449 }
5450
5451 /*----------------------------------------------------------------------------
5452 | Returns the remainder of the extended double-precision floating-point value
5453 | `a' with respect to the corresponding value `b'.  The operation is performed
5454 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5455 *----------------------------------------------------------------------------*/
5456
5457 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5458 {
5459     flag aSign, zSign;
5460     int32_t aExp, bExp, expDiff;
5461     uint64_t aSig0, aSig1, bSig;
5462     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5463
5464     aSig0 = extractFloatx80Frac( a );
5465     aExp = extractFloatx80Exp( a );
5466     aSign = extractFloatx80Sign( a );
5467     bSig = extractFloatx80Frac( b );
5468     bExp = extractFloatx80Exp( b );
5469     if ( aExp == 0x7FFF ) {
5470         if (    (uint64_t) ( aSig0<<1 )
5471              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5472             return propagateFloatx80NaN(a, b, status);
5473         }
5474         goto invalid;
5475     }
5476     if ( bExp == 0x7FFF ) {
5477         if ((uint64_t)(bSig << 1)) {
5478             return propagateFloatx80NaN(a, b, status);
5479         }
5480         return a;
5481     }
5482     if ( bExp == 0 ) {
5483         if ( bSig == 0 ) {
5484  invalid:
5485             float_raise(float_flag_invalid, status);
5486             return floatx80_default_nan(status);
5487         }
5488         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5489     }
5490     if ( aExp == 0 ) {
5491         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5492         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5493     }
5494     bSig |= LIT64( 0x8000000000000000 );
5495     zSign = aSign;
5496     expDiff = aExp - bExp;
5497     aSig1 = 0;
5498     if ( expDiff < 0 ) {
5499         if ( expDiff < -1 ) return a;
5500         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5501         expDiff = 0;
5502     }
5503     q = ( bSig <= aSig0 );
5504     if ( q ) aSig0 -= bSig;
5505     expDiff -= 64;
5506     while ( 0 < expDiff ) {
5507         q = estimateDiv128To64( aSig0, aSig1, bSig );
5508         q = ( 2 < q ) ? q - 2 : 0;
5509         mul64To128( bSig, q, &term0, &term1 );
5510         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5511         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5512         expDiff -= 62;
5513     }
5514     expDiff += 64;
5515     if ( 0 < expDiff ) {
5516         q = estimateDiv128To64( aSig0, aSig1, bSig );
5517         q = ( 2 < q ) ? q - 2 : 0;
5518         q >>= 64 - expDiff;
5519         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5520         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5521         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5522         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5523             ++q;
5524             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5525         }
5526     }
5527     else {
5528         term1 = 0;
5529         term0 = bSig;
5530     }
5531     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5532     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5533          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5534               && ( q & 1 ) )
5535        ) {
5536         aSig0 = alternateASig0;
5537         aSig1 = alternateASig1;
5538         zSign = ! zSign;
5539     }
5540     return
5541         normalizeRoundAndPackFloatx80(
5542             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5543
5544 }
5545
/*----------------------------------------------------------------------------
| Returns the square root of the extended double-precision floating-point
| value `a'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
|
| Special operands are handled before the main computation:
|   - a NaN operand is handed to propagateFloatx80NaN();
|   - positive infinity is returned unchanged;
|   - a negative operand whose exponent and significand are both zero is
|     returned unchanged;
|   - any other negative operand raises the invalid exception and yields the
|     default NaN;
|   - a positive operand with zero exponent and zero significand yields +0.
| The result is rounded using `status->floatx80_rounding_precision'.
*----------------------------------------------------------------------------*/

floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        /* Maximum exponent: any bit below the top significand bit => NaN. */
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* Infinity: +inf is its own square root, -inf is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Negative operand with all-zero exponent and significand. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        /* Nonzero significand with zero exponent: normalize first. */
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Halve the unbiased exponent and re-apply the 0x3FFF bias. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* Initial approximation from the top 32 significand bits. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    /* Shift right by 2, or by 3 when the exponent is odd. */
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    /* Refine the approximation to a 64-bit estimate zSig0. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = aSig - zSig0^2; step zSig0 down while the remainder is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Estimate the next 64 result bits from the remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /*
     * Only when the low 62 bits of the estimate are <= 5 is the exact
     * remainder computed, so that zSig1 can be corrected and a sticky bit
     * recorded.
     */
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        /* Step zSig1 down while the 192-bit remainder is negative. */
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Any nonzero remainder sets the lowest bit of zSig1. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift zSig1 left by one across the 128-bit pair, then merge in
     * doubleZSig0 as the upper word of the result significand. */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}
5612
5613 /*----------------------------------------------------------------------------
5614 | Returns 1 if the extended double-precision floating-point value `a' is equal
5615 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5616 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5617 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5618 *----------------------------------------------------------------------------*/
5619
5620 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5621 {
5622
5623     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5624               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5625          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5626               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5627        ) {
5628         float_raise(float_flag_invalid, status);
5629         return 0;
5630     }
5631     return
5632            ( a.low == b.low )
5633         && (    ( a.high == b.high )
5634              || (    ( a.low == 0 )
5635                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5636            );
5637
5638 }
5639
5640 /*----------------------------------------------------------------------------
5641 | Returns 1 if the extended double-precision floating-point value `a' is
5642 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5643 | invalid exception is raised if either operand is a NaN.  The comparison is
5644 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5645 | Arithmetic.
5646 *----------------------------------------------------------------------------*/
5647
5648 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5649 {
5650     flag aSign, bSign;
5651
5652     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5653               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5654          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5655               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5656        ) {
5657         float_raise(float_flag_invalid, status);
5658         return 0;
5659     }
5660     aSign = extractFloatx80Sign( a );
5661     bSign = extractFloatx80Sign( b );
5662     if ( aSign != bSign ) {
5663         return
5664                aSign
5665             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5666                  == 0 );
5667     }
5668     return
5669           aSign ? le128( b.high, b.low, a.high, a.low )
5670         : le128( a.high, a.low, b.high, b.low );
5671
5672 }
5673
5674 /*----------------------------------------------------------------------------
5675 | Returns 1 if the extended double-precision floating-point value `a' is
5676 | less than the corresponding value `b', and 0 otherwise.  The invalid
5677 | exception is raised if either operand is a NaN.  The comparison is performed
5678 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5679 *----------------------------------------------------------------------------*/
5680
5681 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5682 {
5683     flag aSign, bSign;
5684
5685     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5686               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5687          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5688               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5689        ) {
5690         float_raise(float_flag_invalid, status);
5691         return 0;
5692     }
5693     aSign = extractFloatx80Sign( a );
5694     bSign = extractFloatx80Sign( b );
5695     if ( aSign != bSign ) {
5696         return
5697                aSign
5698             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5699                  != 0 );
5700     }
5701     return
5702           aSign ? lt128( b.high, b.low, a.high, a.low )
5703         : lt128( a.high, a.low, b.high, b.low );
5704
5705 }
5706
5707 /*----------------------------------------------------------------------------
5708 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5709 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5710 | either operand is a NaN.   The comparison is performed according to the
5711 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5712 *----------------------------------------------------------------------------*/
5713 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5714 {
5715     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5716               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5717          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5718               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5719        ) {
5720         float_raise(float_flag_invalid, status);
5721         return 1;
5722     }
5723     return 0;
5724 }
5725
5726 /*----------------------------------------------------------------------------
5727 | Returns 1 if the extended double-precision floating-point value `a' is
5728 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5729 | cause an exception.  The comparison is performed according to the IEC/IEEE
5730 | Standard for Binary Floating-Point Arithmetic.
5731 *----------------------------------------------------------------------------*/
5732
5733 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5734 {
5735
5736     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5737               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5738          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5739               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5740        ) {
5741         if (floatx80_is_signaling_nan(a, status)
5742          || floatx80_is_signaling_nan(b, status)) {
5743             float_raise(float_flag_invalid, status);
5744         }
5745         return 0;
5746     }
5747     return
5748            ( a.low == b.low )
5749         && (    ( a.high == b.high )
5750              || (    ( a.low == 0 )
5751                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5752            );
5753
5754 }
5755
5756 /*----------------------------------------------------------------------------
5757 | Returns 1 if the extended double-precision floating-point value `a' is less
5758 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5759 | do not cause an exception.  Otherwise, the comparison is performed according
5760 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5761 *----------------------------------------------------------------------------*/
5762
5763 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5764 {
5765     flag aSign, bSign;
5766
5767     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5768               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5769          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5770               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5771        ) {
5772         if (floatx80_is_signaling_nan(a, status)
5773          || floatx80_is_signaling_nan(b, status)) {
5774             float_raise(float_flag_invalid, status);
5775         }
5776         return 0;
5777     }
5778     aSign = extractFloatx80Sign( a );
5779     bSign = extractFloatx80Sign( b );
5780     if ( aSign != bSign ) {
5781         return
5782                aSign
5783             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5784                  == 0 );
5785     }
5786     return
5787           aSign ? le128( b.high, b.low, a.high, a.low )
5788         : le128( a.high, a.low, b.high, b.low );
5789
5790 }
5791
5792 /*----------------------------------------------------------------------------
5793 | Returns 1 if the extended double-precision floating-point value `a' is less
5794 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5795 | an exception.  Otherwise, the comparison is performed according to the
5796 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5797 *----------------------------------------------------------------------------*/
5798
5799 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5800 {
5801     flag aSign, bSign;
5802
5803     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5804               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5805          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5806               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5807        ) {
5808         if (floatx80_is_signaling_nan(a, status)
5809          || floatx80_is_signaling_nan(b, status)) {
5810             float_raise(float_flag_invalid, status);
5811         }
5812         return 0;
5813     }
5814     aSign = extractFloatx80Sign( a );
5815     bSign = extractFloatx80Sign( b );
5816     if ( aSign != bSign ) {
5817         return
5818                aSign
5819             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5820                  != 0 );
5821     }
5822     return
5823           aSign ? lt128( b.high, b.low, a.high, a.low )
5824         : lt128( a.high, a.low, b.high, b.low );
5825
5826 }
5827
5828 /*----------------------------------------------------------------------------
5829 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5830 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5831 | The comparison is performed according to the IEC/IEEE Standard for Binary
5832 | Floating-Point Arithmetic.
5833 *----------------------------------------------------------------------------*/
5834 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5835 {
5836     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5837               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5838          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5839               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5840        ) {
5841         if (floatx80_is_signaling_nan(a, status)
5842          || floatx80_is_signaling_nan(b, status)) {
5843             float_raise(float_flag_invalid, status);
5844         }
5845         return 1;
5846     }
5847     return 0;
5848 }
5849
5850 /*----------------------------------------------------------------------------
5851 | Returns the result of converting the quadruple-precision floating-point
5852 | value `a' to the 32-bit two's complement integer format.  The conversion
5853 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5854 | Arithmetic---which means in particular that the conversion is rounded
5855 | according to the current rounding mode.  If `a' is a NaN, the largest
5856 | positive integer is returned.  Otherwise, if the conversion overflows, the
5857 | largest integer with the same sign as `a' is returned.
5858 *----------------------------------------------------------------------------*/
5859
5860 int32_t float128_to_int32(float128 a, float_status *status)
5861 {
5862     flag aSign;
5863     int32_t aExp, shiftCount;
5864     uint64_t aSig0, aSig1;
5865
5866     aSig1 = extractFloat128Frac1( a );
5867     aSig0 = extractFloat128Frac0( a );
5868     aExp = extractFloat128Exp( a );
5869     aSign = extractFloat128Sign( a );
5870     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5871     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5872     aSig0 |= ( aSig1 != 0 );
5873     shiftCount = 0x4028 - aExp;
5874     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5875     return roundAndPackInt32(aSign, aSig0, status);
5876
5877 }
5878
5879 /*----------------------------------------------------------------------------
5880 | Returns the result of converting the quadruple-precision floating-point
5881 | value `a' to the 32-bit two's complement integer format.  The conversion
5882 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5883 | Arithmetic, except that the conversion is always rounded toward zero.  If
5884 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5885 | conversion overflows, the largest integer with the same sign as `a' is
5886 | returned.
5887 *----------------------------------------------------------------------------*/
5888
5889 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5890 {
5891     flag aSign;
5892     int32_t aExp, shiftCount;
5893     uint64_t aSig0, aSig1, savedASig;
5894     int32_t z;
5895
5896     aSig1 = extractFloat128Frac1( a );
5897     aSig0 = extractFloat128Frac0( a );
5898     aExp = extractFloat128Exp( a );
5899     aSign = extractFloat128Sign( a );
5900     aSig0 |= ( aSig1 != 0 );
5901     if ( 0x401E < aExp ) {
5902         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5903         goto invalid;
5904     }
5905     else if ( aExp < 0x3FFF ) {
5906         if (aExp || aSig0) {
5907             status->float_exception_flags |= float_flag_inexact;
5908         }
5909         return 0;
5910     }
5911     aSig0 |= LIT64( 0x0001000000000000 );
5912     shiftCount = 0x402F - aExp;
5913     savedASig = aSig0;
5914     aSig0 >>= shiftCount;
5915     z = aSig0;
5916     if ( aSign ) z = - z;
5917     if ( ( z < 0 ) ^ aSign ) {
5918  invalid:
5919         float_raise(float_flag_invalid, status);
5920         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5921     }
5922     if ( ( aSig0<<shiftCount ) != savedASig ) {
5923         status->float_exception_flags |= float_flag_inexact;
5924     }
5925     return z;
5926
5927 }
5928
5929 /*----------------------------------------------------------------------------
5930 | Returns the result of converting the quadruple-precision floating-point
5931 | value `a' to the 64-bit two's complement integer format.  The conversion
5932 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5933 | Arithmetic---which means in particular that the conversion is rounded
5934 | according to the current rounding mode.  If `a' is a NaN, the largest
5935 | positive integer is returned.  Otherwise, if the conversion overflows, the
5936 | largest integer with the same sign as `a' is returned.
5937 *----------------------------------------------------------------------------*/
5938
5939 int64_t float128_to_int64(float128 a, float_status *status)
5940 {
5941     flag aSign;
5942     int32_t aExp, shiftCount;
5943     uint64_t aSig0, aSig1;
5944
5945     aSig1 = extractFloat128Frac1( a );
5946     aSig0 = extractFloat128Frac0( a );
5947     aExp = extractFloat128Exp( a );
5948     aSign = extractFloat128Sign( a );
5949     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5950     shiftCount = 0x402F - aExp;
5951     if ( shiftCount <= 0 ) {
5952         if ( 0x403E < aExp ) {
5953             float_raise(float_flag_invalid, status);
5954             if (    ! aSign
5955                  || (    ( aExp == 0x7FFF )
5956                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5957                     )
5958                ) {
5959                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5960             }
5961             return (int64_t) LIT64( 0x8000000000000000 );
5962         }
5963         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5964     }
5965     else {
5966         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5967     }
5968     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5969
5970 }
5971
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.
|
| The invalid exception is raised for NaN and overflowing operands; the
| inexact flag is set when nonzero fraction bits are discarded.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Nonzero exponent: make the implicit leading bit explicit. */
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
    /* Positive shiftCount: the result needs bits from aSig1 as well. */
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            /* Drop the implicit bit again so aSig0 holds only the fraction. */
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
            /*
             * Sign set, exponent 0x403E, zero high fraction and a low
             * fraction below 0x0002000000000000: this case does not raise
             * invalid.  It returns 0x8000000000000000, flagging inexact when
             * aSig1 is nonzero.
             */
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
                if (aSig1) {
                    status->float_exception_flags |= float_flag_inexact;
                }
            }
            else {
                /* Overflow, infinity or NaN.  Positive operands and NaNs
                 * return the largest positive integer. */
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
                }
            }
            return (int64_t) LIT64( 0x8000000000000000 );
        }
        /* Combine aSig0 with the top shiftCount bits of aSig1. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        /* Any remaining low bits of aSig1 are discarded fraction bits. */
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            status->float_exception_flags |= float_flag_inexact;
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude below 1: result is 0, inexact unless `a' is zero. */
            if ( aExp | aSig0 | aSig1 ) {
                status->float_exception_flags |= float_flag_inexact;
            }
            return 0;
        }
        /* The integer part lies entirely within aSig0. */
        z = aSig0>>( - shiftCount );
        /* Inexact if aSig1 is nonzero or bits were shifted out of aSig0. */
        if (    aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            status->float_exception_flags |= float_flag_inexact;
        }
    }
    if ( aSign ) z = - z;
    return z;

}
6034
6035 /*----------------------------------------------------------------------------
6036 | Returns the result of converting the quadruple-precision floating-point
6037 | value `a' to the single-precision floating-point format.  The conversion
6038 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6039 | Arithmetic.
6040 *----------------------------------------------------------------------------*/
6041
6042 float32 float128_to_float32(float128 a, float_status *status)
6043 {
6044     flag aSign;
6045     int32_t aExp;
6046     uint64_t aSig0, aSig1;
6047     uint32_t zSig;
6048
6049     aSig1 = extractFloat128Frac1( a );
6050     aSig0 = extractFloat128Frac0( a );
6051     aExp = extractFloat128Exp( a );
6052     aSign = extractFloat128Sign( a );
6053     if ( aExp == 0x7FFF ) {
6054         if ( aSig0 | aSig1 ) {
6055             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6056         }
6057         return packFloat32( aSign, 0xFF, 0 );
6058     }
6059     aSig0 |= ( aSig1 != 0 );
6060     shift64RightJamming( aSig0, 18, &aSig0 );
6061     zSig = aSig0;
6062     if ( aExp || zSig ) {
6063         zSig |= 0x40000000;
6064         aExp -= 0x3F81;
6065     }
6066     return roundAndPackFloat32(aSign, aExp, zSig, status);
6067
6068 }
6069
6070 /*----------------------------------------------------------------------------
6071 | Returns the result of converting the quadruple-precision floating-point
6072 | value `a' to the double-precision floating-point format.  The conversion
6073 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6074 | Arithmetic.
6075 *----------------------------------------------------------------------------*/
6076
6077 float64 float128_to_float64(float128 a, float_status *status)
6078 {
6079     flag aSign;
6080     int32_t aExp;
6081     uint64_t aSig0, aSig1;
6082
6083     aSig1 = extractFloat128Frac1( a );
6084     aSig0 = extractFloat128Frac0( a );
6085     aExp = extractFloat128Exp( a );
6086     aSign = extractFloat128Sign( a );
6087     if ( aExp == 0x7FFF ) {
6088         if ( aSig0 | aSig1 ) {
6089             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6090         }
6091         return packFloat64( aSign, 0x7FF, 0 );
6092     }
6093     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6094     aSig0 |= ( aSig1 != 0 );
6095     if ( aExp || aSig0 ) {
6096         aSig0 |= LIT64( 0x4000000000000000 );
6097         aExp -= 0x3C01;
6098     }
6099     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6100
6101 }
6102
6103 /*----------------------------------------------------------------------------
6104 | Returns the result of converting the quadruple-precision floating-point
6105 | value `a' to the extended double-precision floating-point format.  The
6106 | conversion is performed according to the IEC/IEEE Standard for Binary
6107 | Floating-Point Arithmetic.
6108 *----------------------------------------------------------------------------*/
6109
6110 floatx80 float128_to_floatx80(float128 a, float_status *status)
6111 {
6112     flag aSign;
6113     int32_t aExp;
6114     uint64_t aSig0, aSig1;
6115
6116     aSig1 = extractFloat128Frac1( a );
6117     aSig0 = extractFloat128Frac0( a );
6118     aExp = extractFloat128Exp( a );
6119     aSign = extractFloat128Sign( a );
6120     if ( aExp == 0x7FFF ) {
6121         if ( aSig0 | aSig1 ) {
6122             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6123         }
6124         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6125     }
6126     if ( aExp == 0 ) {
6127         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6128         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6129     }
6130     else {
6131         aSig0 |= LIT64( 0x0001000000000000 );
6132     }
6133     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6134     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6135
6136 }
6137
/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| The rounding direction is taken from `status->float_rounding_mode'.  The
| inexact flag is set whenever the returned value differs from `a'.  A NaN
| operand is handed to propagateFloat128NaN().  In the two main rounding
| switches an unrecognised rounding mode calls abort().
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* In this range only the low word (and possibly bit 0 of the high
         * word) takes part in the rounding. */
        if ( 0x406F <= aExp ) {
            /* Nothing to round: return `a' itself, or propagate a NaN. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* The shift is done in two steps so that aExp == 0x402F (a total
         * shift of 64) yields lastBitMask == 0 without shifting by the full
         * width of the type.  lastBitMask == 0 means the last retained bit
         * is bit 0 of z.high; roundBitsMask is then all ones. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half of the last place; on an exact tie clear the
                 * last retained bit to make the result even. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* The top bit of z.low is the half-place bit. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            /* Same as above but without the tie-to-even adjustment. */
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Only positive-signed values have their magnitude bumped. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Only negative-signed values have their magnitude bumped. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Discard all bits below the last retained one. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Exponent below 0x3FFF: the result is a signed zero or
             * packFloat128( sign, 0x3FFF, 0, 0 ).  A zero operand is
             * returned unchanged; anything else is inexact. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            status->float_exception_flags |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
             case float_round_nearest_even:
                /* Exponent 0x3FFE with a nonzero fraction rounds away from
                 * zero; an exact tie falls through to zero. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            /* Remaining cases (including any rounding mode not listed
             * above) return a zero carrying the sign of `a'. */
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* 0x3FFF <= aExp < 0x402F: the last retained bit is in the high
         * word, so the whole low word is discarded. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exact tie (no discarded bits left, low word zero): make even. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* Fold a nonzero low word into bit 0 so that adding
                 * roundBitsMask carries into the last retained bit. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        /* Discard all bits below the last retained one. */
        z.high &= ~ roundBitsMask;
    }
    /* Any change relative to the operand means the rounding was inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}
6275
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| The operand with the smaller exponent is shifted right (with jamming) to
| align it with the other operand; the bits shifted out are kept in `zSig2'
| and passed on to roundAndPackFloat128().  When both exponents are zero and
| `status->flush_to_zero' is set, a signed zero is returned and
| float_flag_output_denormal is raised if the sum was nonzero.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent. */
        if ( aExp == 0x7FFF ) {
            /* `a' is a NaN (nonzero fraction) or an infinity. */
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* Zero exponent in `b': no implicit bit is added, and the
             * alignment shift is one less. */
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        /* Align `b' to `a'; shifted-out bits are collected in zSig2. */
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent. */
        if ( bExp == 0x7FFF ) {
            /* `b' is a NaN, or an infinity returned with sign zSign. */
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            /* Zero exponent in `a': mirror of the bExp == 0 case above. */
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        /* Align `a' to `b'; shifted-out bits are collected in zSig2. */
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment is needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both exponents are zero: the raw sum of the fractions is
             * packed directly with exponent field 0, unless flush-to-zero
             * is enabled. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        /* Both operands carry an implicit bit at 0x0001000000000000; their
         * sum contributes 0x0002000000000000, so one right shift follows. */
        zSig2 = 0;
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /* Shared tail of the two unequal-exponent cases.  The implicit bit is
     * OR-ed into aSig0; in the expDiff < 0 case the implicit bit of `b' is
     * not set in bSig0, and `a' was already shifted before this OR. */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    /* Try the result without a carry first (with zExp one lower); if the
     * sum reached 0x0002000000000000, restore zExp and shift right by one. */
    --zExp;
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
6366
6367 /*----------------------------------------------------------------------------
6368 | Returns the result of subtracting the absolute values of the quadruple-
6369 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6370 | difference is negated before being returned.  `zSign' is ignored if the
6371 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6372 | Standard for Binary Floating-Point Arithmetic.
6373 *----------------------------------------------------------------------------*/
6374
6375 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6376                                 float_status *status)
6377 {
         /*
          * Magnitude subtraction.  The result keeps `zSign' when the magnitude
          * of `a' is the larger one and gets the opposite sign when that of
          * `b' is larger (see bBigger).  Control flow is a set of labels shared
          * between the equal-exponent and unequal-exponent cases.
          */
6378     int32_t aExp, bExp, zExp;
6379     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6380     int32_t expDiff;
6381
6382     aSig1 = extractFloat128Frac1( a );
6383     aSig0 = extractFloat128Frac0( a );
6384     aExp = extractFloat128Exp( a );
6385     bSig1 = extractFloat128Frac1( b );
6386     bSig0 = extractFloat128Frac0( b );
6387     bExp = extractFloat128Exp( b );
6388     expDiff = aExp - bExp;
         /*
          * Both fractions are moved up by 14 bits, so the leading one belongs
          * at bit 62 (0x4000000000000000).  The exponent handed to the final
          * normalize step is lowered by 14 to compensate.
          */
6389     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6390     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6391     if ( 0 < expDiff ) goto aExpBigger;
6392     if ( expDiff < 0 ) goto bExpBigger;
         /* Equal exponents from here on. */
6393     if ( aExp == 0x7FFF ) {
             /* A set fraction bit means NaN; otherwise Inf - Inf is invalid. */
6394         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6395             return propagateFloat128NaN(a, b, status);
6396         }
6397         float_raise(float_flag_invalid, status);
6398         return float128_default_nan(status);
6399     }
         /* Both exponent fields are zero: exponent 1 is used for both. */
6400     if ( aExp == 0 ) {
6401         aExp = 1;
6402         bExp = 1;
6403     }
         /* Compare the significands, high word first, to find the larger one. */
6404     if ( bSig0 < aSig0 ) goto aBigger;
6405     if ( aSig0 < bSig0 ) goto bBigger;
6406     if ( bSig1 < aSig1 ) goto aBigger;
6407     if ( aSig1 < bSig1 ) goto bBigger;
         /* Identical magnitudes: zero, negative only under float_round_down. */
6408     return packFloat128(status->float_rounding_mode == float_round_down,
6409                         0, 0, 0);
6410  bExpBigger:
6411     if ( bExp == 0x7FFF ) {
6412         if (bSig0 | bSig1) {
6413             return propagateFloat128NaN(a, b, status);
6414         }
             /* `b' is infinite: infinity with the sign inverted. */
6415         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6416     }
         /*
          * Zero exponent field in `a': no leading one, and the (negative)
          * expDiff moves one step towards zero.  Otherwise set the leading one.
          */
6417     if ( aExp == 0 ) {
6418         ++expDiff;
6419     }
6420     else {
6421         aSig0 |= LIT64( 0x4000000000000000 );
6422     }
6423     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6424     bSig0 |= LIT64( 0x4000000000000000 );
6425  bBigger:
         /* |b| > |a|: compute b - a and flip the result sign. */
6426     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6427     zExp = bExp;
6428     zSign ^= 1;
6429     goto normalizeRoundAndPack;
6430  aExpBigger:
6431     if ( aExp == 0x7FFF ) {
6432         if (aSig0 | aSig1) {
6433             return propagateFloat128NaN(a, b, status);
6434         }
             /* `a' is infinite and `b' is not: `a' is returned unchanged. */
6435         return a;
6436     }
         /* Mirror image of the alignment done under bExpBigger. */
6437     if ( bExp == 0 ) {
6438         --expDiff;
6439     }
6440     else {
6441         bSig0 |= LIT64( 0x4000000000000000 );
6442     }
6443     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6444     aSig0 |= LIT64( 0x4000000000000000 );
6445  aBigger:
         /* |a| > |b|: compute a - b; zSign is kept. */
6446     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6447     zExp = aExp;
6448  normalizeRoundAndPack:
         /* One decrement here, plus 14 for the left shift applied at the top. */
6449     --zExp;
6450     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6451                                          status);
6452
6453 }
6454
6455 /*----------------------------------------------------------------------------
6456 | Returns the result of adding the quadruple-precision floating-point values
6457 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6458 | for Binary Floating-Point Arithmetic.
6459 *----------------------------------------------------------------------------*/
6460
6461 float128 float128_add(float128 a, float128 b, float_status *status)
6462 {
6463     flag aSign, bSign;
6464
6465     aSign = extractFloat128Sign( a );
6466     bSign = extractFloat128Sign( b );
6467     if ( aSign == bSign ) {
6468         return addFloat128Sigs(a, b, aSign, status);
6469     }
6470     else {
6471         return subFloat128Sigs(a, b, aSign, status);
6472     }
6473
6474 }
6475
6476 /*----------------------------------------------------------------------------
6477 | Returns the result of subtracting the quadruple-precision floating-point
6478 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6479 | Standard for Binary Floating-Point Arithmetic.
6480 *----------------------------------------------------------------------------*/
6481
6482 float128 float128_sub(float128 a, float128 b, float_status *status)
6483 {
6484     flag aSign, bSign;
6485
6486     aSign = extractFloat128Sign( a );
6487     bSign = extractFloat128Sign( b );
6488     if ( aSign == bSign ) {
6489         return subFloat128Sigs(a, b, aSign, status);
6490     }
6491     else {
6492         return addFloat128Sigs(a, b, aSign, status);
6493     }
6494
6495 }
6496
6497 /*----------------------------------------------------------------------------
6498 | Returns the result of multiplying the quadruple-precision floating-point
6499 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6500 | Standard for Binary Floating-Point Arithmetic.
6501 *----------------------------------------------------------------------------*/
6502
6503 float128 float128_mul(float128 a, float128 b, float_status *status)
6504 {
6505     flag aSign, bSign, zSign;
6506     int32_t aExp, bExp, zExp;
6507     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6508
6509     aSig1 = extractFloat128Frac1( a );
6510     aSig0 = extractFloat128Frac0( a );
6511     aExp = extractFloat128Exp( a );
6512     aSign = extractFloat128Sign( a );
6513     bSig1 = extractFloat128Frac1( b );
6514     bSig0 = extractFloat128Frac0( b );
6515     bExp = extractFloat128Exp( b );
6516     bSign = extractFloat128Sign( b );
6517     zSign = aSign ^ bSign;
6518     if ( aExp == 0x7FFF ) {
6519         if (    ( aSig0 | aSig1 )
6520              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6521             return propagateFloat128NaN(a, b, status);
6522         }
6523         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6524         return packFloat128( zSign, 0x7FFF, 0, 0 );
6525     }
6526     if ( bExp == 0x7FFF ) {
6527         if (bSig0 | bSig1) {
6528             return propagateFloat128NaN(a, b, status);
6529         }
6530         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6531  invalid:
6532             float_raise(float_flag_invalid, status);
6533             return float128_default_nan(status);
6534         }
6535         return packFloat128( zSign, 0x7FFF, 0, 0 );
6536     }
6537     if ( aExp == 0 ) {
6538         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6539         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6540     }
6541     if ( bExp == 0 ) {
6542         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6543         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6544     }
6545     zExp = aExp + bExp - 0x4000;
6546     aSig0 |= LIT64( 0x0001000000000000 );
6547     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6548     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6549     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6550     zSig2 |= ( zSig3 != 0 );
6551     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6552         shift128ExtraRightJamming(
6553             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6554         ++zExp;
6555     }
6556     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6557
6558 }
6559
6560 /*----------------------------------------------------------------------------
6561 | Returns the result of dividing the quadruple-precision floating-point value
6562 | `a' by the corresponding value `b'.  The operation is performed according to
6563 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6564 *----------------------------------------------------------------------------*/
6565
6566 float128 float128_div(float128 a, float128 b, float_status *status)
6567 {
         /*
          * Computes a / b.  Special operands (NaN, infinity, zero) are handled
          * first.  The quotient is then built in two 64-bit steps from
          * estimateDiv128To64(), each followed by a correction loop that steps
          * the estimate down while the remainder is negative.
          */
6568     flag aSign, bSign, zSign;
6569     int32_t aExp, bExp, zExp;
6570     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6571     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6572
6573     aSig1 = extractFloat128Frac1( a );
6574     aSig0 = extractFloat128Frac0( a );
6575     aExp = extractFloat128Exp( a );
6576     aSign = extractFloat128Sign( a );
6577     bSig1 = extractFloat128Frac1( b );
6578     bSig0 = extractFloat128Frac0( b );
6579     bExp = extractFloat128Exp( b );
6580     bSign = extractFloat128Sign( b );
6581     zSign = aSign ^ bSign;
         /* `a' is a NaN or an infinity. */
6582     if ( aExp == 0x7FFF ) {
6583         if (aSig0 | aSig1) {
6584             return propagateFloat128NaN(a, b, status);
6585         }
6586         if ( bExp == 0x7FFF ) {
6587             if (bSig0 | bSig1) {
6588                 return propagateFloat128NaN(a, b, status);
6589             }
                 /* Infinity / infinity is invalid. */
6590             goto invalid;
6591         }
6592         return packFloat128( zSign, 0x7FFF, 0, 0 );
6593     }
         /* `b' is a NaN or an infinity; finite / infinity is a signed zero. */
6594     if ( bExp == 0x7FFF ) {
6595         if (bSig0 | bSig1) {
6596             return propagateFloat128NaN(a, b, status);
6597         }
6598         return packFloat128( zSign, 0, 0, 0 );
6599     }
6600     if ( bExp == 0 ) {
6601         if ( ( bSig0 | bSig1 ) == 0 ) {
                 /* Divisor is zero: 0 / 0 is invalid, otherwise divide-by-zero. */
6602             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6603  invalid:
6604                 float_raise(float_flag_invalid, status);
6605                 return float128_default_nan(status);
6606             }
6607             float_raise(float_flag_divbyzero, status);
6608             return packFloat128( zSign, 0x7FFF, 0, 0 );
6609         }
6610         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6611     }
6612     if ( aExp == 0 ) {
             /* Zero dividend with a non-zero divisor: signed zero. */
6613         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6614         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6615     }
         /* Provisional result exponent. */
6616     zExp = aExp - bExp + 0x3FFD;
         /*
          * Set the leading one (bit 48) in both significands and move them up
          * 15 bits, which puts the leading one at bit 63 of the high word.
          */
6617     shortShift128Left(
6618         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6619     shortShift128Left(
6620         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
         /*
          * If b <= a, halve `a' and bump the exponent so that a < b holds
          * for the estimates below.  NOTE(review): presumably a precondition
          * of estimateDiv128To64() -- confirm.
          */
6621     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6622         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6623         ++zExp;
6624     }
         /*
          * Upper 64 quotient bits: estimate, form the 192-bit remainder
          * a - b * zSig0, and while it is negative lower the estimate by one
          * and add `b' back.
          */
6625     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6626     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6627     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6628     while ( (int64_t) rem0 < 0 ) {
6629         --zSig0;
6630         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6631     }
         /*
          * Lower 64 quotient bits, estimated from the remainder.  The exact
          * correction is done only when the low 14 bits of the estimate are
          * at most 4.  NOTE(review): presumably the estimate's error cannot
          * affect rounding otherwise -- confirm against the error bound of
          * estimateDiv128To64().
          */
6632     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6633     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6634         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6635         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6636         while ( (int64_t) rem1 < 0 ) {
6637             --zSig1;
6638             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6639         }
             /* A non-zero final remainder is recorded in the lowest bit. */
6640         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6641     }
         /* Shift the quotient down 15 bits; zSig2 receives the extra word. */
6642     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6643     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6644
6645 }
6646
6647 /*----------------------------------------------------------------------------
6648 | Returns the remainder of the quadruple-precision floating-point value `a'
6649 | with respect to the corresponding value `b'.  The operation is performed
6650 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6651 *----------------------------------------------------------------------------*/
6652
6653 float128 float128_rem(float128 a, float128 b, float_status *status)
6654 {
         /*
          * Remainder of `a' with respect to `b'.  After the special cases, the
          * significand of `a' is reduced against that of `b' in chunks using
          * estimateDiv128To64().  A final subtract loop produces two candidate
          * remainders, and the one of smaller magnitude is returned.  After
          * that loop only the low bit of q is consulted, to break an exact
          * tie.
          */
6655     flag aSign, zSign;
6656     int32_t aExp, bExp, expDiff;
6657     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6658     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6659     int64_t sigMean0;
6660
6661     aSig1 = extractFloat128Frac1( a );
6662     aSig0 = extractFloat128Frac0( a );
6663     aExp = extractFloat128Exp( a );
6664     aSign = extractFloat128Sign( a );
6665     bSig1 = extractFloat128Frac1( b );
6666     bSig0 = extractFloat128Frac0( b );
6667     bExp = extractFloat128Exp( b );
         /* `a' is a NaN or an infinity: NaNs propagate, infinite `a' is invalid. */
6668     if ( aExp == 0x7FFF ) {
6669         if (    ( aSig0 | aSig1 )
6670              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6671             return propagateFloat128NaN(a, b, status);
6672         }
6673         goto invalid;
6674     }
         /* `b' is a NaN (propagate) or an infinity (`a' is returned unchanged). */
6675     if ( bExp == 0x7FFF ) {
6676         if (bSig0 | bSig1) {
6677             return propagateFloat128NaN(a, b, status);
6678         }
6679         return a;
6680     }
6681     if ( bExp == 0 ) {
             /* A zero divisor is invalid. */
6682         if ( ( bSig0 | bSig1 ) == 0 ) {
6683  invalid:
6684             float_raise(float_flag_invalid, status);
6685             return float128_default_nan(status);
6686         }
6687         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6688     }
6689     if ( aExp == 0 ) {
             /* A zero dividend is returned unchanged. */
6690         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6691         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6692     }
6693     expDiff = aExp - bExp;
         /* `a' is below half of `b' in magnitude, so it is its own remainder. */
6694     if ( expDiff < -1 ) return a;
         /*
          * Set the leading ones and move both significands up.  `a' is
          * shifted one bit less when expDiff is -1.
          */
6695     shortShift128Left(
6696         aSig0 | LIT64( 0x0001000000000000 ),
6697         aSig1,
6698         15 - ( expDiff < 0 ),
6699         &aSig0,
6700         &aSig1
6701     );
6702     shortShift128Left(
6703         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
         /* First quotient bit: subtract `b' once if it fits. */
6704     q = le128( bSig0, bSig1, aSig0, aSig1 );
6705     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6706     expDiff -= 64;
         /*
          * Each pass accounts for 61 bits of exponent difference.  The
          * estimate is lowered by 4 (clamped at 0) before b * q is subtracted
          * from the shifted `a'.  NOTE(review): presumably the margin keeps q
          * from exceeding the true quotient -- confirm.
          */
6707     while ( 0 < expDiff ) {
6708         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6709         q = ( 4 < q ) ? q - 4 : 0;
6710         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6711         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6712         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6713         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6714         expDiff -= 61;
6715     }
         /*
          * Final partial chunk: the estimate is cut down to the remaining
          * bit count, `b' is moved down 12 bits, and `a' is shifted by the
          * adjusted expDiff (right if negative, left otherwise) before b * q
          * is subtracted.
          */
6716     if ( -64 < expDiff ) {
6717         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6718         q = ( 4 < q ) ? q - 4 : 0;
6719         q >>= - expDiff;
6720         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6721         expDiff += 52;
6722         if ( expDiff < 0 ) {
6723             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6724         }
6725         else {
6726             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6727         }
6728         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6729         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6730     }
         /* Nothing left to divide: only apply the 12-bit downshift to both. */
6731     else {
6732         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6733         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6734     }
         /*
          * Subtract `b' until `a' turns negative, counting in q.  On exit
          * alternateASig holds the last non-negative value and aSig the first
          * negative one.
          */
6735     do {
6736         alternateASig0 = aSig0;
6737         alternateASig1 = aSig1;
6738         ++q;
6739         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6740     } while ( 0 <= (int64_t) aSig0 );
         /*
          * sigMean is the sum of the two candidates.  A negative sum means
          * the non-negative candidate has the smaller magnitude and is
          * chosen.  It is also chosen on an exact tie (sum zero) when q is
          * odd.
          */
6741     add128(
6742         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6743     if (    ( sigMean0 < 0 )
6744          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6745         aSig0 = alternateASig0;
6746         aSig1 = alternateASig1;
6747     }
         /* A negative remainder is negated and flips the sign relative to `a'. */
6748     zSign = ( (int64_t) aSig0 < 0 );
6749     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6750     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6751                                          status);
6752 }
6753
6754 /*----------------------------------------------------------------------------
6755 | Returns the square root of the quadruple-precision floating-point value `a'.
6756 | The operation is performed according to the IEC/IEEE Standard for Binary
6757 | Floating-Point Arithmetic.
6758 *----------------------------------------------------------------------------*/
6759
6760 float128 float128_sqrt(float128 a, float_status *status)
6761 {
         /*
          * Square root of `a'.  After the special cases, the root is built
          * from an initial estimateSqrt32() value that is widened to 64 bits
          * and corrected, followed by a second 64-bit word obtained from the
          * remainder.  The result sign passed to roundAndPackFloat128() is
          * always 0.
          */
6762     flag aSign;
6763     int32_t aExp, zExp;
6764     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6765     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6766
6767     aSig1 = extractFloat128Frac1( a );
6768     aSig0 = extractFloat128Frac0( a );
6769     aExp = extractFloat128Exp( a );
6770     aSign = extractFloat128Sign( a );
         /*
          * Maximum exponent: a NaN is propagated (with `a' passed as both
          * operands), +infinity is returned unchanged, -infinity is invalid.
          */
6771     if ( aExp == 0x7FFF ) {
6772         if (aSig0 | aSig1) {
6773             return propagateFloat128NaN(a, a, status);
6774         }
6775         if ( ! aSign ) return a;
6776         goto invalid;
6777     }
         /* Negative input: -0 is returned unchanged, anything else is invalid. */
6778     if ( aSign ) {
6779         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6780  invalid:
6781         float_raise(float_flag_invalid, status);
6782         return float128_default_nan(status);
6783     }
6784     if ( aExp == 0 ) {
             /* +0 gives +0; other zero-exponent inputs are normalized first. */
6785         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6786         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6787     }
         /* Result exponent: half of (aExp - 0x3FFF), re-offset by 0x3FFE. */
6788     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6789     aSig0 |= LIT64( 0x0001000000000000 );
         /* Initial estimate from the exponent and the top significand bits. */
6790     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
         /* The significand is moved up one bit less when aExp is odd. */
6791     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
         /* Widen the estimate to 64 bits using one division by it. */
6792     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6793     doubleZSig0 = zSig0<<1;
         /*
          * rem = a - zSig0^2.  While it is negative the estimate is too
          * large: step zSig0 down by one and add 2 * zSig0 + 1 back, formed
          * as the 128-bit value (zSig0>>63 : doubleZSig0 | 1).
          */
6794     mul64To128( zSig0, zSig0, &term0, &term1 );
6795     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6796     while ( (int64_t) rem0 < 0 ) {
6797         --zSig0;
6798         doubleZSig0 -= 2;
6799         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6800     }
         /*
          * Second root word, estimated from the remainder.  The exact
          * correction is done only when the low 13 bits of the estimate are
          * at most 5.  NOTE(review): presumably the estimate's error cannot
          * affect rounding otherwise -- confirm.
          */
6801     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6802     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6803         if ( zSig1 == 0 ) zSig1 = 1;
             /* Subtract 2 * zSig0 * zSig1 and then zSig1^2 from the remainder. */
6804         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6805         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6806         mul64To128( zSig1, zSig1, &term2, &term3 );
6807         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
             /*
              * While negative, step zSig1 down and add back the value formed
              * from (zSig1 << 1) | 1 with doubleZSig0 ORed into its upper word.
              */
6808         while ( (int64_t) rem1 < 0 ) {
6809             --zSig1;
6810             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6811             term3 |= 1;
6812             term2 |= doubleZSig0;
6813             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6814         }
             /* A non-zero final remainder is recorded in the lowest bit. */
6815         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6816     }
         /* Shift the root down 14 bits; zSig2 receives the extra word. */
6817     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6818     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6819
6820 }
6821
6822 /*----------------------------------------------------------------------------
6823 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6824 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6825 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6826 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6827 *----------------------------------------------------------------------------*/
6828
6829 int float128_eq(float128 a, float128 b, float_status *status)
6830 {
6831
6832     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6833               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6834          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6835               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6836        ) {
6837         float_raise(float_flag_invalid, status);
6838         return 0;
6839     }
6840     return
6841            ( a.low == b.low )
6842         && (    ( a.high == b.high )
6843              || (    ( a.low == 0 )
6844                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6845            );
6846
6847 }
6848
6849 /*----------------------------------------------------------------------------
6850 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6851 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6852 | exception is raised if either operand is a NaN.  The comparison is performed
6853 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6854 *----------------------------------------------------------------------------*/
6855
6856 int float128_le(float128 a, float128 b, float_status *status)
6857 {
6858     flag aSign, bSign;
6859
6860     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6861               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6862          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6863               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6864        ) {
6865         float_raise(float_flag_invalid, status);
6866         return 0;
6867     }
6868     aSign = extractFloat128Sign( a );
6869     bSign = extractFloat128Sign( b );
6870     if ( aSign != bSign ) {
6871         return
6872                aSign
6873             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6874                  == 0 );
6875     }
6876     return
6877           aSign ? le128( b.high, b.low, a.high, a.low )
6878         : le128( a.high, a.low, b.high, b.low );
6879
6880 }
6881
6882 /*----------------------------------------------------------------------------
6883 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6884 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6885 | raised if either operand is a NaN.  The comparison is performed according
6886 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6887 *----------------------------------------------------------------------------*/
6888
6889 int float128_lt(float128 a, float128 b, float_status *status)
6890 {
6891     flag aSign, bSign;
6892
6893     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6894               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6895          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6896               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6897        ) {
6898         float_raise(float_flag_invalid, status);
6899         return 0;
6900     }
6901     aSign = extractFloat128Sign( a );
6902     bSign = extractFloat128Sign( b );
6903     if ( aSign != bSign ) {
6904         return
6905                aSign
6906             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6907                  != 0 );
6908     }
6909     return
6910           aSign ? lt128( b.high, b.low, a.high, a.low )
6911         : lt128( a.high, a.low, b.high, b.low );
6912
6913 }
6914
6915 /*----------------------------------------------------------------------------
6916 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6917 | be compared, and 0 otherwise.  The invalid exception is raised if either
6918 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6919 | Standard for Binary Floating-Point Arithmetic.
6920 *----------------------------------------------------------------------------*/
6921
6922 int float128_unordered(float128 a, float128 b, float_status *status)
6923 {
6924     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6925               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6926          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6927               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6928        ) {
6929         float_raise(float_flag_invalid, status);
6930         return 1;
6931     }
6932     return 0;
6933 }
6934
6935 /*----------------------------------------------------------------------------
6936 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6937 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6938 | exception.  The comparison is performed according to the IEC/IEEE Standard
6939 | for Binary Floating-Point Arithmetic.
6940 *----------------------------------------------------------------------------*/
6941
6942 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6943 {
6944
6945     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6946               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6947          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6948               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6949        ) {
6950         if (float128_is_signaling_nan(a, status)
6951          || float128_is_signaling_nan(b, status)) {
6952             float_raise(float_flag_invalid, status);
6953         }
6954         return 0;
6955     }
6956     return
6957            ( a.low == b.low )
6958         && (    ( a.high == b.high )
6959              || (    ( a.low == 0 )
6960                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6961            );
6962
6963 }
6964
6965 /*----------------------------------------------------------------------------
6966 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6967 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6968 | cause an exception.  Otherwise, the comparison is performed according to the
6969 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6970 *----------------------------------------------------------------------------*/
6971
6972 int float128_le_quiet(float128 a, float128 b, float_status *status)
6973 {
6974     flag aSign, bSign;
6975
6976     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6977               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6978          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6979               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6980        ) {
6981         if (float128_is_signaling_nan(a, status)
6982          || float128_is_signaling_nan(b, status)) {
6983             float_raise(float_flag_invalid, status);
6984         }
6985         return 0;
6986     }
6987     aSign = extractFloat128Sign( a );
6988     bSign = extractFloat128Sign( b );
6989     if ( aSign != bSign ) {
6990         return
6991                aSign
6992             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6993                  == 0 );
6994     }
6995     return
6996           aSign ? le128( b.high, b.low, a.high, a.low )
6997         : le128( a.high, a.low, b.high, b.low );
6998
6999 }
7000
7001 /*----------------------------------------------------------------------------
7002 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7003 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7004 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7005 | Standard for Binary Floating-Point Arithmetic.
7006 *----------------------------------------------------------------------------*/
7007
7008 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7009 {
7010     flag aSign, bSign;
7011
7012     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7013               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7014          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7015               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7016        ) {
7017         if (float128_is_signaling_nan(a, status)
7018          || float128_is_signaling_nan(b, status)) {
7019             float_raise(float_flag_invalid, status);
7020         }
7021         return 0;
7022     }
7023     aSign = extractFloat128Sign( a );
7024     bSign = extractFloat128Sign( b );
7025     if ( aSign != bSign ) {
7026         return
7027                aSign
7028             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7029                  != 0 );
7030     }
7031     return
7032           aSign ? lt128( b.high, b.low, a.high, a.low )
7033         : lt128( a.high, a.low, b.high, b.low );
7034
7035 }
7036
7037 /*----------------------------------------------------------------------------
7038 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7039 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7040 | comparison is performed according to the IEC/IEEE Standard for Binary
7041 | Floating-Point Arithmetic.
7042 *----------------------------------------------------------------------------*/
7043
7044 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7045 {
7046     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7047               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7048          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7049               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7050        ) {
7051         if (float128_is_signaling_nan(a, status)
7052          || float128_is_signaling_nan(b, status)) {
7053             float_raise(float_flag_invalid, status);
7054         }
7055         return 1;
7056     }
7057     return 0;
7058 }
7059
7060 /* misc functions */
7061 float32 uint32_to_float32(uint32_t a, float_status *status)
7062 {
7063     return int64_to_float32(a, status);
7064 }
7065
7066 float64 uint32_to_float64(uint32_t a, float_status *status)
7067 {
7068     return int64_to_float64(a, status);
7069 }
7070
7071 uint32_t float32_to_uint32(float32 a, float_status *status)
7072 {
7073     int64_t v;
7074     uint32_t res;
7075     int old_exc_flags = get_float_exception_flags(status);
7076
7077     v = float32_to_int64(a, status);
7078     if (v < 0) {
7079         res = 0;
7080     } else if (v > 0xffffffff) {
7081         res = 0xffffffff;
7082     } else {
7083         return v;
7084     }
7085     set_float_exception_flags(old_exc_flags, status);
7086     float_raise(float_flag_invalid, status);
7087     return res;
7088 }
7089
7090 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7091 {
7092     int64_t v;
7093     uint32_t res;
7094     int old_exc_flags = get_float_exception_flags(status);
7095
7096     v = float32_to_int64_round_to_zero(a, status);
7097     if (v < 0) {
7098         res = 0;
7099     } else if (v > 0xffffffff) {
7100         res = 0xffffffff;
7101     } else {
7102         return v;
7103     }
7104     set_float_exception_flags(old_exc_flags, status);
7105     float_raise(float_flag_invalid, status);
7106     return res;
7107 }
7108
7109 int16_t float32_to_int16(float32 a, float_status *status)
7110 {
7111     int32_t v;
7112     int16_t res;
7113     int old_exc_flags = get_float_exception_flags(status);
7114
7115     v = float32_to_int32(a, status);
7116     if (v < -0x8000) {
7117         res = -0x8000;
7118     } else if (v > 0x7fff) {
7119         res = 0x7fff;
7120     } else {
7121         return v;
7122     }
7123
7124     set_float_exception_flags(old_exc_flags, status);
7125     float_raise(float_flag_invalid, status);
7126     return res;
7127 }
7128
7129 uint16_t float32_to_uint16(float32 a, float_status *status)
7130 {
7131     int32_t v;
7132     uint16_t res;
7133     int old_exc_flags = get_float_exception_flags(status);
7134
7135     v = float32_to_int32(a, status);
7136     if (v < 0) {
7137         res = 0;
7138     } else if (v > 0xffff) {
7139         res = 0xffff;
7140     } else {
7141         return v;
7142     }
7143
7144     set_float_exception_flags(old_exc_flags, status);
7145     float_raise(float_flag_invalid, status);
7146     return res;
7147 }
7148
7149 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7150 {
7151     int64_t v;
7152     uint16_t res;
7153     int old_exc_flags = get_float_exception_flags(status);
7154
7155     v = float32_to_int64_round_to_zero(a, status);
7156     if (v < 0) {
7157         res = 0;
7158     } else if (v > 0xffff) {
7159         res = 0xffff;
7160     } else {
7161         return v;
7162     }
7163     set_float_exception_flags(old_exc_flags, status);
7164     float_raise(float_flag_invalid, status);
7165     return res;
7166 }
7167
7168 uint32_t float64_to_uint32(float64 a, float_status *status)
7169 {
7170     uint64_t v;
7171     uint32_t res;
7172     int old_exc_flags = get_float_exception_flags(status);
7173
7174     v = float64_to_uint64(a, status);
7175     if (v > 0xffffffff) {
7176         res = 0xffffffff;
7177     } else {
7178         return v;
7179     }
7180     set_float_exception_flags(old_exc_flags, status);
7181     float_raise(float_flag_invalid, status);
7182     return res;
7183 }
7184
7185 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7186 {
7187     uint64_t v;
7188     uint32_t res;
7189     int old_exc_flags = get_float_exception_flags(status);
7190
7191     v = float64_to_uint64_round_to_zero(a, status);
7192     if (v > 0xffffffff) {
7193         res = 0xffffffff;
7194     } else {
7195         return v;
7196     }
7197     set_float_exception_flags(old_exc_flags, status);
7198     float_raise(float_flag_invalid, status);
7199     return res;
7200 }
7201
7202 int16_t float64_to_int16(float64 a, float_status *status)
7203 {
7204     int64_t v;
7205     int16_t res;
7206     int old_exc_flags = get_float_exception_flags(status);
7207
7208     v = float64_to_int32(a, status);
7209     if (v < -0x8000) {
7210         res = -0x8000;
7211     } else if (v > 0x7fff) {
7212         res = 0x7fff;
7213     } else {
7214         return v;
7215     }
7216
7217     set_float_exception_flags(old_exc_flags, status);
7218     float_raise(float_flag_invalid, status);
7219     return res;
7220 }
7221
7222 uint16_t float64_to_uint16(float64 a, float_status *status)
7223 {
7224     int64_t v;
7225     uint16_t res;
7226     int old_exc_flags = get_float_exception_flags(status);
7227
7228     v = float64_to_int32(a, status);
7229     if (v < 0) {
7230         res = 0;
7231     } else if (v > 0xffff) {
7232         res = 0xffff;
7233     } else {
7234         return v;
7235     }
7236
7237     set_float_exception_flags(old_exc_flags, status);
7238     float_raise(float_flag_invalid, status);
7239     return res;
7240 }
7241
7242 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7243 {
7244     int64_t v;
7245     uint16_t res;
7246     int old_exc_flags = get_float_exception_flags(status);
7247
7248     v = float64_to_int64_round_to_zero(a, status);
7249     if (v < 0) {
7250         res = 0;
7251     } else if (v > 0xffff) {
7252         res = 0xffff;
7253     } else {
7254         return v;
7255     }
7256     set_float_exception_flags(old_exc_flags, status);
7257     float_raise(float_flag_invalid, status);
7258     return res;
7259 }
7260
7261 /*----------------------------------------------------------------------------
7262 | Returns the result of converting the double-precision floating-point value
7263 | `a' to the 64-bit unsigned integer format.  The conversion is
7264 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7265 | Arithmetic---which means in particular that the conversion is rounded
7266 | according to the current rounding mode.  If `a' is a NaN, the largest
7267 | positive integer is returned.  If the conversion overflows, the
7268 | largest unsigned integer is returned.  If 'a' is negative, the value is
7269 | rounded and zero is returned; negative values that do not round to zero
7270 | will raise the inexact exception.
7271 *----------------------------------------------------------------------------*/
7272
7273 uint64_t float64_to_uint64(float64 a, float_status *status)
7274 {
7275     flag aSign;
7276     int aExp;
7277     int shiftCount;
7278     uint64_t aSig, aSigExtra;
7279     a = float64_squash_input_denormal(a, status);
7280
7281     aSig = extractFloat64Frac(a);
7282     aExp = extractFloat64Exp(a);
7283     aSign = extractFloat64Sign(a);
7284     if (aSign && (aExp > 1022)) {
7285         float_raise(float_flag_invalid, status);
7286         if (float64_is_any_nan(a)) {
7287             return LIT64(0xFFFFFFFFFFFFFFFF);
7288         } else {
7289             return 0;
7290         }
7291     }
7292     if (aExp) {
7293         aSig |= LIT64(0x0010000000000000);
7294     }
7295     shiftCount = 0x433 - aExp;
7296     if (shiftCount <= 0) {
7297         if (0x43E < aExp) {
7298             float_raise(float_flag_invalid, status);
7299             return LIT64(0xFFFFFFFFFFFFFFFF);
7300         }
7301         aSigExtra = 0;
7302         aSig <<= -shiftCount;
7303     } else {
7304         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7305     }
7306     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7307 }
7308
7309 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7310 {
7311     signed char current_rounding_mode = status->float_rounding_mode;
7312     set_float_rounding_mode(float_round_to_zero, status);
7313     int64_t v = float64_to_uint64(a, status);
7314     set_float_rounding_mode(current_rounding_mode, status);
7315     return v;
7316 }
7317
7318 #define COMPARE(s, nan_exp)                                                  \
7319 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7320                                       int is_quiet, float_status *status)    \
7321 {                                                                            \
7322     flag aSign, bSign;                                                       \
7323     uint ## s ## _t av, bv;                                                  \
7324     a = float ## s ## _squash_input_denormal(a, status);                     \
7325     b = float ## s ## _squash_input_denormal(b, status);                     \
7326                                                                              \
7327     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7328          extractFloat ## s ## Frac( a ) ) ||                                 \
7329         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7330           extractFloat ## s ## Frac( b ) )) {                                \
7331         if (!is_quiet ||                                                     \
7332             float ## s ## _is_signaling_nan(a, status) ||                  \
7333             float ## s ## _is_signaling_nan(b, status)) {                 \
7334             float_raise(float_flag_invalid, status);                         \
7335         }                                                                    \
7336         return float_relation_unordered;                                     \
7337     }                                                                        \
7338     aSign = extractFloat ## s ## Sign( a );                                  \
7339     bSign = extractFloat ## s ## Sign( b );                                  \
7340     av = float ## s ## _val(a);                                              \
7341     bv = float ## s ## _val(b);                                              \
7342     if ( aSign != bSign ) {                                                  \
7343         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7344             /* zero case */                                                  \
7345             return float_relation_equal;                                     \
7346         } else {                                                             \
7347             return 1 - (2 * aSign);                                          \
7348         }                                                                    \
7349     } else {                                                                 \
7350         if (av == bv) {                                                      \
7351             return float_relation_equal;                                     \
7352         } else {                                                             \
7353             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7354         }                                                                    \
7355     }                                                                        \
7356 }                                                                            \
7357                                                                              \
7358 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7359 {                                                                            \
7360     return float ## s ## _compare_internal(a, b, 0, status);                 \
7361 }                                                                            \
7362                                                                              \
7363 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7364                                  float_status *status)                       \
7365 {                                                                            \
7366     return float ## s ## _compare_internal(a, b, 1, status);                 \
7367 }
7368
7369 COMPARE(32, 0xff)
7370 COMPARE(64, 0x7ff)
7371
7372 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7373                                             int is_quiet, float_status *status)
7374 {
7375     flag aSign, bSign;
7376
7377     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7378           ( extractFloatx80Frac( a )<<1 ) ) ||
7379         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7380           ( extractFloatx80Frac( b )<<1 ) )) {
7381         if (!is_quiet ||
7382             floatx80_is_signaling_nan(a, status) ||
7383             floatx80_is_signaling_nan(b, status)) {
7384             float_raise(float_flag_invalid, status);
7385         }
7386         return float_relation_unordered;
7387     }
7388     aSign = extractFloatx80Sign( a );
7389     bSign = extractFloatx80Sign( b );
7390     if ( aSign != bSign ) {
7391
7392         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7393              ( ( a.low | b.low ) == 0 ) ) {
7394             /* zero case */
7395             return float_relation_equal;
7396         } else {
7397             return 1 - (2 * aSign);
7398         }
7399     } else {
7400         if (a.low == b.low && a.high == b.high) {
7401             return float_relation_equal;
7402         } else {
7403             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7404         }
7405     }
7406 }
7407
7408 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7409 {
7410     return floatx80_compare_internal(a, b, 0, status);
7411 }
7412
7413 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7414 {
7415     return floatx80_compare_internal(a, b, 1, status);
7416 }
7417
7418 static inline int float128_compare_internal(float128 a, float128 b,
7419                                             int is_quiet, float_status *status)
7420 {
7421     flag aSign, bSign;
7422
7423     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7424           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7425         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7426           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7427         if (!is_quiet ||
7428             float128_is_signaling_nan(a, status) ||
7429             float128_is_signaling_nan(b, status)) {
7430             float_raise(float_flag_invalid, status);
7431         }
7432         return float_relation_unordered;
7433     }
7434     aSign = extractFloat128Sign( a );
7435     bSign = extractFloat128Sign( b );
7436     if ( aSign != bSign ) {
7437         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7438             /* zero case */
7439             return float_relation_equal;
7440         } else {
7441             return 1 - (2 * aSign);
7442         }
7443     } else {
7444         if (a.low == b.low && a.high == b.high) {
7445             return float_relation_equal;
7446         } else {
7447             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7448         }
7449     }
7450 }
7451
7452 int float128_compare(float128 a, float128 b, float_status *status)
7453 {
7454     return float128_compare_internal(a, b, 0, status);
7455 }
7456
7457 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7458 {
7459     return float128_compare_internal(a, b, 1, status);
7460 }
7461
7462 /* min() and max() functions. These can't be implemented as
7463  * 'compare and pick one input' because that would mishandle
7464  * NaNs and +0 vs -0.
7465  *
7466  * minnum() and maxnum() functions. These are similar to the min()
7467  * and max() functions but if one of the arguments is a QNaN and
7468  * the other is numerical then the numerical argument is returned.
7469  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7470  * and maxNum() operations. min() and max() are the typical min/max
7471  * semantics provided by many CPUs which predate that specification.
7472  *
7473  * minnummag() and maxnummag() functions correspond to minNumMag()
7474  * and minNumMag() from the IEEE-754 2008.
7475  */
7476 #define MINMAX(s)                                                       \
7477 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7478                                                int ismin, int isieee,   \
7479                                                int ismag,               \
7480                                                float_status *status)    \
7481 {                                                                       \
7482     flag aSign, bSign;                                                  \
7483     uint ## s ## _t av, bv, aav, abv;                                   \
7484     a = float ## s ## _squash_input_denormal(a, status);                \
7485     b = float ## s ## _squash_input_denormal(b, status);                \
7486     if (float ## s ## _is_any_nan(a) ||                                 \
7487         float ## s ## _is_any_nan(b)) {                                 \
7488         if (isieee) {                                                   \
7489             if (float ## s ## _is_quiet_nan(a, status) &&               \
7490                 !float ## s ##_is_any_nan(b)) {                         \
7491                 return b;                                               \
7492             } else if (float ## s ## _is_quiet_nan(b, status) &&        \
7493                        !float ## s ## _is_any_nan(a)) {                \
7494                 return a;                                               \
7495             }                                                           \
7496         }                                                               \
7497         return propagateFloat ## s ## NaN(a, b, status);                \
7498     }                                                                   \
7499     aSign = extractFloat ## s ## Sign(a);                               \
7500     bSign = extractFloat ## s ## Sign(b);                               \
7501     av = float ## s ## _val(a);                                         \
7502     bv = float ## s ## _val(b);                                         \
7503     if (ismag) {                                                        \
7504         aav = float ## s ## _abs(av);                                   \
7505         abv = float ## s ## _abs(bv);                                   \
7506         if (aav != abv) {                                               \
7507             if (ismin) {                                                \
7508                 return (aav < abv) ? a : b;                             \
7509             } else {                                                    \
7510                 return (aav < abv) ? b : a;                             \
7511             }                                                           \
7512         }                                                               \
7513     }                                                                   \
7514     if (aSign != bSign) {                                               \
7515         if (ismin) {                                                    \
7516             return aSign ? a : b;                                       \
7517         } else {                                                        \
7518             return aSign ? b : a;                                       \
7519         }                                                               \
7520     } else {                                                            \
7521         if (ismin) {                                                    \
7522             return (aSign ^ (av < bv)) ? a : b;                         \
7523         } else {                                                        \
7524             return (aSign ^ (av < bv)) ? b : a;                         \
7525         }                                                               \
7526     }                                                                   \
7527 }                                                                       \
7528                                                                         \
7529 float ## s float ## s ## _min(float ## s a, float ## s b,               \
7530                               float_status *status)                     \
7531 {                                                                       \
7532     return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7533 }                                                                       \
7534                                                                         \
7535 float ## s float ## s ## _max(float ## s a, float ## s b,               \
7536                               float_status *status)                     \
7537 {                                                                       \
7538     return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7539 }                                                                       \
7540                                                                         \
7541 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7542                                  float_status *status)                  \
7543 {                                                                       \
7544     return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7545 }                                                                       \
7546                                                                         \
7547 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7548                                  float_status *status)                  \
7549 {                                                                       \
7550     return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7551 }                                                                       \
7552                                                                         \
7553 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7554                                     float_status *status)               \
7555 {                                                                       \
7556     return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7557 }                                                                       \
7558                                                                         \
7559 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7560                                     float_status *status)               \
7561 {                                                                       \
7562     return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7563 }
7564
7565 MINMAX(32)
7566 MINMAX(64)
7567
7568
7569 /* Multiply A by 2 raised to the power N.  */
7570 float32 float32_scalbn(float32 a, int n, float_status *status)
7571 {
7572     flag aSign;
7573     int16_t aExp;
7574     uint32_t aSig;
7575
7576     a = float32_squash_input_denormal(a, status);
7577     aSig = extractFloat32Frac( a );
7578     aExp = extractFloat32Exp( a );
7579     aSign = extractFloat32Sign( a );
7580
7581     if ( aExp == 0xFF ) {
7582         if ( aSig ) {
7583             return propagateFloat32NaN(a, a, status);
7584         }
7585         return a;
7586     }
7587     if (aExp != 0) {
7588         aSig |= 0x00800000;
7589     } else if (aSig == 0) {
7590         return a;
7591     } else {
7592         aExp++;
7593     }
7594
7595     if (n > 0x200) {
7596         n = 0x200;
7597     } else if (n < -0x200) {
7598         n = -0x200;
7599     }
7600
7601     aExp += n - 1;
7602     aSig <<= 7;
7603     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7604 }
7605
7606 float64 float64_scalbn(float64 a, int n, float_status *status)
7607 {
7608     flag aSign;
7609     int16_t aExp;
7610     uint64_t aSig;
7611
7612     a = float64_squash_input_denormal(a, status);
7613     aSig = extractFloat64Frac( a );
7614     aExp = extractFloat64Exp( a );
7615     aSign = extractFloat64Sign( a );
7616
7617     if ( aExp == 0x7FF ) {
7618         if ( aSig ) {
7619             return propagateFloat64NaN(a, a, status);
7620         }
7621         return a;
7622     }
7623     if (aExp != 0) {
7624         aSig |= LIT64( 0x0010000000000000 );
7625     } else if (aSig == 0) {
7626         return a;
7627     } else {
7628         aExp++;
7629     }
7630
7631     if (n > 0x1000) {
7632         n = 0x1000;
7633     } else if (n < -0x1000) {
7634         n = -0x1000;
7635     }
7636
7637     aExp += n - 1;
7638     aSig <<= 10;
7639     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7640 }
7641
7642 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7643 {
7644     flag aSign;
7645     int32_t aExp;
7646     uint64_t aSig;
7647
7648     aSig = extractFloatx80Frac( a );
7649     aExp = extractFloatx80Exp( a );
7650     aSign = extractFloatx80Sign( a );
7651
7652     if ( aExp == 0x7FFF ) {
7653         if ( aSig<<1 ) {
7654             return propagateFloatx80NaN(a, a, status);
7655         }
7656         return a;
7657     }
7658
7659     if (aExp == 0) {
7660         if (aSig == 0) {
7661             return a;
7662         }
7663         aExp++;
7664     }
7665
7666     if (n > 0x10000) {
7667         n = 0x10000;
7668     } else if (n < -0x10000) {
7669         n = -0x10000;
7670     }
7671
7672     aExp += n;
7673     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7674                                          aSign, aExp, aSig, 0, status);
7675 }
7676
7677 float128 float128_scalbn(float128 a, int n, float_status *status)
7678 {
7679     flag aSign;
7680     int32_t aExp;
7681     uint64_t aSig0, aSig1;
7682
7683     aSig1 = extractFloat128Frac1( a );
7684     aSig0 = extractFloat128Frac0( a );
7685     aExp = extractFloat128Exp( a );
7686     aSign = extractFloat128Sign( a );
7687     if ( aExp == 0x7FFF ) {
7688         if ( aSig0 | aSig1 ) {
7689             return propagateFloat128NaN(a, a, status);
7690         }
7691         return a;
7692     }
7693     if (aExp != 0) {
7694         aSig0 |= LIT64( 0x0001000000000000 );
7695     } else if (aSig0 == 0 && aSig1 == 0) {
7696         return a;
7697     } else {
7698         aExp++;
7699     }
7700
7701     if (n > 0x10000) {
7702         n = 0x10000;
7703     } else if (n < -0x10000) {
7704         n = -0x10000;
7705     }
7706
7707     aExp += n - 1;
7708     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7709                                          , status);
7710
7711 }