fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "config.h"
  86
  87 #include "fpu/softfloat.h"
  88
  89 /* We only need stdlib for abort() */
  90 #include <stdlib.h>
  91
  92 /*----------------------------------------------------------------------------
  93 | Primitive arithmetic functions, including multi-word arithmetic, and
  94 | division and square root approximations.  (Can be specialized to target if
  95 | desired.)
  96 *----------------------------------------------------------------------------*/
  97 #include "softfloat-macros.h"
  98
  99 /*----------------------------------------------------------------------------
 100 | Functions and definitions to determine:  (1) whether tininess for underflow
 101 | is detected before or after rounding by default, (2) what (if anything)
 102 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 103 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 104 | are propagated from function inputs to output.  These details are target-
 105 | specific.
 106 *----------------------------------------------------------------------------*/
 107 #include "softfloat-specialize.h"
 108
 109 /*----------------------------------------------------------------------------
 110 | Returns the fraction bits of the half-precision floating-point value `a'.
 111 *----------------------------------------------------------------------------*/
 112
 113 static inline uint32_t extractFloat16Frac(float16 a)
 114 {
 115     return float16_val(a) & 0x3ff;
 116 }
 117
 118 /*----------------------------------------------------------------------------
 119 | Returns the exponent bits of the half-precision floating-point value `a'.
 120 *----------------------------------------------------------------------------*/
 121
 122 static inline int_fast16_t extractFloat16Exp(float16 a)
 123 {
 124     return (float16_val(a) >> 10) & 0x1f;
 125 }
 126
 127 /*----------------------------------------------------------------------------
 128 | Returns the sign bit of the single-precision floating-point value `a'.
 129 *----------------------------------------------------------------------------*/
 130
 131 static inline flag extractFloat16Sign(float16 a)
 132 {
 133     return float16_val(a)>>15;
 134 }
 135
 136 /*----------------------------------------------------------------------------
 137 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 138 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 139 | input.  If `zSign' is 1, the input is negated before being converted to an
 140 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 141 | is simply rounded to an integer, with the inexact exception raised if the
 142 | input cannot be represented exactly as an integer.  However, if the fixed-
 143 | point input is too large, the invalid exception is raised and the largest
 144 | positive or negative integer is returned.
 145 *----------------------------------------------------------------------------*/
 146
 147 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
 148 {
 149     int8_t roundingMode;
 150     flag roundNearestEven;
 151     int8_t roundIncrement, roundBits;
 152     int32_t z;
 153
 154     roundingMode = status->float_rounding_mode;
 155     roundNearestEven = ( roundingMode == float_round_nearest_even );
 156     switch (roundingMode) {
 157     case float_round_nearest_even:
 158     case float_round_ties_away:
 159         roundIncrement = 0x40;
 160         break;
 161     case float_round_to_zero:
 162         roundIncrement = 0;
 163         break;
 164     case float_round_up:
 165         roundIncrement = zSign ? 0 : 0x7f;
 166         break;
 167     case float_round_down:
 168         roundIncrement = zSign ? 0x7f : 0;
 169         break;
 170     default:
 171         abort();
 172     }
 173     roundBits = absZ & 0x7F;
 174     absZ = ( absZ + roundIncrement )>>7;
 175     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 176     z = absZ;
 177     if ( zSign ) z = - z;
 178     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 179         float_raise(float_flag_invalid, status);
 180         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 181     }
 182     if (roundBits) {
 183         status->float_exception_flags |= float_flag_inexact;
 184     }
 185     return z;
 186
 187 }
 188
 189 /*----------------------------------------------------------------------------
 190 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 191 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 192 | and returns the properly rounded 64-bit integer corresponding to the input.
 193 | If `zSign' is 1, the input is negated before being converted to an integer.
 194 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 195 | the inexact exception raised if the input cannot be represented exactly as
 196 | an integer.  However, if the fixed-point input is too large, the invalid
 197 | exception is raised and the largest positive or negative integer is
 198 | returned.
 199 *----------------------------------------------------------------------------*/
 200
 201 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
 202                                float_status *status)
 203 {
 204     int8_t roundingMode;
 205     flag roundNearestEven, increment;
 206     int64_t z;
 207
 208     roundingMode = status->float_rounding_mode;
 209     roundNearestEven = ( roundingMode == float_round_nearest_even );
 210     switch (roundingMode) {
 211     case float_round_nearest_even:
 212     case float_round_ties_away:
 213         increment = ((int64_t) absZ1 < 0);
 214         break;
 215     case float_round_to_zero:
 216         increment = 0;
 217         break;
 218     case float_round_up:
 219         increment = !zSign && absZ1;
 220         break;
 221     case float_round_down:
 222         increment = zSign && absZ1;
 223         break;
 224     default:
 225         abort();
 226     }
 227     if ( increment ) {
 228         ++absZ0;
 229         if ( absZ0 == 0 ) goto overflow;
 230         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 231     }
 232     z = absZ0;
 233     if ( zSign ) z = - z;
 234     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 235  overflow:
 236         float_raise(float_flag_invalid, status);
 237         return
 238               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 239             : LIT64( 0x7FFFFFFFFFFFFFFF );
 240     }
 241     if (absZ1) {
 242         status->float_exception_flags |= float_flag_inexact;
 243     }
 244     return z;
 245
 246 }
 247
 248 /*----------------------------------------------------------------------------
 249 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 250 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 251 | and returns the properly rounded 64-bit unsigned integer corresponding to the
 252 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
 253 | with the inexact exception raised if the input cannot be represented exactly
 254 | as an integer.  However, if the fixed-point input is too large, the invalid
 255 | exception is raised and the largest unsigned integer is returned.
 256 *----------------------------------------------------------------------------*/
 257
 258 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
 259                                 uint64_t absZ1, float_status *status)
 260 {
 261     int8_t roundingMode;
 262     flag roundNearestEven, increment;
 263
 264     roundingMode = status->float_rounding_mode;
 265     roundNearestEven = (roundingMode == float_round_nearest_even);
 266     switch (roundingMode) {
 267     case float_round_nearest_even:
 268     case float_round_ties_away:
 269         increment = ((int64_t)absZ1 < 0);
 270         break;
 271     case float_round_to_zero:
 272         increment = 0;
 273         break;
 274     case float_round_up:
 275         increment = !zSign && absZ1;
 276         break;
 277     case float_round_down:
 278         increment = zSign && absZ1;
 279         break;
 280     default:
 281         abort();
 282     }
 283     if (increment) {
 284         ++absZ0;
 285         if (absZ0 == 0) {
 286             float_raise(float_flag_invalid, status);
 287             return LIT64(0xFFFFFFFFFFFFFFFF);
 288         }
 289         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
 290     }
 291
 292     if (zSign && absZ0) {
 293         float_raise(float_flag_invalid, status);
 294         return 0;
 295     }
 296
 297     if (absZ1) {
 298         status->float_exception_flags |= float_flag_inexact;
 299     }
 300     return absZ0;
 301 }
 302
 303 /*----------------------------------------------------------------------------
 304 | Returns the fraction bits of the single-precision floating-point value `a'.
 305 *----------------------------------------------------------------------------*/
 306
 307 static inline uint32_t extractFloat32Frac( float32 a )
 308 {
 309
 310     return float32_val(a) & 0x007FFFFF;
 311
 312 }
 313
 314 /*----------------------------------------------------------------------------
 315 | Returns the exponent bits of the single-precision floating-point value `a'.
 316 *----------------------------------------------------------------------------*/
 317
 318 static inline int_fast16_t extractFloat32Exp(float32 a)
 319 {
 320
 321     return ( float32_val(a)>>23 ) & 0xFF;
 322
 323 }
 324
 325 /*----------------------------------------------------------------------------
 326 | Returns the sign bit of the single-precision floating-point value `a'.
 327 *----------------------------------------------------------------------------*/
 328
 329 static inline flag extractFloat32Sign( float32 a )
 330 {
 331
 332     return float32_val(a)>>31;
 333
 334 }
 335
 336 /*----------------------------------------------------------------------------
 337 | If `a' is denormal and we are in flush-to-zero mode then set the
 338 | input-denormal exception and return zero. Otherwise just return the value.
 339 *----------------------------------------------------------------------------*/
 340 float32 float32_squash_input_denormal(float32 a, float_status *status)
 341 {
 342     if (status->flush_inputs_to_zero) {
 343         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 344             float_raise(float_flag_input_denormal, status);
 345             return make_float32(float32_val(a) & 0x80000000);
 346         }
 347     }
 348     return a;
 349 }
 350
 351 /*----------------------------------------------------------------------------
 352 | Normalizes the subnormal single-precision floating-point value represented
 353 | by the denormalized significand `aSig'.  The normalized exponent and
 354 | significand are stored at the locations pointed to by `zExpPtr' and
 355 | `zSigPtr', respectively.
 356 *----------------------------------------------------------------------------*/
 357
 358 static void
 359  normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
 360 {
 361     int8_t shiftCount;
 362
 363     shiftCount = countLeadingZeros32( aSig ) - 8;
 364     *zSigPtr = aSig<<shiftCount;
 365     *zExpPtr = 1 - shiftCount;
 366
 367 }
 368
 369 /*----------------------------------------------------------------------------
 370 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 371 | single-precision floating-point value, returning the result.  After being
 372 | shifted into the proper positions, the three fields are simply added
 373 | together to form the result.  This means that any integer portion of `zSig'
 374 | will be added into the exponent.  Since a properly normalized significand
 375 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 376 | than the desired result exponent whenever `zSig' is a complete, normalized
 377 | significand.
 378 *----------------------------------------------------------------------------*/
 379
 380 static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
 381 {
 382
 383     return make_float32(
 384           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 385
 386 }
 387
 388 /*----------------------------------------------------------------------------
 389 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 390 | and significand `zSig', and returns the proper single-precision floating-
 391 | point value corresponding to the abstract input.  Ordinarily, the abstract
 392 | value is simply rounded and packed into the single-precision format, with
 393 | the inexact exception raised if the abstract input cannot be represented
 394 | exactly.  However, if the abstract value is too large, the overflow and
 395 | inexact exceptions are raised and an infinity or maximal finite value is
 396 | returned.  If the abstract value is too small, the input value is rounded to
 397 | a subnormal number, and the underflow and inexact exceptions are raised if
 398 | the abstract input cannot be represented exactly as a subnormal single-
 399 | precision floating-point number.
 400 |     The input significand `zSig' has its binary point between bits 30
 401 | and 29, which is 7 bits to the left of the usual location.  This shifted
 402 | significand must be normalized or smaller.  If `zSig' is not normalized,
 403 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 404 | and it must not require rounding.  In the usual case that `zSig' is
 405 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 406 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 407 | Binary Floating-Point Arithmetic.
 408 *----------------------------------------------------------------------------*/
 409
 410 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
 411                                    float_status *status)
 412 {
 413     int8_t roundingMode;
 414     flag roundNearestEven;
 415     int8_t roundIncrement, roundBits;
 416     flag isTiny;
 417
 418     roundingMode = status->float_rounding_mode;
 419     roundNearestEven = ( roundingMode == float_round_nearest_even );
 420     switch (roundingMode) {
 421     case float_round_nearest_even:
 422     case float_round_ties_away:
 423         roundIncrement = 0x40;
 424         break;
 425     case float_round_to_zero:
 426         roundIncrement = 0;
 427         break;
 428     case float_round_up:
 429         roundIncrement = zSign ? 0 : 0x7f;
 430         break;
 431     case float_round_down:
 432         roundIncrement = zSign ? 0x7f : 0;
 433         break;
 434     default:
 435         abort();
 436         break;
 437     }
 438     roundBits = zSig & 0x7F;
 439     if ( 0xFD <= (uint16_t) zExp ) {
 440         if (    ( 0xFD < zExp )
 441              || (    ( zExp == 0xFD )
 442                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 443            ) {
 444             float_raise(float_flag_overflow | float_flag_inexact, status);
 445             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 446         }
 447         if ( zExp < 0 ) {
 448             if (status->flush_to_zero) {
 449                 float_raise(float_flag_output_denormal, status);
 450                 return packFloat32(zSign, 0, 0);
 451             }
 452             isTiny =
 453                 (status->float_detect_tininess
 454                  == float_tininess_before_rounding)
 455                 || ( zExp < -1 )
 456                 || ( zSig + roundIncrement < 0x80000000 );
 457             shift32RightJamming( zSig, - zExp, &zSig );
 458             zExp = 0;
 459             roundBits = zSig & 0x7F;
 460             if (isTiny && roundBits) {
 461                 float_raise(float_flag_underflow, status);
 462             }
 463         }
 464     }
 465     if (roundBits) {
 466         status->float_exception_flags |= float_flag_inexact;
 467     }
 468     zSig = ( zSig + roundIncrement )>>7;
 469     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 470     if ( zSig == 0 ) zExp = 0;
 471     return packFloat32( zSign, zExp, zSig );
 472
 473 }
 474
 475 /*----------------------------------------------------------------------------
 476 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 477 | and significand `zSig', and returns the proper single-precision floating-
 478 | point value corresponding to the abstract input.  This routine is just like
 479 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 480 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 481 | floating-point exponent.
 482 *----------------------------------------------------------------------------*/
 483
 484 static float32
 485  normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
 486                               float_status *status)
 487 {
 488     int8_t shiftCount;
 489
 490     shiftCount = countLeadingZeros32( zSig ) - 1;
 491     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
 492                                status);
 493
 494 }
 495
 496 /*----------------------------------------------------------------------------
 497 | Returns the fraction bits of the double-precision floating-point value `a'.
 498 *----------------------------------------------------------------------------*/
 499
 500 static inline uint64_t extractFloat64Frac( float64 a )
 501 {
 502
 503     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 504
 505 }
 506
 507 /*----------------------------------------------------------------------------
 508 | Returns the exponent bits of the double-precision floating-point value `a'.
 509 *----------------------------------------------------------------------------*/
 510
 511 static inline int_fast16_t extractFloat64Exp(float64 a)
 512 {
 513
 514     return ( float64_val(a)>>52 ) & 0x7FF;
 515
 516 }
 517
 518 /*----------------------------------------------------------------------------
 519 | Returns the sign bit of the double-precision floating-point value `a'.
 520 *----------------------------------------------------------------------------*/
 521
 522 static inline flag extractFloat64Sign( float64 a )
 523 {
 524
 525     return float64_val(a)>>63;
 526
 527 }
 528
 529 /*----------------------------------------------------------------------------
 530 | If `a' is denormal and we are in flush-to-zero mode then set the
 531 | input-denormal exception and return zero. Otherwise just return the value.
 532 *----------------------------------------------------------------------------*/
 533 float64 float64_squash_input_denormal(float64 a, float_status *status)
 534 {
 535     if (status->flush_inputs_to_zero) {
 536         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 537             float_raise(float_flag_input_denormal, status);
 538             return make_float64(float64_val(a) & (1ULL << 63));
 539         }
 540     }
 541     return a;
 542 }
 543
 544 /*----------------------------------------------------------------------------
 545 | Normalizes the subnormal double-precision floating-point value represented
 546 | by the denormalized significand `aSig'.  The normalized exponent and
 547 | significand are stored at the locations pointed to by `zExpPtr' and
 548 | `zSigPtr', respectively.
 549 *----------------------------------------------------------------------------*/
 550
 551 static void
 552  normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
 553 {
 554     int8_t shiftCount;
 555
 556     shiftCount = countLeadingZeros64( aSig ) - 11;
 557     *zSigPtr = aSig<<shiftCount;
 558     *zExpPtr = 1 - shiftCount;
 559
 560 }
 561
 562 /*----------------------------------------------------------------------------
 563 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 564 | double-precision floating-point value, returning the result.  After being
 565 | shifted into the proper positions, the three fields are simply added
 566 | together to form the result.  This means that any integer portion of `zSig'
 567 | will be added into the exponent.  Since a properly normalized significand
 568 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 569 | than the desired result exponent whenever `zSig' is a complete, normalized
 570 | significand.
 571 *----------------------------------------------------------------------------*/
 572
 573 static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
 574 {
 575
 576     return make_float64(
 577         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 578
 579 }
 580
 581 /*----------------------------------------------------------------------------
 582 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 583 | and significand `zSig', and returns the proper double-precision floating-
 584 | point value corresponding to the abstract input.  Ordinarily, the abstract
 585 | value is simply rounded and packed into the double-precision format, with
 586 | the inexact exception raised if the abstract input cannot be represented
 587 | exactly.  However, if the abstract value is too large, the overflow and
 588 | inexact exceptions are raised and an infinity or maximal finite value is
 589 | returned.  If the abstract value is too small, the input value is rounded to
 590 | a subnormal number, and the underflow and inexact exceptions are raised if
 591 | the abstract input cannot be represented exactly as a subnormal double-
 592 | precision floating-point number.
 593 |     The input significand `zSig' has its binary point between bits 62
 594 | and 61, which is 10 bits to the left of the usual location.  This shifted
 595 | significand must be normalized or smaller.  If `zSig' is not normalized,
 596 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 597 | and it must not require rounding.  In the usual case that `zSig' is
 598 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 599 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 600 | Binary Floating-Point Arithmetic.
 601 *----------------------------------------------------------------------------*/
 602
 603 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
 604                                    float_status *status)
 605 {
 606     int8_t roundingMode;
 607     flag roundNearestEven;
 608     int_fast16_t roundIncrement, roundBits;
 609     flag isTiny;
 610
 611     roundingMode = status->float_rounding_mode;
 612     roundNearestEven = ( roundingMode == float_round_nearest_even );
 613     switch (roundingMode) {
 614     case float_round_nearest_even:
 615     case float_round_ties_away:
 616         roundIncrement = 0x200;
 617         break;
 618     case float_round_to_zero:
 619         roundIncrement = 0;
 620         break;
 621     case float_round_up:
 622         roundIncrement = zSign ? 0 : 0x3ff;
 623         break;
 624     case float_round_down:
 625         roundIncrement = zSign ? 0x3ff : 0;
 626         break;
 627     default:
 628         abort();
 629     }
 630     roundBits = zSig & 0x3FF;
 631     if ( 0x7FD <= (uint16_t) zExp ) {
 632         if (    ( 0x7FD < zExp )
 633              || (    ( zExp == 0x7FD )
 634                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 635            ) {
 636             float_raise(float_flag_overflow | float_flag_inexact, status);
 637             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 638         }
 639         if ( zExp < 0 ) {
 640             if (status->flush_to_zero) {
 641                 float_raise(float_flag_output_denormal, status);
 642                 return packFloat64(zSign, 0, 0);
 643             }
 644             isTiny =
 645                    (status->float_detect_tininess
 646                     == float_tininess_before_rounding)
 647                 || ( zExp < -1 )
 648                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 649             shift64RightJamming( zSig, - zExp, &zSig );
 650             zExp = 0;
 651             roundBits = zSig & 0x3FF;
 652             if (isTiny && roundBits) {
 653                 float_raise(float_flag_underflow, status);
 654             }
 655         }
 656     }
 657     if (roundBits) {
 658         status->float_exception_flags |= float_flag_inexact;
 659     }
 660     zSig = ( zSig + roundIncrement )>>10;
 661     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 662     if ( zSig == 0 ) zExp = 0;
 663     return packFloat64( zSign, zExp, zSig );
 664
 665 }
 666
 667 /*----------------------------------------------------------------------------
 668 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 669 | and significand `zSig', and returns the proper double-precision floating-
 670 | point value corresponding to the abstract input.  This routine is just like
 671 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 672 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 673 | floating-point exponent.
 674 *----------------------------------------------------------------------------*/
 675
 676 static float64
 677  normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
 678                               float_status *status)
 679 {
 680     int8_t shiftCount;
 681
 682     shiftCount = countLeadingZeros64( zSig ) - 1;
 683     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
 684                                status);
 685
 686 }
 687
 688 /*----------------------------------------------------------------------------
 689 | Returns the fraction bits of the extended double-precision floating-point
 690 | value `a'.
 691 *----------------------------------------------------------------------------*/
 692
 693 static inline uint64_t extractFloatx80Frac( floatx80 a )
 694 {
 695
 696     return a.low;
 697
 698 }
 699
 700 /*----------------------------------------------------------------------------
 701 | Returns the exponent bits of the extended double-precision floating-point
 702 | value `a'.
 703 *----------------------------------------------------------------------------*/
 704
 705 static inline int32_t extractFloatx80Exp( floatx80 a )
 706 {
 707
 708     return a.high & 0x7FFF;
 709
 710 }
 711
 712 /*----------------------------------------------------------------------------
 713 | Returns the sign bit of the extended double-precision floating-point value
 714 | `a'.
 715 *----------------------------------------------------------------------------*/
 716
 717 static inline flag extractFloatx80Sign( floatx80 a )
 718 {
 719
 720     return a.high>>15;
 721
 722 }
 723
 724 /*----------------------------------------------------------------------------
 725 | Normalizes the subnormal extended double-precision floating-point value
 726 | represented by the denormalized significand `aSig'.  The normalized exponent
 727 | and significand are stored at the locations pointed to by `zExpPtr' and
 728 | `zSigPtr', respectively.
 729 *----------------------------------------------------------------------------*/
 730
 731 static void
 732  normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
 733 {
 734     int8_t shiftCount;
 735
 736     shiftCount = countLeadingZeros64( aSig );
 737     *zSigPtr = aSig<<shiftCount;
 738     *zExpPtr = 1 - shiftCount;
 739
 740 }
 741
 742 /*----------------------------------------------------------------------------
 743 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 744 | extended double-precision floating-point value, returning the result.
 745 *----------------------------------------------------------------------------*/
 746
 747 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
 748 {
 749     floatx80 z;
 750
 751     z.low = zSig;
 752     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 753     return z;
 754
 755 }
 756
 757 /*----------------------------------------------------------------------------
 758 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 759 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 760 | and returns the proper extended double-precision floating-point value
 761 | corresponding to the abstract input.  Ordinarily, the abstract value is
 762 | rounded and packed into the extended double-precision format, with the
 763 | inexact exception raised if the abstract input cannot be represented
 764 | exactly.  However, if the abstract value is too large, the overflow and
 765 | inexact exceptions are raised and an infinity or maximal finite value is
 766 | returned.  If the abstract value is too small, the input value is rounded to
 767 | a subnormal number, and the underflow and inexact exceptions are raised if
 768 | the abstract input cannot be represented exactly as a subnormal extended
 769 | double-precision floating-point number.
 770 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 771 | number of bits as single or double precision, respectively.  Otherwise, the
 772 | result is rounded to the full precision of the extended double-precision
 773 | format.
 774 |     The input significand must be normalized or smaller.  If the input
 775 | significand is not normalized, `zExp' must be 0; in that case, the result
 776 | returned is a subnormal number, and it must not require rounding.  The
 777 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 778 | Floating-Point Arithmetic.
 779 *----------------------------------------------------------------------------*/
 780
 781 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
 782                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
 783                                      float_status *status)
 784 {
 785     int8_t roundingMode;
 786     flag roundNearestEven, increment, isTiny;
 787     int64_t roundIncrement, roundMask, roundBits;
 788
 789     roundingMode = status->float_rounding_mode;
 790     roundNearestEven = ( roundingMode == float_round_nearest_even );
 791     if ( roundingPrecision == 80 ) goto precision80;
 792     if ( roundingPrecision == 64 ) {
 793         roundIncrement = LIT64( 0x0000000000000400 );
 794         roundMask = LIT64( 0x00000000000007FF );
 795     }
 796     else if ( roundingPrecision == 32 ) {
 797         roundIncrement = LIT64( 0x0000008000000000 );
 798         roundMask = LIT64( 0x000000FFFFFFFFFF );
 799     }
 800     else {
 801         goto precision80;
 802     }
 803     zSig0 |= ( zSig1 != 0 );
 804     switch (roundingMode) {
 805     case float_round_nearest_even:
 806     case float_round_ties_away:
 807         break;
 808     case float_round_to_zero:
 809         roundIncrement = 0;
 810         break;
 811     case float_round_up:
 812         roundIncrement = zSign ? 0 : roundMask;
 813         break;
 814     case float_round_down:
 815         roundIncrement = zSign ? roundMask : 0;
 816         break;
 817     default:
 818         abort();
 819     }
 820     roundBits = zSig0 & roundMask;
 821     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 822         if (    ( 0x7FFE < zExp )
 823              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 824            ) {
 825             goto overflow;
 826         }
 827         if ( zExp <= 0 ) {
 828             if (status->flush_to_zero) {
 829                 float_raise(float_flag_output_denormal, status);
 830                 return packFloatx80(zSign, 0, 0);
 831             }
 832             isTiny =
 833                    (status->float_detect_tininess
 834                     == float_tininess_before_rounding)
 835                 || ( zExp < 0 )
 836                 || ( zSig0 <= zSig0 + roundIncrement );
 837             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 838             zExp = 0;
 839             roundBits = zSig0 & roundMask;
 840             if (isTiny && roundBits) {
 841                 float_raise(float_flag_underflow, status);
 842             }
 843             if (roundBits) {
 844                 status->float_exception_flags |= float_flag_inexact;
 845             }
 846             zSig0 += roundIncrement;
 847             if ( (int64_t) zSig0 < 0 ) zExp = 1;
 848             roundIncrement = roundMask + 1;
 849             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 850                 roundMask |= roundIncrement;
 851             }
 852             zSig0 &= ~ roundMask;
 853             return packFloatx80( zSign, zExp, zSig0 );
 854         }
 855     }
 856     if (roundBits) {
 857         status->float_exception_flags |= float_flag_inexact;
 858     }
 859     zSig0 += roundIncrement;
 860     if ( zSig0 < roundIncrement ) {
 861         ++zExp;
 862         zSig0 = LIT64( 0x8000000000000000 );
 863     }
 864     roundIncrement = roundMask + 1;
 865     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 866         roundMask |= roundIncrement;
 867     }
 868     zSig0 &= ~ roundMask;
 869     if ( zSig0 == 0 ) zExp = 0;
 870     return packFloatx80( zSign, zExp, zSig0 );
 871  precision80:
 872     switch (roundingMode) {
 873     case float_round_nearest_even:
 874     case float_round_ties_away:
 875         increment = ((int64_t)zSig1 < 0);
 876         break;
 877     case float_round_to_zero:
 878         increment = 0;
 879         break;
 880     case float_round_up:
 881         increment = !zSign && zSig1;
 882         break;
 883     case float_round_down:
 884         increment = zSign && zSig1;
 885         break;
 886     default:
 887         abort();
 888     }
 889     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 890         if (    ( 0x7FFE < zExp )
 891              || (    ( zExp == 0x7FFE )
 892                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 893                   && increment
 894                 )
 895            ) {
 896             roundMask = 0;
 897  overflow:
 898             float_raise(float_flag_overflow | float_flag_inexact, status);
 899             if (    ( roundingMode == float_round_to_zero )
 900                  || ( zSign && ( roundingMode == float_round_up ) )
 901                  || ( ! zSign && ( roundingMode == float_round_down ) )
 902                ) {
 903                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 904             }
 905             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 906         }
 907         if ( zExp <= 0 ) {
 908             isTiny =
 909                    (status->float_detect_tininess
 910                     == float_tininess_before_rounding)
 911                 || ( zExp < 0 )
 912                 || ! increment
 913                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 914             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 915             zExp = 0;
 916             if (isTiny && zSig1) {
 917                 float_raise(float_flag_underflow, status);
 918             }
 919             if (zSig1) {
 920                 status->float_exception_flags |= float_flag_inexact;
 921             }
 922             switch (roundingMode) {
 923             case float_round_nearest_even:
 924             case float_round_ties_away:
 925                 increment = ((int64_t)zSig1 < 0);
 926                 break;
 927             case float_round_to_zero:
 928                 increment = 0;
 929                 break;
 930             case float_round_up:
 931                 increment = !zSign && zSig1;
 932                 break;
 933             case float_round_down:
 934                 increment = zSign && zSig1;
 935                 break;
 936             default:
 937                 abort();
 938             }
 939             if ( increment ) {
 940                 ++zSig0;
 941                 zSig0 &=
 942                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 943                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
 944             }
 945             return packFloatx80( zSign, zExp, zSig0 );
 946         }
 947     }
 948     if (zSig1) {
 949         status->float_exception_flags |= float_flag_inexact;
 950     }
 951     if ( increment ) {
 952         ++zSig0;
 953         if ( zSig0 == 0 ) {
 954             ++zExp;
 955             zSig0 = LIT64( 0x8000000000000000 );
 956         }
 957         else {
 958             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 959         }
 960     }
 961     else {
 962         if ( zSig0 == 0 ) zExp = 0;
 963     }
 964     return packFloatx80( zSign, zExp, zSig0 );
 965
 966 }
 967
 968 /*----------------------------------------------------------------------------
 969 | Takes an abstract floating-point value having sign `zSign', exponent
 970 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 971 | and returns the proper extended double-precision floating-point value
 972 | corresponding to the abstract input.  This routine is just like
 973 | `roundAndPackFloatx80' except that the input significand does not have to be
 974 | normalized.
 975 *----------------------------------------------------------------------------*/
 976
 977 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
 978                                               flag zSign, int32_t zExp,
 979                                               uint64_t zSig0, uint64_t zSig1,
 980                                               float_status *status)
 981 {
 982     int8_t shiftCount;
 983
 984     if ( zSig0 == 0 ) {
 985         zSig0 = zSig1;
 986         zSig1 = 0;
 987         zExp -= 64;
 988     }
 989     shiftCount = countLeadingZeros64( zSig0 );
 990     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 991     zExp -= shiftCount;
 992     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
 993                                 zSig0, zSig1, status);
 994
 995 }
 996
 997 /*----------------------------------------------------------------------------
 998 | Returns the least-significant 64 fraction bits of the quadruple-precision
 999 | floating-point value `a'.
1000 *----------------------------------------------------------------------------*/
1001
1002 static inline uint64_t extractFloat128Frac1( float128 a )
1003 {
1004
1005     return a.low;
1006
1007 }
1008
1009 /*----------------------------------------------------------------------------
1010 | Returns the most-significant 48 fraction bits of the quadruple-precision
1011 | floating-point value `a'.
1012 *----------------------------------------------------------------------------*/
1013
1014 static inline uint64_t extractFloat128Frac0( float128 a )
1015 {
1016
1017     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1018
1019 }
1020
1021 /*----------------------------------------------------------------------------
1022 | Returns the exponent bits of the quadruple-precision floating-point value
1023 | `a'.
1024 *----------------------------------------------------------------------------*/
1025
1026 static inline int32_t extractFloat128Exp( float128 a )
1027 {
1028
1029     return ( a.high>>48 ) & 0x7FFF;
1030
1031 }
1032
1033 /*----------------------------------------------------------------------------
1034 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1035 *----------------------------------------------------------------------------*/
1036
1037 static inline flag extractFloat128Sign( float128 a )
1038 {
1039
1040     return a.high>>63;
1041
1042 }
1043
1044 /*----------------------------------------------------------------------------
1045 | Normalizes the subnormal quadruple-precision floating-point value
1046 | represented by the denormalized significand formed by the concatenation of
1047 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
1048 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
1049 | significand are stored at the location pointed to by `zSig0Ptr', and the
1050 | least significant 64 bits of the normalized significand are stored at the
1051 | location pointed to by `zSig1Ptr'.
1052 *----------------------------------------------------------------------------*/
1053
1054 static void
1055  normalizeFloat128Subnormal(
1056      uint64_t aSig0,
1057      uint64_t aSig1,
1058      int32_t *zExpPtr,
1059      uint64_t *zSig0Ptr,
1060      uint64_t *zSig1Ptr
1061  )
1062 {
1063     int8_t shiftCount;
1064
1065     if ( aSig0 == 0 ) {
1066         shiftCount = countLeadingZeros64( aSig1 ) - 15;
1067         if ( shiftCount < 0 ) {
1068             *zSig0Ptr = aSig1>>( - shiftCount );
1069             *zSig1Ptr = aSig1<<( shiftCount & 63 );
1070         }
1071         else {
1072             *zSig0Ptr = aSig1<<shiftCount;
1073             *zSig1Ptr = 0;
1074         }
1075         *zExpPtr = - shiftCount - 63;
1076     }
1077     else {
1078         shiftCount = countLeadingZeros64( aSig0 ) - 15;
1079         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1080         *zExpPtr = 1 - shiftCount;
1081     }
1082
1083 }
1084
1085 /*----------------------------------------------------------------------------
1086 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1087 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1088 | floating-point value, returning the result.  After being shifted into the
1089 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1090 | added together to form the most significant 32 bits of the result.  This
1091 | means that any integer portion of `zSig0' will be added into the exponent.
1092 | Since a properly normalized significand will have an integer portion equal
1093 | to 1, the `zExp' input should be 1 less than the desired result exponent
1094 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1095 | significand.
1096 *----------------------------------------------------------------------------*/
1097
1098 static inline float128
1099  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1100 {
1101     float128 z;
1102
1103     z.low = zSig1;
1104     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1105     return z;
1106
1107 }
1108
1109 /*----------------------------------------------------------------------------
1110 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1111 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1112 | and `zSig2', and returns the proper quadruple-precision floating-point value
1113 | corresponding to the abstract input.  Ordinarily, the abstract value is
1114 | simply rounded and packed into the quadruple-precision format, with the
1115 | inexact exception raised if the abstract input cannot be represented
1116 | exactly.  However, if the abstract value is too large, the overflow and
1117 | inexact exceptions are raised and an infinity or maximal finite value is
1118 | returned.  If the abstract value is too small, the input value is rounded to
1119 | a subnormal number, and the underflow and inexact exceptions are raised if
1120 | the abstract input cannot be represented exactly as a subnormal quadruple-
1121 | precision floating-point number.
1122 |     The input significand must be normalized or smaller.  If the input
1123 | significand is not normalized, `zExp' must be 0; in that case, the result
1124 | returned is a subnormal number, and it must not require rounding.  In the
1125 | usual case that the input significand is normalized, `zExp' must be 1 less
1126 | than the ``true'' floating-point exponent.  The handling of underflow and
1127 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1128 *----------------------------------------------------------------------------*/
1129
1130 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1131                                      uint64_t zSig0, uint64_t zSig1,
1132                                      uint64_t zSig2, float_status *status)
1133 {
1134     int8_t roundingMode;
1135     flag roundNearestEven, increment, isTiny;
1136
1137     roundingMode = status->float_rounding_mode;
1138     roundNearestEven = ( roundingMode == float_round_nearest_even );
1139     switch (roundingMode) {
1140     case float_round_nearest_even:
1141     case float_round_ties_away:
1142         increment = ((int64_t)zSig2 < 0);
1143         break;
1144     case float_round_to_zero:
1145         increment = 0;
1146         break;
1147     case float_round_up:
1148         increment = !zSign && zSig2;
1149         break;
1150     case float_round_down:
1151         increment = zSign && zSig2;
1152         break;
1153     default:
1154         abort();
1155     }
1156     if ( 0x7FFD <= (uint32_t) zExp ) {
1157         if (    ( 0x7FFD < zExp )
1158              || (    ( zExp == 0x7FFD )
1159                   && eq128(
1160                          LIT64( 0x0001FFFFFFFFFFFF ),
1161                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1162                          zSig0,
1163                          zSig1
1164                      )
1165                   && increment
1166                 )
1167            ) {
1168             float_raise(float_flag_overflow | float_flag_inexact, status);
1169             if (    ( roundingMode == float_round_to_zero )
1170                  || ( zSign && ( roundingMode == float_round_up ) )
1171                  || ( ! zSign && ( roundingMode == float_round_down ) )
1172                ) {
1173                 return
1174                     packFloat128(
1175                         zSign,
1176                         0x7FFE,
1177                         LIT64( 0x0000FFFFFFFFFFFF ),
1178                         LIT64( 0xFFFFFFFFFFFFFFFF )
1179                     );
1180             }
1181             return packFloat128( zSign, 0x7FFF, 0, 0 );
1182         }
1183         if ( zExp < 0 ) {
1184             if (status->flush_to_zero) {
1185                 float_raise(float_flag_output_denormal, status);
1186                 return packFloat128(zSign, 0, 0, 0);
1187             }
1188             isTiny =
1189                    (status->float_detect_tininess
1190                     == float_tininess_before_rounding)
1191                 || ( zExp < -1 )
1192                 || ! increment
1193                 || lt128(
1194                        zSig0,
1195                        zSig1,
1196                        LIT64( 0x0001FFFFFFFFFFFF ),
1197                        LIT64( 0xFFFFFFFFFFFFFFFF )
1198                    );
1199             shift128ExtraRightJamming(
1200                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1201             zExp = 0;
1202             if (isTiny && zSig2) {
1203                 float_raise(float_flag_underflow, status);
1204             }
1205             switch (roundingMode) {
1206             case float_round_nearest_even:
1207             case float_round_ties_away:
1208                 increment = ((int64_t)zSig2 < 0);
1209                 break;
1210             case float_round_to_zero:
1211                 increment = 0;
1212                 break;
1213             case float_round_up:
1214                 increment = !zSign && zSig2;
1215                 break;
1216             case float_round_down:
1217                 increment = zSign && zSig2;
1218                 break;
1219             default:
1220                 abort();
1221             }
1222         }
1223     }
1224     if (zSig2) {
1225         status->float_exception_flags |= float_flag_inexact;
1226     }
1227     if ( increment ) {
1228         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1229         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1230     }
1231     else {
1232         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1233     }
1234     return packFloat128( zSign, zExp, zSig0, zSig1 );
1235
1236 }
1237
1238 /*----------------------------------------------------------------------------
1239 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1240 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1241 | returns the proper quadruple-precision floating-point value corresponding
1242 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1243 | except that the input significand has fewer bits and does not have to be
1244 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1245 | point exponent.
1246 *----------------------------------------------------------------------------*/
1247
1248 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1249                                               uint64_t zSig0, uint64_t zSig1,
1250                                               float_status *status)
1251 {
1252     int8_t shiftCount;
1253     uint64_t zSig2;
1254
1255     if ( zSig0 == 0 ) {
1256         zSig0 = zSig1;
1257         zSig1 = 0;
1258         zExp -= 64;
1259     }
1260     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1261     if ( 0 <= shiftCount ) {
1262         zSig2 = 0;
1263         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1264     }
1265     else {
1266         shift128ExtraRightJamming(
1267             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1268     }
1269     zExp -= shiftCount;
1270     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1271
1272 }
1273
1274 /*----------------------------------------------------------------------------
1275 | Returns the result of converting the 32-bit two's complement integer `a'
1276 | to the single-precision floating-point format.  The conversion is performed
1277 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1278 *----------------------------------------------------------------------------*/
1279
1280 float32 int32_to_float32(int32_t a, float_status *status)
1281 {
1282     flag zSign;
1283
1284     if ( a == 0 ) return float32_zero;
1285     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1286     zSign = ( a < 0 );
1287     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1288 }
1289
1290 /*----------------------------------------------------------------------------
1291 | Returns the result of converting the 32-bit two's complement integer `a'
1292 | to the double-precision floating-point format.  The conversion is performed
1293 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1294 *----------------------------------------------------------------------------*/
1295
1296 float64 int32_to_float64(int32_t a, float_status *status)
1297 {
1298     flag zSign;
1299     uint32_t absA;
1300     int8_t shiftCount;
1301     uint64_t zSig;
1302
1303     if ( a == 0 ) return float64_zero;
1304     zSign = ( a < 0 );
1305     absA = zSign ? - a : a;
1306     shiftCount = countLeadingZeros32( absA ) + 21;
1307     zSig = absA;
1308     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1309
1310 }
1311
1312 /*----------------------------------------------------------------------------
1313 | Returns the result of converting the 32-bit two's complement integer `a'
1314 | to the extended double-precision floating-point format.  The conversion
1315 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1316 | Arithmetic.
1317 *----------------------------------------------------------------------------*/
1318
1319 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1320 {
1321     flag zSign;
1322     uint32_t absA;
1323     int8_t shiftCount;
1324     uint64_t zSig;
1325
1326     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1327     zSign = ( a < 0 );
1328     absA = zSign ? - a : a;
1329     shiftCount = countLeadingZeros32( absA ) + 32;
1330     zSig = absA;
1331     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1332
1333 }
1334
1335 /*----------------------------------------------------------------------------
1336 | Returns the result of converting the 32-bit two's complement integer `a' to
1337 | the quadruple-precision floating-point format.  The conversion is performed
1338 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1339 *----------------------------------------------------------------------------*/
1340
1341 float128 int32_to_float128(int32_t a, float_status *status)
1342 {
1343     flag zSign;
1344     uint32_t absA;
1345     int8_t shiftCount;
1346     uint64_t zSig0;
1347
1348     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1349     zSign = ( a < 0 );
1350     absA = zSign ? - a : a;
1351     shiftCount = countLeadingZeros32( absA ) + 17;
1352     zSig0 = absA;
1353     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1354
1355 }
1356
1357 /*----------------------------------------------------------------------------
1358 | Returns the result of converting the 64-bit two's complement integer `a'
1359 | to the single-precision floating-point format.  The conversion is performed
1360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1361 *----------------------------------------------------------------------------*/
1362
1363 float32 int64_to_float32(int64_t a, float_status *status)
1364 {
1365     flag zSign;
1366     uint64_t absA;
1367     int8_t shiftCount;
1368
1369     if ( a == 0 ) return float32_zero;
1370     zSign = ( a < 0 );
1371     absA = zSign ? - a : a;
1372     shiftCount = countLeadingZeros64( absA ) - 40;
1373     if ( 0 <= shiftCount ) {
1374         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1375     }
1376     else {
1377         shiftCount += 7;
1378         if ( shiftCount < 0 ) {
1379             shift64RightJamming( absA, - shiftCount, &absA );
1380         }
1381         else {
1382             absA <<= shiftCount;
1383         }
1384         return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1385     }
1386
1387 }
1388
1389 /*----------------------------------------------------------------------------
1390 | Returns the result of converting the 64-bit two's complement integer `a'
1391 | to the double-precision floating-point format.  The conversion is performed
1392 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1393 *----------------------------------------------------------------------------*/
1394
1395 float64 int64_to_float64(int64_t a, float_status *status)
1396 {
1397     flag zSign;
1398
1399     if ( a == 0 ) return float64_zero;
1400     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1401         return packFloat64( 1, 0x43E, 0 );
1402     }
1403     zSign = ( a < 0 );
1404     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1405 }
1406
1407 /*----------------------------------------------------------------------------
1408 | Returns the result of converting the 64-bit two's complement integer `a'
1409 | to the extended double-precision floating-point format.  The conversion
1410 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1411 | Arithmetic.
1412 *----------------------------------------------------------------------------*/
1413
1414 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1415 {
1416     flag zSign;
1417     uint64_t absA;
1418     int8_t shiftCount;
1419
1420     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1421     zSign = ( a < 0 );
1422     absA = zSign ? - a : a;
1423     shiftCount = countLeadingZeros64( absA );
1424     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1425
1426 }
1427
1428 /*----------------------------------------------------------------------------
1429 | Returns the result of converting the 64-bit two's complement integer `a' to
1430 | the quadruple-precision floating-point format.  The conversion is performed
1431 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1432 *----------------------------------------------------------------------------*/
1433
1434 float128 int64_to_float128(int64_t a, float_status *status)
1435 {
1436     flag zSign;
1437     uint64_t absA;
1438     int8_t shiftCount;
1439     int32_t zExp;
1440     uint64_t zSig0, zSig1;
1441
1442     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1443     zSign = ( a < 0 );
1444     absA = zSign ? - a : a;
1445     shiftCount = countLeadingZeros64( absA ) + 49;
1446     zExp = 0x406E - shiftCount;
1447     if ( 64 <= shiftCount ) {
1448         zSig1 = 0;
1449         zSig0 = absA;
1450         shiftCount -= 64;
1451     }
1452     else {
1453         zSig1 = absA;
1454         zSig0 = 0;
1455     }
1456     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1457     return packFloat128( zSign, zExp, zSig0, zSig1 );
1458
1459 }
1460
1461 /*----------------------------------------------------------------------------
1462 | Returns the result of converting the 64-bit unsigned integer `a'
1463 | to the single-precision floating-point format.  The conversion is performed
1464 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1465 *----------------------------------------------------------------------------*/
1466
1467 float32 uint64_to_float32(uint64_t a, float_status *status)
1468 {
1469     int shiftcount;
1470
1471     if (a == 0) {
1472         return float32_zero;
1473     }
1474
1475     /* Determine (left) shift needed to put first set bit into bit posn 23
1476      * (since packFloat32() expects the binary point between bits 23 and 22);
1477      * this is the fast case for smallish numbers.
1478      */
1479     shiftcount = countLeadingZeros64(a) - 40;
1480     if (shiftcount >= 0) {
1481         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1482     }
1483     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1484      * expects the binary point between bits 30 and 29, hence the + 7.
1485      */
1486     shiftcount += 7;
1487     if (shiftcount < 0) {
1488         shift64RightJamming(a, -shiftcount, &a);
1489     } else {
1490         a <<= shiftcount;
1491     }
1492
1493     return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1494 }
1495
1496 /*----------------------------------------------------------------------------
1497 | Returns the result of converting the 64-bit unsigned integer `a'
1498 | to the double-precision floating-point format.  The conversion is performed
1499 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1500 *----------------------------------------------------------------------------*/
1501
1502 float64 uint64_to_float64(uint64_t a, float_status *status)
1503 {
1504     int exp = 0x43C;
1505     int shiftcount;
1506
1507     if (a == 0) {
1508         return float64_zero;
1509     }
1510
1511     shiftcount = countLeadingZeros64(a) - 1;
1512     if (shiftcount < 0) {
1513         shift64RightJamming(a, -shiftcount, &a);
1514     } else {
1515         a <<= shiftcount;
1516     }
1517     return roundAndPackFloat64(0, exp - shiftcount, a, status);
1518 }
1519
1520 /*----------------------------------------------------------------------------
1521 | Returns the result of converting the 64-bit unsigned integer `a'
1522 | to the quadruple-precision floating-point format.  The conversion is performed
1523 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1524 *----------------------------------------------------------------------------*/
1525
1526 float128 uint64_to_float128(uint64_t a, float_status *status)
1527 {
1528     if (a == 0) {
1529         return float128_zero;
1530     }
1531     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1532 }
1533
1534 /*----------------------------------------------------------------------------
1535 | Returns the result of converting the single-precision floating-point value
1536 | `a' to the 32-bit two's complement integer format.  The conversion is
1537 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1538 | Arithmetic---which means in particular that the conversion is rounded
1539 | according to the current rounding mode.  If `a' is a NaN, the largest
1540 | positive integer is returned.  Otherwise, if the conversion overflows, the
1541 | largest integer with the same sign as `a' is returned.
1542 *----------------------------------------------------------------------------*/
1543
1544 int32_t float32_to_int32(float32 a, float_status *status)
1545 {
1546     flag aSign;
1547     int_fast16_t aExp, shiftCount;
1548     uint32_t aSig;
1549     uint64_t aSig64;
1550
1551     a = float32_squash_input_denormal(a, status);
1552     aSig = extractFloat32Frac( a );
1553     aExp = extractFloat32Exp( a );
1554     aSign = extractFloat32Sign( a );
1555     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1556     if ( aExp ) aSig |= 0x00800000;
1557     shiftCount = 0xAF - aExp;
1558     aSig64 = aSig;
1559     aSig64 <<= 32;
1560     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1561     return roundAndPackInt32(aSign, aSig64, status);
1562
1563 }
1564
1565 /*----------------------------------------------------------------------------
1566 | Returns the result of converting the single-precision floating-point value
1567 | `a' to the 32-bit two's complement integer format.  The conversion is
1568 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1569 | Arithmetic, except that the conversion is always rounded toward zero.
1570 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1571 | the conversion overflows, the largest integer with the same sign as `a' is
1572 | returned.
1573 *----------------------------------------------------------------------------*/
1574
1575 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1576 {
1577     flag aSign;
1578     int_fast16_t aExp, shiftCount;
1579     uint32_t aSig;
1580     int32_t z;
1581     a = float32_squash_input_denormal(a, status);
1582
1583     aSig = extractFloat32Frac( a );
1584     aExp = extractFloat32Exp( a );
1585     aSign = extractFloat32Sign( a );
1586     shiftCount = aExp - 0x9E;
1587     if ( 0 <= shiftCount ) {
1588         if ( float32_val(a) != 0xCF000000 ) {
1589             float_raise(float_flag_invalid, status);
1590             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1591         }
1592         return (int32_t) 0x80000000;
1593     }
1594     else if ( aExp <= 0x7E ) {
1595         if (aExp | aSig) {
1596             status->float_exception_flags |= float_flag_inexact;
1597         }
1598         return 0;
1599     }
1600     aSig = ( aSig | 0x00800000 )<<8;
1601     z = aSig>>( - shiftCount );
1602     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1603         status->float_exception_flags |= float_flag_inexact;
1604     }
1605     if ( aSign ) z = - z;
1606     return z;
1607
1608 }
1609
1610 /*----------------------------------------------------------------------------
1611 | Returns the result of converting the single-precision floating-point value
1612 | `a' to the 16-bit two's complement integer format.  The conversion is
1613 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1614 | Arithmetic, except that the conversion is always rounded toward zero.
1615 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1616 | the conversion overflows, the largest integer with the same sign as `a' is
1617 | returned.
1618 *----------------------------------------------------------------------------*/
1619
1620 int_fast16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1621 {
1622     flag aSign;
1623     int_fast16_t aExp, shiftCount;
1624     uint32_t aSig;
1625     int32_t z;
1626
1627     aSig = extractFloat32Frac( a );
1628     aExp = extractFloat32Exp( a );
1629     aSign = extractFloat32Sign( a );
1630     shiftCount = aExp - 0x8E;
1631     if ( 0 <= shiftCount ) {
1632         if ( float32_val(a) != 0xC7000000 ) {
1633             float_raise(float_flag_invalid, status);
1634             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1635                 return 0x7FFF;
1636             }
1637         }
1638         return (int32_t) 0xffff8000;
1639     }
1640     else if ( aExp <= 0x7E ) {
1641         if ( aExp | aSig ) {
1642             status->float_exception_flags |= float_flag_inexact;
1643         }
1644         return 0;
1645     }
1646     shiftCount -= 0x10;
1647     aSig = ( aSig | 0x00800000 )<<8;
1648     z = aSig>>( - shiftCount );
1649     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1650         status->float_exception_flags |= float_flag_inexact;
1651     }
1652     if ( aSign ) {
1653         z = - z;
1654     }
1655     return z;
1656
1657 }
1658
1659 /*----------------------------------------------------------------------------
1660 | Returns the result of converting the single-precision floating-point value
1661 | `a' to the 64-bit two's complement integer format.  The conversion is
1662 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1663 | Arithmetic---which means in particular that the conversion is rounded
1664 | according to the current rounding mode.  If `a' is a NaN, the largest
1665 | positive integer is returned.  Otherwise, if the conversion overflows, the
1666 | largest integer with the same sign as `a' is returned.
1667 *----------------------------------------------------------------------------*/
1668
1669 int64_t float32_to_int64(float32 a, float_status *status)
1670 {
1671     flag aSign;
1672     int_fast16_t aExp, shiftCount;
1673     uint32_t aSig;
1674     uint64_t aSig64, aSigExtra;
1675     a = float32_squash_input_denormal(a, status);
1676
1677     aSig = extractFloat32Frac( a );
1678     aExp = extractFloat32Exp( a );
1679     aSign = extractFloat32Sign( a );
1680     shiftCount = 0xBE - aExp;
1681     if ( shiftCount < 0 ) {
1682         float_raise(float_flag_invalid, status);
1683         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1684             return LIT64( 0x7FFFFFFFFFFFFFFF );
1685         }
1686         return (int64_t) LIT64( 0x8000000000000000 );
1687     }
1688     if ( aExp ) aSig |= 0x00800000;
1689     aSig64 = aSig;
1690     aSig64 <<= 40;
1691     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1692     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1693
1694 }
1695
1696 /*----------------------------------------------------------------------------
1697 | Returns the result of converting the single-precision floating-point value
1698 | `a' to the 64-bit unsigned integer format.  The conversion is
1699 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1700 | Arithmetic---which means in particular that the conversion is rounded
1701 | according to the current rounding mode.  If `a' is a NaN, the largest
1702 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1703 | largest unsigned integer is returned.  If the 'a' is negative, the result
1704 | is rounded and zero is returned; values that do not round to zero will
1705 | raise the inexact exception flag.
1706 *----------------------------------------------------------------------------*/
1707
1708 uint64_t float32_to_uint64(float32 a, float_status *status)
1709 {
1710     flag aSign;
1711     int_fast16_t aExp, shiftCount;
1712     uint32_t aSig;
1713     uint64_t aSig64, aSigExtra;
1714     a = float32_squash_input_denormal(a, status);
1715
1716     aSig = extractFloat32Frac(a);
1717     aExp = extractFloat32Exp(a);
1718     aSign = extractFloat32Sign(a);
1719     if ((aSign) && (aExp > 126)) {
1720         float_raise(float_flag_invalid, status);
1721         if (float32_is_any_nan(a)) {
1722             return LIT64(0xFFFFFFFFFFFFFFFF);
1723         } else {
1724             return 0;
1725         }
1726     }
1727     shiftCount = 0xBE - aExp;
1728     if (aExp) {
1729         aSig |= 0x00800000;
1730     }
1731     if (shiftCount < 0) {
1732         float_raise(float_flag_invalid, status);
1733         return LIT64(0xFFFFFFFFFFFFFFFF);
1734     }
1735
1736     aSig64 = aSig;
1737     aSig64 <<= 40;
1738     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1739     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1740 }
1741
1742 /*----------------------------------------------------------------------------
1743 | Returns the result of converting the single-precision floating-point value
1744 | `a' to the 64-bit unsigned integer format.  The conversion is
1745 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1746 | Arithmetic, except that the conversion is always rounded toward zero.  If
1747 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1748 | conversion overflows, the largest unsigned integer is returned.  If the
1749 | 'a' is negative, the result is rounded and zero is returned; values that do
1750 | not round to zero will raise the inexact flag.
1751 *----------------------------------------------------------------------------*/
1752
1753 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1754 {
1755     signed char current_rounding_mode = status->float_rounding_mode;
1756     set_float_rounding_mode(float_round_to_zero, status);
1757     int64_t v = float32_to_uint64(a, status);
1758     set_float_rounding_mode(current_rounding_mode, status);
1759     return v;
1760 }
1761
1762 /*----------------------------------------------------------------------------
1763 | Returns the result of converting the single-precision floating-point value
1764 | `a' to the 64-bit two's complement integer format.  The conversion is
1765 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1766 | Arithmetic, except that the conversion is always rounded toward zero.  If
1767 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1768 | conversion overflows, the largest integer with the same sign as `a' is
1769 | returned.
1770 *----------------------------------------------------------------------------*/
1771
1772 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1773 {
1774     flag aSign;
1775     int_fast16_t aExp, shiftCount;
1776     uint32_t aSig;
1777     uint64_t aSig64;
1778     int64_t z;
1779     a = float32_squash_input_denormal(a, status);
1780
1781     aSig = extractFloat32Frac( a );
1782     aExp = extractFloat32Exp( a );
1783     aSign = extractFloat32Sign( a );
1784     shiftCount = aExp - 0xBE;
1785     if ( 0 <= shiftCount ) {
1786         if ( float32_val(a) != 0xDF000000 ) {
1787             float_raise(float_flag_invalid, status);
1788             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1789                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1790             }
1791         }
1792         return (int64_t) LIT64( 0x8000000000000000 );
1793     }
1794     else if ( aExp <= 0x7E ) {
1795         if (aExp | aSig) {
1796             status->float_exception_flags |= float_flag_inexact;
1797         }
1798         return 0;
1799     }
1800     aSig64 = aSig | 0x00800000;
1801     aSig64 <<= 40;
1802     z = aSig64>>( - shiftCount );
1803     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1804         status->float_exception_flags |= float_flag_inexact;
1805     }
1806     if ( aSign ) z = - z;
1807     return z;
1808
1809 }
1810
1811 /*----------------------------------------------------------------------------
1812 | Returns the result of converting the single-precision floating-point value
1813 | `a' to the double-precision floating-point format.  The conversion is
1814 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1815 | Arithmetic.
1816 *----------------------------------------------------------------------------*/
1817
1818 float64 float32_to_float64(float32 a, float_status *status)
1819 {
1820     flag aSign;
1821     int_fast16_t aExp;
1822     uint32_t aSig;
1823     a = float32_squash_input_denormal(a, status);
1824
1825     aSig = extractFloat32Frac( a );
1826     aExp = extractFloat32Exp( a );
1827     aSign = extractFloat32Sign( a );
1828     if ( aExp == 0xFF ) {
1829         if (aSig) {
1830             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1831         }
1832         return packFloat64( aSign, 0x7FF, 0 );
1833     }
1834     if ( aExp == 0 ) {
1835         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1836         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1837         --aExp;
1838     }
1839     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1840
1841 }
1842
1843 /*----------------------------------------------------------------------------
1844 | Returns the result of converting the single-precision floating-point value
1845 | `a' to the extended double-precision floating-point format.  The conversion
1846 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1847 | Arithmetic.
1848 *----------------------------------------------------------------------------*/
1849
1850 floatx80 float32_to_floatx80(float32 a, float_status *status)
1851 {
1852     flag aSign;
1853     int_fast16_t aExp;
1854     uint32_t aSig;
1855
1856     a = float32_squash_input_denormal(a, status);
1857     aSig = extractFloat32Frac( a );
1858     aExp = extractFloat32Exp( a );
1859     aSign = extractFloat32Sign( a );
1860     if ( aExp == 0xFF ) {
1861         if (aSig) {
1862             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1863         }
1864         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1865     }
1866     if ( aExp == 0 ) {
1867         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1868         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1869     }
1870     aSig |= 0x00800000;
1871     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1872
1873 }
1874
1875 /*----------------------------------------------------------------------------
1876 | Returns the result of converting the single-precision floating-point value
1877 | `a' to the double-precision floating-point format.  The conversion is
1878 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1879 | Arithmetic.
1880 *----------------------------------------------------------------------------*/
1881
1882 float128 float32_to_float128(float32 a, float_status *status)
1883 {
1884     flag aSign;
1885     int_fast16_t aExp;
1886     uint32_t aSig;
1887
1888     a = float32_squash_input_denormal(a, status);
1889     aSig = extractFloat32Frac( a );
1890     aExp = extractFloat32Exp( a );
1891     aSign = extractFloat32Sign( a );
1892     if ( aExp == 0xFF ) {
1893         if (aSig) {
1894             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1895         }
1896         return packFloat128( aSign, 0x7FFF, 0, 0 );
1897     }
1898     if ( aExp == 0 ) {
1899         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1900         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1901         --aExp;
1902     }
1903     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1904
1905 }
1906
1907 /*----------------------------------------------------------------------------
1908 | Rounds the single-precision floating-point value `a' to an integer, and
1909 | returns the result as a single-precision floating-point value.  The
1910 | operation is performed according to the IEC/IEEE Standard for Binary
1911 | Floating-Point Arithmetic.
1912 *----------------------------------------------------------------------------*/
1913
1914 float32 float32_round_to_int(float32 a, float_status *status)
1915 {
1916     flag aSign;
1917     int_fast16_t aExp;
1918     uint32_t lastBitMask, roundBitsMask;
1919     uint32_t z;
1920     a = float32_squash_input_denormal(a, status);
1921
1922     aExp = extractFloat32Exp( a );
1923     if ( 0x96 <= aExp ) {
1924         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1925             return propagateFloat32NaN(a, a, status);
1926         }
1927         return a;
1928     }
1929     if ( aExp <= 0x7E ) {
1930         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1931         status->float_exception_flags |= float_flag_inexact;
1932         aSign = extractFloat32Sign( a );
1933         switch (status->float_rounding_mode) {
1934          case float_round_nearest_even:
1935             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1936                 return packFloat32( aSign, 0x7F, 0 );
1937             }
1938             break;
1939         case float_round_ties_away:
1940             if (aExp == 0x7E) {
1941                 return packFloat32(aSign, 0x7F, 0);
1942             }
1943             break;
1944          case float_round_down:
1945             return make_float32(aSign ? 0xBF800000 : 0);
1946          case float_round_up:
1947             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1948         }
1949         return packFloat32( aSign, 0, 0 );
1950     }
1951     lastBitMask = 1;
1952     lastBitMask <<= 0x96 - aExp;
1953     roundBitsMask = lastBitMask - 1;
1954     z = float32_val(a);
1955     switch (status->float_rounding_mode) {
1956     case float_round_nearest_even:
1957         z += lastBitMask>>1;
1958         if ((z & roundBitsMask) == 0) {
1959             z &= ~lastBitMask;
1960         }
1961         break;
1962     case float_round_ties_away:
1963         z += lastBitMask >> 1;
1964         break;
1965     case float_round_to_zero:
1966         break;
1967     case float_round_up:
1968         if (!extractFloat32Sign(make_float32(z))) {
1969             z += roundBitsMask;
1970         }
1971         break;
1972     case float_round_down:
1973         if (extractFloat32Sign(make_float32(z))) {
1974             z += roundBitsMask;
1975         }
1976         break;
1977     default:
1978         abort();
1979     }
1980     z &= ~ roundBitsMask;
1981     if (z != float32_val(a)) {
1982         status->float_exception_flags |= float_flag_inexact;
1983     }
1984     return make_float32(z);
1985
1986 }
1987
1988 /*----------------------------------------------------------------------------
1989 | Returns the result of adding the absolute values of the single-precision
1990 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1991 | before being returned.  `zSign' is ignored if the result is a NaN.
1992 | The addition is performed according to the IEC/IEEE Standard for Binary
1993 | Floating-Point Arithmetic.
1994 *----------------------------------------------------------------------------*/
1995
1996 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
1997                               float_status *status)
1998 {
1999     int_fast16_t aExp, bExp, zExp;
2000     uint32_t aSig, bSig, zSig;
2001     int_fast16_t expDiff;
2002
2003     aSig = extractFloat32Frac( a );
2004     aExp = extractFloat32Exp( a );
2005     bSig = extractFloat32Frac( b );
2006     bExp = extractFloat32Exp( b );
2007     expDiff = aExp - bExp;
2008     aSig <<= 6;
2009     bSig <<= 6;
2010     if ( 0 < expDiff ) {
2011         if ( aExp == 0xFF ) {
2012             if (aSig) {
2013                 return propagateFloat32NaN(a, b, status);
2014             }
2015             return a;
2016         }
2017         if ( bExp == 0 ) {
2018             --expDiff;
2019         }
2020         else {
2021             bSig |= 0x20000000;
2022         }
2023         shift32RightJamming( bSig, expDiff, &bSig );
2024         zExp = aExp;
2025     }
2026     else if ( expDiff < 0 ) {
2027         if ( bExp == 0xFF ) {
2028             if (bSig) {
2029                 return propagateFloat32NaN(a, b, status);
2030             }
2031             return packFloat32( zSign, 0xFF, 0 );
2032         }
2033         if ( aExp == 0 ) {
2034             ++expDiff;
2035         }
2036         else {
2037             aSig |= 0x20000000;
2038         }
2039         shift32RightJamming( aSig, - expDiff, &aSig );
2040         zExp = bExp;
2041     }
2042     else {
2043         if ( aExp == 0xFF ) {
2044             if (aSig | bSig) {
2045                 return propagateFloat32NaN(a, b, status);
2046             }
2047             return a;
2048         }
2049         if ( aExp == 0 ) {
2050             if (status->flush_to_zero) {
2051                 if (aSig | bSig) {
2052                     float_raise(float_flag_output_denormal, status);
2053                 }
2054                 return packFloat32(zSign, 0, 0);
2055             }
2056             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2057         }
2058         zSig = 0x40000000 + aSig + bSig;
2059         zExp = aExp;
2060         goto roundAndPack;
2061     }
2062     aSig |= 0x20000000;
2063     zSig = ( aSig + bSig )<<1;
2064     --zExp;
2065     if ( (int32_t) zSig < 0 ) {
2066         zSig = aSig + bSig;
2067         ++zExp;
2068     }
2069  roundAndPack:
2070     return roundAndPackFloat32(zSign, zExp, zSig, status);
2071
2072 }
2073
2074 /*----------------------------------------------------------------------------
2075 | Returns the result of subtracting the absolute values of the single-
2076 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2077 | difference is negated before being returned.  `zSign' is ignored if the
2078 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2079 | Standard for Binary Floating-Point Arithmetic.
2080 *----------------------------------------------------------------------------*/
2081
2082 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2083                               float_status *status)
2084 {
2085     int_fast16_t aExp, bExp, zExp;
2086     uint32_t aSig, bSig, zSig;
2087     int_fast16_t expDiff;
2088
2089     aSig = extractFloat32Frac( a );
2090     aExp = extractFloat32Exp( a );
2091     bSig = extractFloat32Frac( b );
2092     bExp = extractFloat32Exp( b );
2093     expDiff = aExp - bExp;
2094     aSig <<= 7;
2095     bSig <<= 7;
2096     if ( 0 < expDiff ) goto aExpBigger;
2097     if ( expDiff < 0 ) goto bExpBigger;
2098     if ( aExp == 0xFF ) {
2099         if (aSig | bSig) {
2100             return propagateFloat32NaN(a, b, status);
2101         }
2102         float_raise(float_flag_invalid, status);
2103         return float32_default_nan;
2104     }
2105     if ( aExp == 0 ) {
2106         aExp = 1;
2107         bExp = 1;
2108     }
2109     if ( bSig < aSig ) goto aBigger;
2110     if ( aSig < bSig ) goto bBigger;
2111     return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2112  bExpBigger:
2113     if ( bExp == 0xFF ) {
2114         if (bSig) {
2115             return propagateFloat32NaN(a, b, status);
2116         }
2117         return packFloat32( zSign ^ 1, 0xFF, 0 );
2118     }
2119     if ( aExp == 0 ) {
2120         ++expDiff;
2121     }
2122     else {
2123         aSig |= 0x40000000;
2124     }
2125     shift32RightJamming( aSig, - expDiff, &aSig );
2126     bSig |= 0x40000000;
2127  bBigger:
2128     zSig = bSig - aSig;
2129     zExp = bExp;
2130     zSign ^= 1;
2131     goto normalizeRoundAndPack;
2132  aExpBigger:
2133     if ( aExp == 0xFF ) {
2134         if (aSig) {
2135             return propagateFloat32NaN(a, b, status);
2136         }
2137         return a;
2138     }
2139     if ( bExp == 0 ) {
2140         --expDiff;
2141     }
2142     else {
2143         bSig |= 0x40000000;
2144     }
2145     shift32RightJamming( bSig, expDiff, &bSig );
2146     aSig |= 0x40000000;
2147  aBigger:
2148     zSig = aSig - bSig;
2149     zExp = aExp;
2150  normalizeRoundAndPack:
2151     --zExp;
2152     return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2153
2154 }
2155
2156 /*----------------------------------------------------------------------------
2157 | Returns the result of adding the single-precision floating-point values `a'
2158 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2159 | Binary Floating-Point Arithmetic.
2160 *----------------------------------------------------------------------------*/
2161
2162 float32 float32_add(float32 a, float32 b, float_status *status)
2163 {
2164     flag aSign, bSign;
2165     a = float32_squash_input_denormal(a, status);
2166     b = float32_squash_input_denormal(b, status);
2167
2168     aSign = extractFloat32Sign( a );
2169     bSign = extractFloat32Sign( b );
2170     if ( aSign == bSign ) {
2171         return addFloat32Sigs(a, b, aSign, status);
2172     }
2173     else {
2174         return subFloat32Sigs(a, b, aSign, status);
2175     }
2176
2177 }
2178
2179 /*----------------------------------------------------------------------------
2180 | Returns the result of subtracting the single-precision floating-point values
2181 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2182 | for Binary Floating-Point Arithmetic.
2183 *----------------------------------------------------------------------------*/
2184
2185 float32 float32_sub(float32 a, float32 b, float_status *status)
2186 {
2187     flag aSign, bSign;
2188     a = float32_squash_input_denormal(a, status);
2189     b = float32_squash_input_denormal(b, status);
2190
2191     aSign = extractFloat32Sign( a );
2192     bSign = extractFloat32Sign( b );
2193     if ( aSign == bSign ) {
2194         return subFloat32Sigs(a, b, aSign, status);
2195     }
2196     else {
2197         return addFloat32Sigs(a, b, aSign, status);
2198     }
2199
2200 }
2201
2202 /*----------------------------------------------------------------------------
2203 | Returns the result of multiplying the single-precision floating-point values
2204 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2205 | for Binary Floating-Point Arithmetic.
2206 *----------------------------------------------------------------------------*/
2207
2208 float32 float32_mul(float32 a, float32 b, float_status *status)
2209 {
2210     flag aSign, bSign, zSign;
2211     int_fast16_t aExp, bExp, zExp;
2212     uint32_t aSig, bSig;
2213     uint64_t zSig64;
2214     uint32_t zSig;
2215
2216     a = float32_squash_input_denormal(a, status);
2217     b = float32_squash_input_denormal(b, status);
2218
2219     aSig = extractFloat32Frac( a );
2220     aExp = extractFloat32Exp( a );
2221     aSign = extractFloat32Sign( a );
2222     bSig = extractFloat32Frac( b );
2223     bExp = extractFloat32Exp( b );
2224     bSign = extractFloat32Sign( b );
2225     zSign = aSign ^ bSign;
2226     if ( aExp == 0xFF ) {
2227         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2228             return propagateFloat32NaN(a, b, status);
2229         }
2230         if ( ( bExp | bSig ) == 0 ) {
2231             float_raise(float_flag_invalid, status);
2232             return float32_default_nan;
2233         }
2234         return packFloat32( zSign, 0xFF, 0 );
2235     }
2236     if ( bExp == 0xFF ) {
2237         if (bSig) {
2238             return propagateFloat32NaN(a, b, status);
2239         }
2240         if ( ( aExp | aSig ) == 0 ) {
2241             float_raise(float_flag_invalid, status);
2242             return float32_default_nan;
2243         }
2244         return packFloat32( zSign, 0xFF, 0 );
2245     }
2246     if ( aExp == 0 ) {
2247         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2248         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2249     }
2250     if ( bExp == 0 ) {
2251         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2252         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2253     }
2254     zExp = aExp + bExp - 0x7F;
2255     aSig = ( aSig | 0x00800000 )<<7;
2256     bSig = ( bSig | 0x00800000 )<<8;
2257     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2258     zSig = zSig64;
2259     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2260         zSig <<= 1;
2261         --zExp;
2262     }
2263     return roundAndPackFloat32(zSign, zExp, zSig, status);
2264
2265 }
2266
2267 /*----------------------------------------------------------------------------
2268 | Returns the result of dividing the single-precision floating-point value `a'
2269 | by the corresponding value `b'.  The operation is performed according to the
2270 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2271 *----------------------------------------------------------------------------*/
2272
2273 float32 float32_div(float32 a, float32 b, float_status *status)
2274 {
2275     flag aSign, bSign, zSign;
2276     int_fast16_t aExp, bExp, zExp;
2277     uint32_t aSig, bSig, zSig;
2278     a = float32_squash_input_denormal(a, status);
2279     b = float32_squash_input_denormal(b, status);
2280
2281     aSig = extractFloat32Frac( a );
2282     aExp = extractFloat32Exp( a );
2283     aSign = extractFloat32Sign( a );
2284     bSig = extractFloat32Frac( b );
2285     bExp = extractFloat32Exp( b );
2286     bSign = extractFloat32Sign( b );
2287     zSign = aSign ^ bSign;
2288     if ( aExp == 0xFF ) {
2289         if (aSig) {
2290             return propagateFloat32NaN(a, b, status);
2291         }
2292         if ( bExp == 0xFF ) {
2293             if (bSig) {
2294                 return propagateFloat32NaN(a, b, status);
2295             }
2296             float_raise(float_flag_invalid, status);
2297             return float32_default_nan;
2298         }
2299         return packFloat32( zSign, 0xFF, 0 );
2300     }
2301     if ( bExp == 0xFF ) {
2302         if (bSig) {
2303             return propagateFloat32NaN(a, b, status);
2304         }
2305         return packFloat32( zSign, 0, 0 );
2306     }
2307     if ( bExp == 0 ) {
2308         if ( bSig == 0 ) {
2309             if ( ( aExp | aSig ) == 0 ) {
2310                 float_raise(float_flag_invalid, status);
2311                 return float32_default_nan;
2312             }
2313             float_raise(float_flag_divbyzero, status);
2314             return packFloat32( zSign, 0xFF, 0 );
2315         }
2316         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2317     }
2318     if ( aExp == 0 ) {
2319         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2320         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2321     }
2322     zExp = aExp - bExp + 0x7D;
2323     aSig = ( aSig | 0x00800000 )<<7;
2324     bSig = ( bSig | 0x00800000 )<<8;
2325     if ( bSig <= ( aSig + aSig ) ) {
2326         aSig >>= 1;
2327         ++zExp;
2328     }
2329     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2330     if ( ( zSig & 0x3F ) == 0 ) {
2331         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2332     }
2333     return roundAndPackFloat32(zSign, zExp, zSig, status);
2334
2335 }
2336
2337 /*----------------------------------------------------------------------------
2338 | Returns the remainder of the single-precision floating-point value `a'
2339 | with respect to the corresponding value `b'.  The operation is performed
2340 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2341 *----------------------------------------------------------------------------*/
2342
2343 float32 float32_rem(float32 a, float32 b, float_status *status)
2344 {
2345     flag aSign, zSign;
2346     int_fast16_t aExp, bExp, expDiff;
2347     uint32_t aSig, bSig;
2348     uint32_t q;
2349     uint64_t aSig64, bSig64, q64;
2350     uint32_t alternateASig;
2351     int32_t sigMean;
2352     a = float32_squash_input_denormal(a, status);
2353     b = float32_squash_input_denormal(b, status);
2354
2355     aSig = extractFloat32Frac( a );
2356     aExp = extractFloat32Exp( a );
2357     aSign = extractFloat32Sign( a );
2358     bSig = extractFloat32Frac( b );
2359     bExp = extractFloat32Exp( b );
2360     if ( aExp == 0xFF ) {
2361         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2362             return propagateFloat32NaN(a, b, status);
2363         }
2364         float_raise(float_flag_invalid, status);
2365         return float32_default_nan;
2366     }
2367     if ( bExp == 0xFF ) {
2368         if (bSig) {
2369             return propagateFloat32NaN(a, b, status);
2370         }
2371         return a;
2372     }
2373     if ( bExp == 0 ) {
2374         if ( bSig == 0 ) {
2375             float_raise(float_flag_invalid, status);
2376             return float32_default_nan;
2377         }
2378         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2379     }
2380     if ( aExp == 0 ) {
2381         if ( aSig == 0 ) return a;
2382         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2383     }
2384     expDiff = aExp - bExp;
2385     aSig |= 0x00800000;
2386     bSig |= 0x00800000;
2387     if ( expDiff < 32 ) {
2388         aSig <<= 8;
2389         bSig <<= 8;
2390         if ( expDiff < 0 ) {
2391             if ( expDiff < -1 ) return a;
2392             aSig >>= 1;
2393         }
2394         q = ( bSig <= aSig );
2395         if ( q ) aSig -= bSig;
2396         if ( 0 < expDiff ) {
2397             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2398             q >>= 32 - expDiff;
2399             bSig >>= 2;
2400             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2401         }
2402         else {
2403             aSig >>= 2;
2404             bSig >>= 2;
2405         }
2406     }
2407     else {
2408         if ( bSig <= aSig ) aSig -= bSig;
2409         aSig64 = ( (uint64_t) aSig )<<40;
2410         bSig64 = ( (uint64_t) bSig )<<40;
2411         expDiff -= 64;
2412         while ( 0 < expDiff ) {
2413             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2414             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2415             aSig64 = - ( ( bSig * q64 )<<38 );
2416             expDiff -= 62;
2417         }
2418         expDiff += 64;
2419         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2420         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2421         q = q64>>( 64 - expDiff );
2422         bSig <<= 6;
2423         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2424     }
2425     do {
2426         alternateASig = aSig;
2427         ++q;
2428         aSig -= bSig;
2429     } while ( 0 <= (int32_t) aSig );
2430     sigMean = aSig + alternateASig;
2431     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2432         aSig = alternateASig;
2433     }
2434     zSign = ( (int32_t) aSig < 0 );
2435     if ( zSign ) aSig = - aSig;
2436     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2437 }
2438
2439 /*----------------------------------------------------------------------------
2440 | Returns the result of multiplying the single-precision floating-point values
2441 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2442 | multiplication.  The operation is performed according to the IEC/IEEE
2443 | Standard for Binary Floating-Point Arithmetic 754-2008.
2444 | The flags argument allows the caller to select negation of the
2445 | addend, the intermediate product, or the final result. (The difference
2446 | between this and having the caller do a separate negation is that negating
2447 | externally will flip the sign bit on NaNs.)
2448 *----------------------------------------------------------------------------*/
2449
2450 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2451                        float_status *status)
2452 {
2453     flag aSign, bSign, cSign, zSign;
2454     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2455     uint32_t aSig, bSig, cSig;
2456     flag pInf, pZero, pSign;
2457     uint64_t pSig64, cSig64, zSig64;
2458     uint32_t pSig;
2459     int shiftcount;
2460     flag signflip, infzero;
2461
2462     a = float32_squash_input_denormal(a, status);
2463     b = float32_squash_input_denormal(b, status);
2464     c = float32_squash_input_denormal(c, status);
2465     aSig = extractFloat32Frac(a);
2466     aExp = extractFloat32Exp(a);
2467     aSign = extractFloat32Sign(a);
2468     bSig = extractFloat32Frac(b);
2469     bExp = extractFloat32Exp(b);
2470     bSign = extractFloat32Sign(b);
2471     cSig = extractFloat32Frac(c);
2472     cExp = extractFloat32Exp(c);
2473     cSign = extractFloat32Sign(c);
2474
2475     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2476                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2477
2478     /* It is implementation-defined whether the cases of (0,inf,qnan)
2479      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2480      * they return if they do), so we have to hand this information
2481      * off to the target-specific pick-a-NaN routine.
2482      */
2483     if (((aExp == 0xff) && aSig) ||
2484         ((bExp == 0xff) && bSig) ||
2485         ((cExp == 0xff) && cSig)) {
2486         return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2487     }
2488
2489     if (infzero) {
2490         float_raise(float_flag_invalid, status);
2491         return float32_default_nan;
2492     }
2493
2494     if (flags & float_muladd_negate_c) {
2495         cSign ^= 1;
2496     }
2497
2498     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2499
2500     /* Work out the sign and type of the product */
2501     pSign = aSign ^ bSign;
2502     if (flags & float_muladd_negate_product) {
2503         pSign ^= 1;
2504     }
2505     pInf = (aExp == 0xff) || (bExp == 0xff);
2506     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2507
2508     if (cExp == 0xff) {
2509         if (pInf && (pSign ^ cSign)) {
2510             /* addition of opposite-signed infinities => InvalidOperation */
2511             float_raise(float_flag_invalid, status);
2512             return float32_default_nan;
2513         }
2514         /* Otherwise generate an infinity of the same sign */
2515         return packFloat32(cSign ^ signflip, 0xff, 0);
2516     }
2517
2518     if (pInf) {
2519         return packFloat32(pSign ^ signflip, 0xff, 0);
2520     }
2521
2522     if (pZero) {
2523         if (cExp == 0) {
2524             if (cSig == 0) {
2525                 /* Adding two exact zeroes */
2526                 if (pSign == cSign) {
2527                     zSign = pSign;
2528                 } else if (status->float_rounding_mode == float_round_down) {
2529                     zSign = 1;
2530                 } else {
2531                     zSign = 0;
2532                 }
2533                 return packFloat32(zSign ^ signflip, 0, 0);
2534             }
2535             /* Exact zero plus a denorm */
2536             if (status->flush_to_zero) {
2537                 float_raise(float_flag_output_denormal, status);
2538                 return packFloat32(cSign ^ signflip, 0, 0);
2539             }
2540         }
2541         /* Zero plus something non-zero : just return the something */
2542         if (flags & float_muladd_halve_result) {
2543             if (cExp == 0) {
2544                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2545             }
2546             /* Subtract one to halve, and one again because roundAndPackFloat32
2547              * wants one less than the true exponent.
2548              */
2549             cExp -= 2;
2550             cSig = (cSig | 0x00800000) << 7;
2551             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2552         }
2553         return packFloat32(cSign ^ signflip, cExp, cSig);
2554     }
2555
2556     if (aExp == 0) {
2557         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2558     }
2559     if (bExp == 0) {
2560         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2561     }
2562
2563     /* Calculate the actual result a * b + c */
2564
2565     /* Multiply first; this is easy. */
2566     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2567      * because we want the true exponent, not the "one-less-than"
2568      * flavour that roundAndPackFloat32() takes.
2569      */
2570     pExp = aExp + bExp - 0x7e;
2571     aSig = (aSig | 0x00800000) << 7;
2572     bSig = (bSig | 0x00800000) << 8;
2573     pSig64 = (uint64_t)aSig * bSig;
2574     if ((int64_t)(pSig64 << 1) >= 0) {
2575         pSig64 <<= 1;
2576         pExp--;
2577     }
2578
2579     zSign = pSign ^ signflip;
2580
2581     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2582      * position 62.
2583      */
2584     if (cExp == 0) {
2585         if (!cSig) {
2586             /* Throw out the special case of c being an exact zero now */
2587             shift64RightJamming(pSig64, 32, &pSig64);
2588             pSig = pSig64;
2589             if (flags & float_muladd_halve_result) {
2590                 pExp--;
2591             }
2592             return roundAndPackFloat32(zSign, pExp - 1,
2593                                        pSig, status);
2594         }
2595         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2596     }
2597
2598     cSig64 = (uint64_t)cSig << (62 - 23);
2599     cSig64 |= LIT64(0x4000000000000000);
2600     expDiff = pExp - cExp;
2601
2602     if (pSign == cSign) {
2603         /* Addition */
2604         if (expDiff > 0) {
2605             /* scale c to match p */
2606             shift64RightJamming(cSig64, expDiff, &cSig64);
2607             zExp = pExp;
2608         } else if (expDiff < 0) {
2609             /* scale p to match c */
2610             shift64RightJamming(pSig64, -expDiff, &pSig64);
2611             zExp = cExp;
2612         } else {
2613             /* no scaling needed */
2614             zExp = cExp;
2615         }
2616         /* Add significands and make sure explicit bit ends up in posn 62 */
2617         zSig64 = pSig64 + cSig64;
2618         if ((int64_t)zSig64 < 0) {
2619             shift64RightJamming(zSig64, 1, &zSig64);
2620         } else {
2621             zExp--;
2622         }
2623     } else {
2624         /* Subtraction */
2625         if (expDiff > 0) {
2626             shift64RightJamming(cSig64, expDiff, &cSig64);
2627             zSig64 = pSig64 - cSig64;
2628             zExp = pExp;
2629         } else if (expDiff < 0) {
2630             shift64RightJamming(pSig64, -expDiff, &pSig64);
2631             zSig64 = cSig64 - pSig64;
2632             zExp = cExp;
2633             zSign ^= 1;
2634         } else {
2635             zExp = pExp;
2636             if (cSig64 < pSig64) {
2637                 zSig64 = pSig64 - cSig64;
2638             } else if (pSig64 < cSig64) {
2639                 zSig64 = cSig64 - pSig64;
2640                 zSign ^= 1;
2641             } else {
2642                 /* Exact zero */
2643                 zSign = signflip;
2644                 if (status->float_rounding_mode == float_round_down) {
2645                     zSign ^= 1;
2646                 }
2647                 return packFloat32(zSign, 0, 0);
2648             }
2649         }
2650         --zExp;
2651         /* Normalize to put the explicit bit back into bit 62. */
2652         shiftcount = countLeadingZeros64(zSig64) - 1;
2653         zSig64 <<= shiftcount;
2654         zExp -= shiftcount;
2655     }
2656     if (flags & float_muladd_halve_result) {
2657         zExp--;
2658     }
2659
2660     shift64RightJamming(zSig64, 32, &zSig64);
2661     return roundAndPackFloat32(zSign, zExp, zSig64, status);
2662 }
2663
2664
2665 /*----------------------------------------------------------------------------
2666 | Returns the square root of the single-precision floating-point value `a'.
2667 | The operation is performed according to the IEC/IEEE Standard for Binary
2668 | Floating-Point Arithmetic.
2669 *----------------------------------------------------------------------------*/
2670
2671 float32 float32_sqrt(float32 a, float_status *status)
2672 {
2673     flag aSign;
2674     int_fast16_t aExp, zExp;
2675     uint32_t aSig, zSig;
2676     uint64_t rem, term;
2677     a = float32_squash_input_denormal(a, status);
2678
2679     aSig = extractFloat32Frac( a );
2680     aExp = extractFloat32Exp( a );
2681     aSign = extractFloat32Sign( a );
2682     if ( aExp == 0xFF ) {
2683         if (aSig) {
2684             return propagateFloat32NaN(a, float32_zero, status);
2685         }
2686         if ( ! aSign ) return a;
2687         float_raise(float_flag_invalid, status);
2688         return float32_default_nan;
2689     }
2690     if ( aSign ) {
2691         if ( ( aExp | aSig ) == 0 ) return a;
2692         float_raise(float_flag_invalid, status);
2693         return float32_default_nan;
2694     }
2695     if ( aExp == 0 ) {
2696         if ( aSig == 0 ) return float32_zero;
2697         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2698     }
2699     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2700     aSig = ( aSig | 0x00800000 )<<8;
2701     zSig = estimateSqrt32( aExp, aSig ) + 2;
2702     if ( ( zSig & 0x7F ) <= 5 ) {
2703         if ( zSig < 2 ) {
2704             zSig = 0x7FFFFFFF;
2705             goto roundAndPack;
2706         }
2707         aSig >>= aExp & 1;
2708         term = ( (uint64_t) zSig ) * zSig;
2709         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2710         while ( (int64_t) rem < 0 ) {
2711             --zSig;
2712             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2713         }
2714         zSig |= ( rem != 0 );
2715     }
2716     shift32RightJamming( zSig, 1, &zSig );
2717  roundAndPack:
2718     return roundAndPackFloat32(0, zExp, zSig, status);
2719
2720 }
2721
2722 /*----------------------------------------------------------------------------
2723 | Returns the binary exponential of the single-precision floating-point value
2724 | `a'. The operation is performed according to the IEC/IEEE Standard for
2725 | Binary Floating-Point Arithmetic.
2726 |
2727 | Uses the following identities:
2728 |
2729 | 1. -------------------------------------------------------------------------
2730 |      x    x*ln(2)
2731 |     2  = e
2732 |
2733 | 2. -------------------------------------------------------------------------
2734 |                      2     3     4     5           n
2735 |      x        x     x     x     x     x           x
2736 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2737 |               1!    2!    3!    4!    5!          n!
2738 *----------------------------------------------------------------------------*/
2739
2740 static const float64 float32_exp2_coefficients[15] =
2741 {
2742     const_float64( 0x3ff0000000000000ll ), /*  1 */
2743     const_float64( 0x3fe0000000000000ll ), /*  2 */
2744     const_float64( 0x3fc5555555555555ll ), /*  3 */
2745     const_float64( 0x3fa5555555555555ll ), /*  4 */
2746     const_float64( 0x3f81111111111111ll ), /*  5 */
2747     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2748     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2749     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2750     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2751     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2752     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2753     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2754     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2755     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2756     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2757 };
2758
2759 float32 float32_exp2(float32 a, float_status *status)
2760 {
2761     flag aSign;
2762     int_fast16_t aExp;
2763     uint32_t aSig;
2764     float64 r, x, xn;
2765     int i;
2766     a = float32_squash_input_denormal(a, status);
2767
2768     aSig = extractFloat32Frac( a );
2769     aExp = extractFloat32Exp( a );
2770     aSign = extractFloat32Sign( a );
2771
2772     if ( aExp == 0xFF) {
2773         if (aSig) {
2774             return propagateFloat32NaN(a, float32_zero, status);
2775         }
2776         return (aSign) ? float32_zero : a;
2777     }
2778     if (aExp == 0) {
2779         if (aSig == 0) return float32_one;
2780     }
2781
2782     float_raise(float_flag_inexact, status);
2783
2784     /* ******************************* */
2785     /* using float64 for approximation */
2786     /* ******************************* */
2787     x = float32_to_float64(a, status);
2788     x = float64_mul(x, float64_ln2, status);
2789
2790     xn = x;
2791     r = float64_one;
2792     for (i = 0 ; i < 15 ; i++) {
2793         float64 f;
2794
2795         f = float64_mul(xn, float32_exp2_coefficients[i], status);
2796         r = float64_add(r, f, status);
2797
2798         xn = float64_mul(xn, x, status);
2799     }
2800
2801     return float64_to_float32(r, status);
2802 }
2803
2804 /*----------------------------------------------------------------------------
2805 | Returns the binary log of the single-precision floating-point value `a'.
2806 | The operation is performed according to the IEC/IEEE Standard for Binary
2807 | Floating-Point Arithmetic.
2808 *----------------------------------------------------------------------------*/
2809 float32 float32_log2(float32 a, float_status *status)
2810 {
2811     flag aSign, zSign;
2812     int_fast16_t aExp;
2813     uint32_t aSig, zSig, i;
2814
2815     a = float32_squash_input_denormal(a, status);
2816     aSig = extractFloat32Frac( a );
2817     aExp = extractFloat32Exp( a );
2818     aSign = extractFloat32Sign( a );
2819
2820     if ( aExp == 0 ) {
2821         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2822         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2823     }
2824     if ( aSign ) {
2825         float_raise(float_flag_invalid, status);
2826         return float32_default_nan;
2827     }
2828     if ( aExp == 0xFF ) {
2829         if (aSig) {
2830             return propagateFloat32NaN(a, float32_zero, status);
2831         }
2832         return a;
2833     }
2834
2835     aExp -= 0x7F;
2836     aSig |= 0x00800000;
2837     zSign = aExp < 0;
2838     zSig = aExp << 23;
2839
2840     for (i = 1 << 22; i > 0; i >>= 1) {
2841         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2842         if ( aSig & 0x01000000 ) {
2843             aSig >>= 1;
2844             zSig |= i;
2845         }
2846     }
2847
2848     if ( zSign )
2849         zSig = -zSig;
2850
2851     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2852 }
2853
2854 /*----------------------------------------------------------------------------
2855 | Returns 1 if the single-precision floating-point value `a' is equal to
2856 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2857 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2858 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2859 *----------------------------------------------------------------------------*/
2860
2861 int float32_eq(float32 a, float32 b, float_status *status)
2862 {
2863     uint32_t av, bv;
2864     a = float32_squash_input_denormal(a, status);
2865     b = float32_squash_input_denormal(b, status);
2866
2867     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2868          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2869        ) {
2870         float_raise(float_flag_invalid, status);
2871         return 0;
2872     }
2873     av = float32_val(a);
2874     bv = float32_val(b);
2875     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2876 }
2877
2878 /*----------------------------------------------------------------------------
2879 | Returns 1 if the single-precision floating-point value `a' is less than
2880 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2881 | exception is raised if either operand is a NaN.  The comparison is performed
2882 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2884
2885 int float32_le(float32 a, float32 b, float_status *status)
2886 {
2887     flag aSign, bSign;
2888     uint32_t av, bv;
2889     a = float32_squash_input_denormal(a, status);
2890     b = float32_squash_input_denormal(b, status);
2891
2892     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2893          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2894        ) {
2895         float_raise(float_flag_invalid, status);
2896         return 0;
2897     }
2898     aSign = extractFloat32Sign( a );
2899     bSign = extractFloat32Sign( b );
2900     av = float32_val(a);
2901     bv = float32_val(b);
2902     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2903     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2904
2905 }
2906
2907 /*----------------------------------------------------------------------------
2908 | Returns 1 if the single-precision floating-point value `a' is less than
2909 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2910 | raised if either operand is a NaN.  The comparison is performed according
2911 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2912 *----------------------------------------------------------------------------*/
2913
2914 int float32_lt(float32 a, float32 b, float_status *status)
2915 {
2916     flag aSign, bSign;
2917     uint32_t av, bv;
2918     a = float32_squash_input_denormal(a, status);
2919     b = float32_squash_input_denormal(b, status);
2920
2921     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2922          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2923        ) {
2924         float_raise(float_flag_invalid, status);
2925         return 0;
2926     }
2927     aSign = extractFloat32Sign( a );
2928     bSign = extractFloat32Sign( b );
2929     av = float32_val(a);
2930     bv = float32_val(b);
2931     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2932     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2933
2934 }
2935
2936 /*----------------------------------------------------------------------------
2937 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2938 | be compared, and 0 otherwise.  The invalid exception is raised if either
2939 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2940 | Standard for Binary Floating-Point Arithmetic.
2941 *----------------------------------------------------------------------------*/
2942
2943 int float32_unordered(float32 a, float32 b, float_status *status)
2944 {
2945     a = float32_squash_input_denormal(a, status);
2946     b = float32_squash_input_denormal(b, status);
2947
2948     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2949          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2950        ) {
2951         float_raise(float_flag_invalid, status);
2952         return 1;
2953     }
2954     return 0;
2955 }
2956
2957 /*----------------------------------------------------------------------------
2958 | Returns 1 if the single-precision floating-point value `a' is equal to
2959 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2960 | exception.  The comparison is performed according to the IEC/IEEE Standard
2961 | for Binary Floating-Point Arithmetic.
2962 *----------------------------------------------------------------------------*/
2963
2964 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2965 {
2966     a = float32_squash_input_denormal(a, status);
2967     b = float32_squash_input_denormal(b, status);
2968
2969     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2970          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2971        ) {
2972         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2973             float_raise(float_flag_invalid, status);
2974         }
2975         return 0;
2976     }
2977     return ( float32_val(a) == float32_val(b) ) ||
2978             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2979 }
2980
2981 /*----------------------------------------------------------------------------
2982 | Returns 1 if the single-precision floating-point value `a' is less than or
2983 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2984 | cause an exception.  Otherwise, the comparison is performed according to the
2985 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2986 *----------------------------------------------------------------------------*/
2987
2988 int float32_le_quiet(float32 a, float32 b, float_status *status)
2989 {
2990     flag aSign, bSign;
2991     uint32_t av, bv;
2992     a = float32_squash_input_denormal(a, status);
2993     b = float32_squash_input_denormal(b, status);
2994
2995     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2996          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2997        ) {
2998         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2999             float_raise(float_flag_invalid, status);
3000         }
3001         return 0;
3002     }
3003     aSign = extractFloat32Sign( a );
3004     bSign = extractFloat32Sign( b );
3005     av = float32_val(a);
3006     bv = float32_val(b);
3007     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3008     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3009
3010 }
3011
3012 /*----------------------------------------------------------------------------
3013 | Returns 1 if the single-precision floating-point value `a' is less than
3014 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3015 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3016 | Standard for Binary Floating-Point Arithmetic.
3017 *----------------------------------------------------------------------------*/
3018
3019 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3020 {
3021     flag aSign, bSign;
3022     uint32_t av, bv;
3023     a = float32_squash_input_denormal(a, status);
3024     b = float32_squash_input_denormal(b, status);
3025
3026     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3027          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3028        ) {
3029         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3030             float_raise(float_flag_invalid, status);
3031         }
3032         return 0;
3033     }
3034     aSign = extractFloat32Sign( a );
3035     bSign = extractFloat32Sign( b );
3036     av = float32_val(a);
3037     bv = float32_val(b);
3038     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3039     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3040
3041 }
3042
3043 /*----------------------------------------------------------------------------
3044 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3045 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3046 | comparison is performed according to the IEC/IEEE Standard for Binary
3047 | Floating-Point Arithmetic.
3048 *----------------------------------------------------------------------------*/
3049
3050 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3051 {
3052     a = float32_squash_input_denormal(a, status);
3053     b = float32_squash_input_denormal(b, status);
3054
3055     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3056          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3057        ) {
3058         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3059             float_raise(float_flag_invalid, status);
3060         }
3061         return 1;
3062     }
3063     return 0;
3064 }
3065
3066 /*----------------------------------------------------------------------------
3067 | Returns the result of converting the double-precision floating-point value
3068 | `a' to the 32-bit two's complement integer format.  The conversion is
3069 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3070 | Arithmetic---which means in particular that the conversion is rounded
3071 | according to the current rounding mode.  If `a' is a NaN, the largest
3072 | positive integer is returned.  Otherwise, if the conversion overflows, the
3073 | largest integer with the same sign as `a' is returned.
3074 *----------------------------------------------------------------------------*/
3075
3076 int32_t float64_to_int32(float64 a, float_status *status)
3077 {
3078     flag aSign;
3079     int_fast16_t aExp, shiftCount;
3080     uint64_t aSig;
3081     a = float64_squash_input_denormal(a, status);
3082
3083     aSig = extractFloat64Frac( a );
3084     aExp = extractFloat64Exp( a );
3085     aSign = extractFloat64Sign( a );
3086     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3087     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3088     shiftCount = 0x42C - aExp;
3089     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3090     return roundAndPackInt32(aSign, aSig, status);
3091
3092 }
3093
3094 /*----------------------------------------------------------------------------
3095 | Returns the result of converting the double-precision floating-point value
3096 | `a' to the 32-bit two's complement integer format.  The conversion is
3097 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3098 | Arithmetic, except that the conversion is always rounded toward zero.
3099 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3100 | the conversion overflows, the largest integer with the same sign as `a' is
3101 | returned.
3102 *----------------------------------------------------------------------------*/
3103
3104 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3105 {
3106     flag aSign;
3107     int_fast16_t aExp, shiftCount;
3108     uint64_t aSig, savedASig;
3109     int32_t z;
3110     a = float64_squash_input_denormal(a, status);
3111
3112     aSig = extractFloat64Frac( a );
3113     aExp = extractFloat64Exp( a );
3114     aSign = extractFloat64Sign( a );
3115     if ( 0x41E < aExp ) {
3116         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3117         goto invalid;
3118     }
3119     else if ( aExp < 0x3FF ) {
3120         if (aExp || aSig) {
3121             status->float_exception_flags |= float_flag_inexact;
3122         }
3123         return 0;
3124     }
3125     aSig |= LIT64( 0x0010000000000000 );
3126     shiftCount = 0x433 - aExp;
3127     savedASig = aSig;
3128     aSig >>= shiftCount;
3129     z = aSig;
3130     if ( aSign ) z = - z;
3131     if ( ( z < 0 ) ^ aSign ) {
3132  invalid:
3133         float_raise(float_flag_invalid, status);
3134         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3135     }
3136     if ( ( aSig<<shiftCount ) != savedASig ) {
3137         status->float_exception_flags |= float_flag_inexact;
3138     }
3139     return z;
3140
3141 }
3142
3143 /*----------------------------------------------------------------------------
3144 | Returns the result of converting the double-precision floating-point value
3145 | `a' to the 16-bit two's complement integer format.  The conversion is
3146 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3147 | Arithmetic, except that the conversion is always rounded toward zero.
3148 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3149 | the conversion overflows, the largest integer with the same sign as `a' is
3150 | returned.
3151 *----------------------------------------------------------------------------*/
3152
3153 int_fast16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3154 {
3155     flag aSign;
3156     int_fast16_t aExp, shiftCount;
3157     uint64_t aSig, savedASig;
3158     int32_t z;
3159
3160     aSig = extractFloat64Frac( a );
3161     aExp = extractFloat64Exp( a );
3162     aSign = extractFloat64Sign( a );
3163     if ( 0x40E < aExp ) {
3164         if ( ( aExp == 0x7FF ) && aSig ) {
3165             aSign = 0;
3166         }
3167         goto invalid;
3168     }
3169     else if ( aExp < 0x3FF ) {
3170         if ( aExp || aSig ) {
3171             status->float_exception_flags |= float_flag_inexact;
3172         }
3173         return 0;
3174     }
3175     aSig |= LIT64( 0x0010000000000000 );
3176     shiftCount = 0x433 - aExp;
3177     savedASig = aSig;
3178     aSig >>= shiftCount;
3179     z = aSig;
3180     if ( aSign ) {
3181         z = - z;
3182     }
3183     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3184  invalid:
3185         float_raise(float_flag_invalid, status);
3186         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3187     }
3188     if ( ( aSig<<shiftCount ) != savedASig ) {
3189         status->float_exception_flags |= float_flag_inexact;
3190     }
3191     return z;
3192 }
3193
3194 /*----------------------------------------------------------------------------
3195 | Returns the result of converting the double-precision floating-point value
3196 | `a' to the 64-bit two's complement integer format.  The conversion is
3197 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3198 | Arithmetic---which means in particular that the conversion is rounded
3199 | according to the current rounding mode.  If `a' is a NaN, the largest
3200 | positive integer is returned.  Otherwise, if the conversion overflows, the
3201 | largest integer with the same sign as `a' is returned.
3202 *----------------------------------------------------------------------------*/
3203
3204 int64_t float64_to_int64(float64 a, float_status *status)
3205 {
3206     flag aSign;
3207     int_fast16_t aExp, shiftCount;
3208     uint64_t aSig, aSigExtra;
3209     a = float64_squash_input_denormal(a, status);
3210
3211     aSig = extractFloat64Frac( a );
3212     aExp = extractFloat64Exp( a );
3213     aSign = extractFloat64Sign( a );
3214     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3215     shiftCount = 0x433 - aExp;
3216     if ( shiftCount <= 0 ) {
3217         if ( 0x43E < aExp ) {
3218             float_raise(float_flag_invalid, status);
3219             if (    ! aSign
3220                  || (    ( aExp == 0x7FF )
3221                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3222                ) {
3223                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3224             }
3225             return (int64_t) LIT64( 0x8000000000000000 );
3226         }
3227         aSigExtra = 0;
3228         aSig <<= - shiftCount;
3229     }
3230     else {
3231         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3232     }
3233     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3234
3235 }
3236
3237 /*----------------------------------------------------------------------------
3238 | Returns the result of converting the double-precision floating-point value
3239 | `a' to the 64-bit two's complement integer format.  The conversion is
3240 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3241 | Arithmetic, except that the conversion is always rounded toward zero.
3242 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3243 | the conversion overflows, the largest integer with the same sign as `a' is
3244 | returned.
3245 *----------------------------------------------------------------------------*/
3246
3247 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3248 {
3249     flag aSign;
3250     int_fast16_t aExp, shiftCount;
3251     uint64_t aSig;
3252     int64_t z;
3253     a = float64_squash_input_denormal(a, status);
3254
3255     aSig = extractFloat64Frac( a );
3256     aExp = extractFloat64Exp( a );
3257     aSign = extractFloat64Sign( a );
3258     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3259     shiftCount = aExp - 0x433;
3260     if ( 0 <= shiftCount ) {
3261         if ( 0x43E <= aExp ) {
3262             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3263                 float_raise(float_flag_invalid, status);
3264                 if (    ! aSign
3265                      || (    ( aExp == 0x7FF )
3266                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3267                    ) {
3268                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3269                 }
3270             }
3271             return (int64_t) LIT64( 0x8000000000000000 );
3272         }
3273         z = aSig<<shiftCount;
3274     }
3275     else {
3276         if ( aExp < 0x3FE ) {
3277             if (aExp | aSig) {
3278                 status->float_exception_flags |= float_flag_inexact;
3279             }
3280             return 0;
3281         }
3282         z = aSig>>( - shiftCount );
3283         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3284             status->float_exception_flags |= float_flag_inexact;
3285         }
3286     }
3287     if ( aSign ) z = - z;
3288     return z;
3289
3290 }
3291
3292 /*----------------------------------------------------------------------------
3293 | Returns the result of converting the double-precision floating-point value
3294 | `a' to the single-precision floating-point format.  The conversion is
3295 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3296 | Arithmetic.
3297 *----------------------------------------------------------------------------*/
3298
3299 float32 float64_to_float32(float64 a, float_status *status)
3300 {
3301     flag aSign;
3302     int_fast16_t aExp;
3303     uint64_t aSig;
3304     uint32_t zSig;
3305     a = float64_squash_input_denormal(a, status);
3306
3307     aSig = extractFloat64Frac( a );
3308     aExp = extractFloat64Exp( a );
3309     aSign = extractFloat64Sign( a );
3310     if ( aExp == 0x7FF ) {
3311         if (aSig) {
3312             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3313         }
3314         return packFloat32( aSign, 0xFF, 0 );
3315     }
3316     shift64RightJamming( aSig, 22, &aSig );
3317     zSig = aSig;
3318     if ( aExp || zSig ) {
3319         zSig |= 0x40000000;
3320         aExp -= 0x381;
3321     }
3322     return roundAndPackFloat32(aSign, aExp, zSig, status);
3323
3324 }
3325
3326
3327 /*----------------------------------------------------------------------------
3328 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3329 | half-precision floating-point value, returning the result.  After being
3330 | shifted into the proper positions, the three fields are simply added
3331 | together to form the result.  This means that any integer portion of `zSig'
3332 | will be added into the exponent.  Since a properly normalized significand
3333 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3334 | than the desired result exponent whenever `zSig' is a complete, normalized
3335 | significand.
3336 *----------------------------------------------------------------------------*/
3337 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3338 {
3339     return make_float16(
3340         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3341 }
3342
3343 /*----------------------------------------------------------------------------
3344 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3345 | and significand `zSig', and returns the proper half-precision floating-
3346 | point value corresponding to the abstract input.  Ordinarily, the abstract
3347 | value is simply rounded and packed into the half-precision format, with
3348 | the inexact exception raised if the abstract input cannot be represented
3349 | exactly.  However, if the abstract value is too large, the overflow and
3350 | inexact exceptions are raised and an infinity or maximal finite value is
3351 | returned.  If the abstract value is too small, the input value is rounded to
3352 | a subnormal number, and the underflow and inexact exceptions are raised if
3353 | the abstract input cannot be represented exactly as a subnormal half-
3354 | precision floating-point number.
3355 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3356 | ARM-style "alternative representation", which omits the NaN and Inf
3357 | encodings in order to raise the maximum representable exponent by one.
3358 |     The input significand `zSig' has its binary point between bits 22
3359 | and 23, which is 13 bits to the left of the usual location.  This shifted
3360 | significand must be normalized or smaller.  If `zSig' is not normalized,
3361 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3362 | and it must not require rounding.  In the usual case that `zSig' is
3363 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3364 | Note the slightly odd position of the binary point in zSig compared with the
3365 | other roundAndPackFloat functions. This should probably be fixed if we
3366 | need to implement more float16 routines than just conversion.
3367 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3368 | Binary Floating-Point Arithmetic.
3369 *----------------------------------------------------------------------------*/
3370
3371 static float16 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3372                                    uint32_t zSig, flag ieee,
3373                                    float_status *status)
3374 {
3375     int maxexp = ieee ? 29 : 30;
3376     uint32_t mask;
3377     uint32_t increment;
3378     bool rounding_bumps_exp;
3379     bool is_tiny = false;
3380
3381     /* Calculate the mask of bits of the mantissa which are not
3382      * representable in half-precision and will be lost.
3383      */
3384     if (zExp < 1) {
3385         /* Will be denormal in halfprec */
3386         mask = 0x00ffffff;
3387         if (zExp >= -11) {
3388             mask >>= 11 + zExp;
3389         }
3390     } else {
3391         /* Normal number in halfprec */
3392         mask = 0x00001fff;
3393     }
3394
3395     switch (status->float_rounding_mode) {
3396     case float_round_nearest_even:
3397         increment = (mask + 1) >> 1;
3398         if ((zSig & mask) == increment) {
3399             increment = zSig & (increment << 1);
3400         }
3401         break;
3402     case float_round_ties_away:
3403         increment = (mask + 1) >> 1;
3404         break;
3405     case float_round_up:
3406         increment = zSign ? 0 : mask;
3407         break;
3408     case float_round_down:
3409         increment = zSign ? mask : 0;
3410         break;
3411     default: /* round_to_zero */
3412         increment = 0;
3413         break;
3414     }
3415
3416     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3417
3418     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3419         if (ieee) {
3420             float_raise(float_flag_overflow | float_flag_inexact, status);
3421             return packFloat16(zSign, 0x1f, 0);
3422         } else {
3423             float_raise(float_flag_invalid, status);
3424             return packFloat16(zSign, 0x1f, 0x3ff);
3425         }
3426     }
3427
3428     if (zExp < 0) {
3429         /* Note that flush-to-zero does not affect half-precision results */
3430         is_tiny =
3431             (status->float_detect_tininess == float_tininess_before_rounding)
3432             || (zExp < -1)
3433             || (!rounding_bumps_exp);
3434     }
3435     if (zSig & mask) {
3436         float_raise(float_flag_inexact, status);
3437         if (is_tiny) {
3438             float_raise(float_flag_underflow, status);
3439         }
3440     }
3441
3442     zSig += increment;
3443     if (rounding_bumps_exp) {
3444         zSig >>= 1;
3445         zExp++;
3446     }
3447
3448     if (zExp < -10) {
3449         return packFloat16(zSign, 0, 0);
3450     }
3451     if (zExp < 0) {
3452         zSig >>= -zExp;
3453         zExp = 0;
3454     }
3455     return packFloat16(zSign, zExp, zSig >> 13);
3456 }
3457
3458 static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
3459                                       uint32_t *zSigPtr)
3460 {
3461     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3462     *zSigPtr = aSig << shiftCount;
3463     *zExpPtr = 1 - shiftCount;
3464 }
3465
3466 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3467    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3468
3469 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3470 {
3471     flag aSign;
3472     int_fast16_t aExp;
3473     uint32_t aSig;
3474
3475     aSign = extractFloat16Sign(a);
3476     aExp = extractFloat16Exp(a);
3477     aSig = extractFloat16Frac(a);
3478
3479     if (aExp == 0x1f && ieee) {
3480         if (aSig) {
3481             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3482         }
3483         return packFloat32(aSign, 0xff, 0);
3484     }
3485     if (aExp == 0) {
3486         if (aSig == 0) {
3487             return packFloat32(aSign, 0, 0);
3488         }
3489
3490         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3491         aExp--;
3492     }
3493     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3494 }
3495
3496 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3497 {
3498     flag aSign;
3499     int_fast16_t aExp;
3500     uint32_t aSig;
3501
3502     a = float32_squash_input_denormal(a, status);
3503
3504     aSig = extractFloat32Frac( a );
3505     aExp = extractFloat32Exp( a );
3506     aSign = extractFloat32Sign( a );
3507     if ( aExp == 0xFF ) {
3508         if (aSig) {
3509             /* Input is a NaN */
3510             if (!ieee) {
3511                 float_raise(float_flag_invalid, status);
3512                 return packFloat16(aSign, 0, 0);
3513             }
3514             return commonNaNToFloat16(
3515                 float32ToCommonNaN(a, status), status);
3516         }
3517         /* Infinity */
3518         if (!ieee) {
3519             float_raise(float_flag_invalid, status);
3520             return packFloat16(aSign, 0x1f, 0x3ff);
3521         }
3522         return packFloat16(aSign, 0x1f, 0);
3523     }
3524     if (aExp == 0 && aSig == 0) {
3525         return packFloat16(aSign, 0, 0);
3526     }
3527     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3528      * even if the input is denormal; however this is harmless because
3529      * the largest possible single-precision denormal is still smaller
3530      * than the smallest representable half-precision denormal, and so we
3531      * will end up ignoring aSig and returning via the "always return zero"
3532      * codepath.
3533      */
3534     aSig |= 0x00800000;
3535     aExp -= 0x71;
3536
3537     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3538 }
3539
3540 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3541 {
3542     flag aSign;
3543     int_fast16_t aExp;
3544     uint32_t aSig;
3545
3546     aSign = extractFloat16Sign(a);
3547     aExp = extractFloat16Exp(a);
3548     aSig = extractFloat16Frac(a);
3549
3550     if (aExp == 0x1f && ieee) {
3551         if (aSig) {
3552             return commonNaNToFloat64(
3553                 float16ToCommonNaN(a, status), status);
3554         }
3555         return packFloat64(aSign, 0x7ff, 0);
3556     }
3557     if (aExp == 0) {
3558         if (aSig == 0) {
3559             return packFloat64(aSign, 0, 0);
3560         }
3561
3562         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3563         aExp--;
3564     }
3565     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3566 }
3567
3568 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3569 {
3570     flag aSign;
3571     int_fast16_t aExp;
3572     uint64_t aSig;
3573     uint32_t zSig;
3574
3575     a = float64_squash_input_denormal(a, status);
3576
3577     aSig = extractFloat64Frac(a);
3578     aExp = extractFloat64Exp(a);
3579     aSign = extractFloat64Sign(a);
3580     if (aExp == 0x7FF) {
3581         if (aSig) {
3582             /* Input is a NaN */
3583             if (!ieee) {
3584                 float_raise(float_flag_invalid, status);
3585                 return packFloat16(aSign, 0, 0);
3586             }
3587             return commonNaNToFloat16(
3588                 float64ToCommonNaN(a, status), status);
3589         }
3590         /* Infinity */
3591         if (!ieee) {
3592             float_raise(float_flag_invalid, status);
3593             return packFloat16(aSign, 0x1f, 0x3ff);
3594         }
3595         return packFloat16(aSign, 0x1f, 0);
3596     }
3597     shift64RightJamming(aSig, 29, &aSig);
3598     zSig = aSig;
3599     if (aExp == 0 && zSig == 0) {
3600         return packFloat16(aSign, 0, 0);
3601     }
3602     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3603      * even if the input is denormal; however this is harmless because
3604      * the largest possible single-precision denormal is still smaller
3605      * than the smallest representable half-precision denormal, and so we
3606      * will end up ignoring aSig and returning via the "always return zero"
3607      * codepath.
3608      */
3609     zSig |= 0x00800000;
3610     aExp -= 0x3F1;
3611
3612     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3613 }
3614
3615 /*----------------------------------------------------------------------------
3616 | Returns the result of converting the double-precision floating-point value
3617 | `a' to the extended double-precision floating-point format.  The conversion
3618 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3619 | Arithmetic.
3620 *----------------------------------------------------------------------------*/
3621
3622 floatx80 float64_to_floatx80(float64 a, float_status *status)
3623 {
3624     flag aSign;
3625     int_fast16_t aExp;
3626     uint64_t aSig;
3627
3628     a = float64_squash_input_denormal(a, status);
3629     aSig = extractFloat64Frac( a );
3630     aExp = extractFloat64Exp( a );
3631     aSign = extractFloat64Sign( a );
3632     if ( aExp == 0x7FF ) {
3633         if (aSig) {
3634             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3635         }
3636         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3637     }
3638     if ( aExp == 0 ) {
3639         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3640         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3641     }
3642     return
3643         packFloatx80(
3644             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3645
3646 }
3647
3648 /*----------------------------------------------------------------------------
3649 | Returns the result of converting the double-precision floating-point value
3650 | `a' to the quadruple-precision floating-point format.  The conversion is
3651 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3652 | Arithmetic.
3653 *----------------------------------------------------------------------------*/
3654
3655 float128 float64_to_float128(float64 a, float_status *status)
3656 {
3657     flag aSign;
3658     int_fast16_t aExp;
3659     uint64_t aSig, zSig0, zSig1;
3660
3661     a = float64_squash_input_denormal(a, status);
3662     aSig = extractFloat64Frac( a );
3663     aExp = extractFloat64Exp( a );
3664     aSign = extractFloat64Sign( a );
3665     if ( aExp == 0x7FF ) {
3666         if (aSig) {
3667             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3668         }
3669         return packFloat128( aSign, 0x7FFF, 0, 0 );
3670     }
3671     if ( aExp == 0 ) {
3672         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3673         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3674         --aExp;
3675     }
3676     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3677     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3678
3679 }
3680
3681 /*----------------------------------------------------------------------------
3682 | Rounds the double-precision floating-point value `a' to an integer, and
3683 | returns the result as a double-precision floating-point value.  The
3684 | operation is performed according to the IEC/IEEE Standard for Binary
3685 | Floating-Point Arithmetic.
3686 *----------------------------------------------------------------------------*/
3687
3688 float64 float64_round_to_int(float64 a, float_status *status)
3689 {
3690     flag aSign;
3691     int_fast16_t aExp;
3692     uint64_t lastBitMask, roundBitsMask;
3693     uint64_t z;
3694     a = float64_squash_input_denormal(a, status);
3695
3696     aExp = extractFloat64Exp( a );
3697     if ( 0x433 <= aExp ) {
3698         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3699             return propagateFloat64NaN(a, a, status);
3700         }
3701         return a;
3702     }
3703     if ( aExp < 0x3FF ) {
3704         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3705         status->float_exception_flags |= float_flag_inexact;
3706         aSign = extractFloat64Sign( a );
3707         switch (status->float_rounding_mode) {
3708          case float_round_nearest_even:
3709             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3710                 return packFloat64( aSign, 0x3FF, 0 );
3711             }
3712             break;
3713         case float_round_ties_away:
3714             if (aExp == 0x3FE) {
3715                 return packFloat64(aSign, 0x3ff, 0);
3716             }
3717             break;
3718          case float_round_down:
3719             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3720          case float_round_up:
3721             return make_float64(
3722             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3723         }
3724         return packFloat64( aSign, 0, 0 );
3725     }
3726     lastBitMask = 1;
3727     lastBitMask <<= 0x433 - aExp;
3728     roundBitsMask = lastBitMask - 1;
3729     z = float64_val(a);
3730     switch (status->float_rounding_mode) {
3731     case float_round_nearest_even:
3732         z += lastBitMask >> 1;
3733         if ((z & roundBitsMask) == 0) {
3734             z &= ~lastBitMask;
3735         }
3736         break;
3737     case float_round_ties_away:
3738         z += lastBitMask >> 1;
3739         break;
3740     case float_round_to_zero:
3741         break;
3742     case float_round_up:
3743         if (!extractFloat64Sign(make_float64(z))) {
3744             z += roundBitsMask;
3745         }
3746         break;
3747     case float_round_down:
3748         if (extractFloat64Sign(make_float64(z))) {
3749             z += roundBitsMask;
3750         }
3751         break;
3752     default:
3753         abort();
3754     }
3755     z &= ~ roundBitsMask;
3756     if (z != float64_val(a)) {
3757         status->float_exception_flags |= float_flag_inexact;
3758     }
3759     return make_float64(z);
3760
3761 }
3762
3763 float64 float64_trunc_to_int(float64 a, float_status *status)
3764 {
3765     int oldmode;
3766     float64 res;
3767     oldmode = status->float_rounding_mode;
3768     status->float_rounding_mode = float_round_to_zero;
3769     res = float64_round_to_int(a, status);
3770     status->float_rounding_mode = oldmode;
3771     return res;
3772 }
3773
3774 /*----------------------------------------------------------------------------
3775 | Returns the result of adding the absolute values of the double-precision
3776 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3777 | before being returned.  `zSign' is ignored if the result is a NaN.
3778 | The addition is performed according to the IEC/IEEE Standard for Binary
3779 | Floating-Point Arithmetic.
3780 *----------------------------------------------------------------------------*/
3781
3782 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3783                               float_status *status)
3784 {
3785     int_fast16_t aExp, bExp, zExp;
3786     uint64_t aSig, bSig, zSig;
3787     int_fast16_t expDiff;
3788
3789     aSig = extractFloat64Frac( a );
3790     aExp = extractFloat64Exp( a );
3791     bSig = extractFloat64Frac( b );
3792     bExp = extractFloat64Exp( b );
3793     expDiff = aExp - bExp;
3794     aSig <<= 9;
3795     bSig <<= 9;
3796     if ( 0 < expDiff ) {
3797         if ( aExp == 0x7FF ) {
3798             if (aSig) {
3799                 return propagateFloat64NaN(a, b, status);
3800             }
3801             return a;
3802         }
3803         if ( bExp == 0 ) {
3804             --expDiff;
3805         }
3806         else {
3807             bSig |= LIT64( 0x2000000000000000 );
3808         }
3809         shift64RightJamming( bSig, expDiff, &bSig );
3810         zExp = aExp;
3811     }
3812     else if ( expDiff < 0 ) {
3813         if ( bExp == 0x7FF ) {
3814             if (bSig) {
3815                 return propagateFloat64NaN(a, b, status);
3816             }
3817             return packFloat64( zSign, 0x7FF, 0 );
3818         }
3819         if ( aExp == 0 ) {
3820             ++expDiff;
3821         }
3822         else {
3823             aSig |= LIT64( 0x2000000000000000 );
3824         }
3825         shift64RightJamming( aSig, - expDiff, &aSig );
3826         zExp = bExp;
3827     }
3828     else {
3829         if ( aExp == 0x7FF ) {
3830             if (aSig | bSig) {
3831                 return propagateFloat64NaN(a, b, status);
3832             }
3833             return a;
3834         }
3835         if ( aExp == 0 ) {
3836             if (status->flush_to_zero) {
3837                 if (aSig | bSig) {
3838                     float_raise(float_flag_output_denormal, status);
3839                 }
3840                 return packFloat64(zSign, 0, 0);
3841             }
3842             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3843         }
3844         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3845         zExp = aExp;
3846         goto roundAndPack;
3847     }
3848     aSig |= LIT64( 0x2000000000000000 );
3849     zSig = ( aSig + bSig )<<1;
3850     --zExp;
3851     if ( (int64_t) zSig < 0 ) {
3852         zSig = aSig + bSig;
3853         ++zExp;
3854     }
3855  roundAndPack:
3856     return roundAndPackFloat64(zSign, zExp, zSig, status);
3857
3858 }
3859
3860 /*----------------------------------------------------------------------------
3861 | Returns the result of subtracting the absolute values of the double-
3862 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3863 | difference is negated before being returned.  `zSign' is ignored if the
3864 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3865 | Standard for Binary Floating-Point Arithmetic.
3866 *----------------------------------------------------------------------------*/
3867
3868 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3869                               float_status *status)
3870 {
3871     int_fast16_t aExp, bExp, zExp;
3872     uint64_t aSig, bSig, zSig;
3873     int_fast16_t expDiff;
3874
3875     aSig = extractFloat64Frac( a );
3876     aExp = extractFloat64Exp( a );
3877     bSig = extractFloat64Frac( b );
3878     bExp = extractFloat64Exp( b );
3879     expDiff = aExp - bExp;
3880     aSig <<= 10;
3881     bSig <<= 10;
3882     if ( 0 < expDiff ) goto aExpBigger;
3883     if ( expDiff < 0 ) goto bExpBigger;
3884     if ( aExp == 0x7FF ) {
3885         if (aSig | bSig) {
3886             return propagateFloat64NaN(a, b, status);
3887         }
3888         float_raise(float_flag_invalid, status);
3889         return float64_default_nan;
3890     }
3891     if ( aExp == 0 ) {
3892         aExp = 1;
3893         bExp = 1;
3894     }
3895     if ( bSig < aSig ) goto aBigger;
3896     if ( aSig < bSig ) goto bBigger;
3897     return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
3898  bExpBigger:
3899     if ( bExp == 0x7FF ) {
3900         if (bSig) {
3901             return propagateFloat64NaN(a, b, status);
3902         }
3903         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3904     }
3905     if ( aExp == 0 ) {
3906         ++expDiff;
3907     }
3908     else {
3909         aSig |= LIT64( 0x4000000000000000 );
3910     }
3911     shift64RightJamming( aSig, - expDiff, &aSig );
3912     bSig |= LIT64( 0x4000000000000000 );
3913  bBigger:
3914     zSig = bSig - aSig;
3915     zExp = bExp;
3916     zSign ^= 1;
3917     goto normalizeRoundAndPack;
3918  aExpBigger:
3919     if ( aExp == 0x7FF ) {
3920         if (aSig) {
3921             return propagateFloat64NaN(a, b, status);
3922         }
3923         return a;
3924     }
3925     if ( bExp == 0 ) {
3926         --expDiff;
3927     }
3928     else {
3929         bSig |= LIT64( 0x4000000000000000 );
3930     }
3931     shift64RightJamming( bSig, expDiff, &bSig );
3932     aSig |= LIT64( 0x4000000000000000 );
3933  aBigger:
3934     zSig = aSig - bSig;
3935     zExp = aExp;
3936  normalizeRoundAndPack:
3937     --zExp;
3938     return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
3939
3940 }
3941
3942 /*----------------------------------------------------------------------------
3943 | Returns the result of adding the double-precision floating-point values `a'
3944 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3945 | Binary Floating-Point Arithmetic.
3946 *----------------------------------------------------------------------------*/
3947
3948 float64 float64_add(float64 a, float64 b, float_status *status)
3949 {
3950     flag aSign, bSign;
3951     a = float64_squash_input_denormal(a, status);
3952     b = float64_squash_input_denormal(b, status);
3953
3954     aSign = extractFloat64Sign( a );
3955     bSign = extractFloat64Sign( b );
3956     if ( aSign == bSign ) {
3957         return addFloat64Sigs(a, b, aSign, status);
3958     }
3959     else {
3960         return subFloat64Sigs(a, b, aSign, status);
3961     }
3962
3963 }
3964
3965 /*----------------------------------------------------------------------------
3966 | Returns the result of subtracting the double-precision floating-point values
3967 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3968 | for Binary Floating-Point Arithmetic.
3969 *----------------------------------------------------------------------------*/
3970
3971 float64 float64_sub(float64 a, float64 b, float_status *status)
3972 {
3973     flag aSign, bSign;
3974     a = float64_squash_input_denormal(a, status);
3975     b = float64_squash_input_denormal(b, status);
3976
3977     aSign = extractFloat64Sign( a );
3978     bSign = extractFloat64Sign( b );
3979     if ( aSign == bSign ) {
3980         return subFloat64Sigs(a, b, aSign, status);
3981     }
3982     else {
3983         return addFloat64Sigs(a, b, aSign, status);
3984     }
3985
3986 }
3987
3988 /*----------------------------------------------------------------------------
3989 | Returns the result of multiplying the double-precision floating-point values
3990 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3991 | for Binary Floating-Point Arithmetic.
3992 *----------------------------------------------------------------------------*/
3993
3994 float64 float64_mul(float64 a, float64 b, float_status *status)
3995 {
3996     flag aSign, bSign, zSign;
3997     int_fast16_t aExp, bExp, zExp;
3998     uint64_t aSig, bSig, zSig0, zSig1;
3999
4000     a = float64_squash_input_denormal(a, status);
4001     b = float64_squash_input_denormal(b, status);
4002
4003     aSig = extractFloat64Frac( a );
4004     aExp = extractFloat64Exp( a );
4005     aSign = extractFloat64Sign( a );
4006     bSig = extractFloat64Frac( b );
4007     bExp = extractFloat64Exp( b );
4008     bSign = extractFloat64Sign( b );
4009     zSign = aSign ^ bSign;
4010     if ( aExp == 0x7FF ) {
4011         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4012             return propagateFloat64NaN(a, b, status);
4013         }
4014         if ( ( bExp | bSig ) == 0 ) {
4015             float_raise(float_flag_invalid, status);
4016             return float64_default_nan;
4017         }
4018         return packFloat64( zSign, 0x7FF, 0 );
4019     }
4020     if ( bExp == 0x7FF ) {
4021         if (bSig) {
4022             return propagateFloat64NaN(a, b, status);
4023         }
4024         if ( ( aExp | aSig ) == 0 ) {
4025             float_raise(float_flag_invalid, status);
4026             return float64_default_nan;
4027         }
4028         return packFloat64( zSign, 0x7FF, 0 );
4029     }
4030     if ( aExp == 0 ) {
4031         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4032         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4033     }
4034     if ( bExp == 0 ) {
4035         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4036         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4037     }
4038     zExp = aExp + bExp - 0x3FF;
4039     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4040     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4041     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4042     zSig0 |= ( zSig1 != 0 );
4043     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4044         zSig0 <<= 1;
4045         --zExp;
4046     }
4047     return roundAndPackFloat64(zSign, zExp, zSig0, status);
4048
4049 }
4050
4051 /*----------------------------------------------------------------------------
4052 | Returns the result of dividing the double-precision floating-point value `a'
4053 | by the corresponding value `b'.  The operation is performed according to
4054 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4055 *----------------------------------------------------------------------------*/
4056
4057 float64 float64_div(float64 a, float64 b, float_status *status)
4058 {
4059     flag aSign, bSign, zSign;
4060     int_fast16_t aExp, bExp, zExp;
4061     uint64_t aSig, bSig, zSig;
4062     uint64_t rem0, rem1;
4063     uint64_t term0, term1;
4064     a = float64_squash_input_denormal(a, status);
4065     b = float64_squash_input_denormal(b, status);
4066
4067     aSig = extractFloat64Frac( a );
4068     aExp = extractFloat64Exp( a );
4069     aSign = extractFloat64Sign( a );
4070     bSig = extractFloat64Frac( b );
4071     bExp = extractFloat64Exp( b );
4072     bSign = extractFloat64Sign( b );
4073     zSign = aSign ^ bSign;
4074     if ( aExp == 0x7FF ) {
4075         if (aSig) {
4076             return propagateFloat64NaN(a, b, status);
4077         }
4078         if ( bExp == 0x7FF ) {
4079             if (bSig) {
4080                 return propagateFloat64NaN(a, b, status);
4081             }
4082             float_raise(float_flag_invalid, status);
4083             return float64_default_nan;
4084         }
4085         return packFloat64( zSign, 0x7FF, 0 );
4086     }
4087     if ( bExp == 0x7FF ) {
4088         if (bSig) {
4089             return propagateFloat64NaN(a, b, status);
4090         }
4091         return packFloat64( zSign, 0, 0 );
4092     }
4093     if ( bExp == 0 ) {
4094         if ( bSig == 0 ) {
4095             if ( ( aExp | aSig ) == 0 ) {
4096                 float_raise(float_flag_invalid, status);
4097                 return float64_default_nan;
4098             }
4099             float_raise(float_flag_divbyzero, status);
4100             return packFloat64( zSign, 0x7FF, 0 );
4101         }
4102         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4103     }
4104     if ( aExp == 0 ) {
4105         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4106         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4107     }
4108     zExp = aExp - bExp + 0x3FD;
4109     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4110     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4111     if ( bSig <= ( aSig + aSig ) ) {
4112         aSig >>= 1;
4113         ++zExp;
4114     }
4115     zSig = estimateDiv128To64( aSig, 0, bSig );
4116     if ( ( zSig & 0x1FF ) <= 2 ) {
4117         mul64To128( bSig, zSig, &term0, &term1 );
4118         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4119         while ( (int64_t) rem0 < 0 ) {
4120             --zSig;
4121             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4122         }
4123         zSig |= ( rem1 != 0 );
4124     }
4125     return roundAndPackFloat64(zSign, zExp, zSig, status);
4126
4127 }
4128
4129 /*----------------------------------------------------------------------------
4130 | Returns the remainder of the double-precision floating-point value `a'
4131 | with respect to the corresponding value `b'.  The operation is performed
4132 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4133 *----------------------------------------------------------------------------*/
4134
4135 float64 float64_rem(float64 a, float64 b, float_status *status)
4136 {
4137     flag aSign, zSign;
4138     int_fast16_t aExp, bExp, expDiff;
4139     uint64_t aSig, bSig;
4140     uint64_t q, alternateASig;
4141     int64_t sigMean;
4142
4143     a = float64_squash_input_denormal(a, status);
4144     b = float64_squash_input_denormal(b, status);
4145     aSig = extractFloat64Frac( a );
4146     aExp = extractFloat64Exp( a );
4147     aSign = extractFloat64Sign( a );
4148     bSig = extractFloat64Frac( b );
4149     bExp = extractFloat64Exp( b );
4150     if ( aExp == 0x7FF ) {
4151         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4152             return propagateFloat64NaN(a, b, status);
4153         }
4154         float_raise(float_flag_invalid, status);
4155         return float64_default_nan;
4156     }
4157     if ( bExp == 0x7FF ) {
4158         if (bSig) {
4159             return propagateFloat64NaN(a, b, status);
4160         }
4161         return a;
4162     }
4163     if ( bExp == 0 ) {
4164         if ( bSig == 0 ) {
4165             float_raise(float_flag_invalid, status);
4166             return float64_default_nan;
4167         }
4168         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4169     }
4170     if ( aExp == 0 ) {
4171         if ( aSig == 0 ) return a;
4172         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4173     }
4174     expDiff = aExp - bExp;
4175     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4176     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4177     if ( expDiff < 0 ) {
4178         if ( expDiff < -1 ) return a;
4179         aSig >>= 1;
4180     }
4181     q = ( bSig <= aSig );
4182     if ( q ) aSig -= bSig;
4183     expDiff -= 64;
4184     while ( 0 < expDiff ) {
4185         q = estimateDiv128To64( aSig, 0, bSig );
4186         q = ( 2 < q ) ? q - 2 : 0;
4187         aSig = - ( ( bSig>>2 ) * q );
4188         expDiff -= 62;
4189     }
4190     expDiff += 64;
4191     if ( 0 < expDiff ) {
4192         q = estimateDiv128To64( aSig, 0, bSig );
4193         q = ( 2 < q ) ? q - 2 : 0;
4194         q >>= 64 - expDiff;
4195         bSig >>= 2;
4196         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4197     }
4198     else {
4199         aSig >>= 2;
4200         bSig >>= 2;
4201     }
4202     do {
4203         alternateASig = aSig;
4204         ++q;
4205         aSig -= bSig;
4206     } while ( 0 <= (int64_t) aSig );
4207     sigMean = aSig + alternateASig;
4208     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4209         aSig = alternateASig;
4210     }
4211     zSign = ( (int64_t) aSig < 0 );
4212     if ( zSign ) aSig = - aSig;
4213     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4214
4215 }
4216
4217 /*----------------------------------------------------------------------------
4218 | Returns the result of multiplying the double-precision floating-point values
4219 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4220 | multiplication.  The operation is performed according to the IEC/IEEE
4221 | Standard for Binary Floating-Point Arithmetic 754-2008.
4222 | The flags argument allows the caller to select negation of the
4223 | addend, the intermediate product, or the final result. (The difference
4224 | between this and having the caller do a separate negation is that negating
4225 | externally will flip the sign bit on NaNs.)
4226 *----------------------------------------------------------------------------*/
4227
4228 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4229                        float_status *status)
4230 {
4231     flag aSign, bSign, cSign, zSign;
4232     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
4233     uint64_t aSig, bSig, cSig;
4234     flag pInf, pZero, pSign;
4235     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4236     int shiftcount;
4237     flag signflip, infzero;
4238
4239     a = float64_squash_input_denormal(a, status);
4240     b = float64_squash_input_denormal(b, status);
4241     c = float64_squash_input_denormal(c, status);
4242     aSig = extractFloat64Frac(a);
4243     aExp = extractFloat64Exp(a);
4244     aSign = extractFloat64Sign(a);
4245     bSig = extractFloat64Frac(b);
4246     bExp = extractFloat64Exp(b);
4247     bSign = extractFloat64Sign(b);
4248     cSig = extractFloat64Frac(c);
4249     cExp = extractFloat64Exp(c);
4250     cSign = extractFloat64Sign(c);
4251
4252     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4253                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4254
4255     /* It is implementation-defined whether the cases of (0,inf,qnan)
4256      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4257      * they return if they do), so we have to hand this information
4258      * off to the target-specific pick-a-NaN routine.
4259      */
4260     if (((aExp == 0x7ff) && aSig) ||
4261         ((bExp == 0x7ff) && bSig) ||
4262         ((cExp == 0x7ff) && cSig)) {
4263         return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4264     }
4265
4266     if (infzero) {
4267         float_raise(float_flag_invalid, status);
4268         return float64_default_nan;
4269     }
4270
4271     if (flags & float_muladd_negate_c) {
4272         cSign ^= 1;
4273     }
4274
4275     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4276
4277     /* Work out the sign and type of the product */
4278     pSign = aSign ^ bSign;
4279     if (flags & float_muladd_negate_product) {
4280         pSign ^= 1;
4281     }
4282     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4283     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4284
4285     if (cExp == 0x7ff) {
4286         if (pInf && (pSign ^ cSign)) {
4287             /* addition of opposite-signed infinities => InvalidOperation */
4288             float_raise(float_flag_invalid, status);
4289             return float64_default_nan;
4290         }
4291         /* Otherwise generate an infinity of the same sign */
4292         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4293     }
4294
4295     if (pInf) {
4296         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4297     }
4298
4299     if (pZero) {
4300         if (cExp == 0) {
4301             if (cSig == 0) {
4302                 /* Adding two exact zeroes */
4303                 if (pSign == cSign) {
4304                     zSign = pSign;
4305                 } else if (status->float_rounding_mode == float_round_down) {
4306                     zSign = 1;
4307                 } else {
4308                     zSign = 0;
4309                 }
4310                 return packFloat64(zSign ^ signflip, 0, 0);
4311             }
4312             /* Exact zero plus a denorm */
4313             if (status->flush_to_zero) {
4314                 float_raise(float_flag_output_denormal, status);
4315                 return packFloat64(cSign ^ signflip, 0, 0);
4316             }
4317         }
4318         /* Zero plus something non-zero : just return the something */
4319         if (flags & float_muladd_halve_result) {
4320             if (cExp == 0) {
4321                 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4322             }
4323             /* Subtract one to halve, and one again because roundAndPackFloat64
4324              * wants one less than the true exponent.
4325              */
4326             cExp -= 2;
4327             cSig = (cSig | 0x0010000000000000ULL) << 10;
4328             return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4329         }
4330         return packFloat64(cSign ^ signflip, cExp, cSig);
4331     }
4332
4333     if (aExp == 0) {
4334         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4335     }
4336     if (bExp == 0) {
4337         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4338     }
4339
4340     /* Calculate the actual result a * b + c */
4341
4342     /* Multiply first; this is easy. */
4343     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4344      * because we want the true exponent, not the "one-less-than"
4345      * flavour that roundAndPackFloat64() takes.
4346      */
4347     pExp = aExp + bExp - 0x3fe;
4348     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4349     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4350     mul64To128(aSig, bSig, &pSig0, &pSig1);
4351     if ((int64_t)(pSig0 << 1) >= 0) {
4352         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4353         pExp--;
4354     }
4355
4356     zSign = pSign ^ signflip;
4357
4358     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4359      * bit in position 126.
4360      */
4361     if (cExp == 0) {
4362         if (!cSig) {
4363             /* Throw out the special case of c being an exact zero now */
4364             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4365             if (flags & float_muladd_halve_result) {
4366                 pExp--;
4367             }
4368             return roundAndPackFloat64(zSign, pExp - 1,
4369                                        pSig1, status);
4370         }
4371         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4372     }
4373
4374     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4375      * significand of the addend, with the explicit bit in position 126.
4376      */
4377     cSig0 = cSig << (126 - 64 - 52);
4378     cSig1 = 0;
4379     cSig0 |= LIT64(0x4000000000000000);
4380     expDiff = pExp - cExp;
4381
4382     if (pSign == cSign) {
4383         /* Addition */
4384         if (expDiff > 0) {
4385             /* scale c to match p */
4386             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4387             zExp = pExp;
4388         } else if (expDiff < 0) {
4389             /* scale p to match c */
4390             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4391             zExp = cExp;
4392         } else {
4393             /* no scaling needed */
4394             zExp = cExp;
4395         }
4396         /* Add significands and make sure explicit bit ends up in posn 126 */
4397         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4398         if ((int64_t)zSig0 < 0) {
4399             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4400         } else {
4401             zExp--;
4402         }
4403         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4404         if (flags & float_muladd_halve_result) {
4405             zExp--;
4406         }
4407         return roundAndPackFloat64(zSign, zExp, zSig1, status);
4408     } else {
4409         /* Subtraction */
4410         if (expDiff > 0) {
4411             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4412             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4413             zExp = pExp;
4414         } else if (expDiff < 0) {
4415             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4416             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4417             zExp = cExp;
4418             zSign ^= 1;
4419         } else {
4420             zExp = pExp;
4421             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4422                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4423             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4424                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4425                 zSign ^= 1;
4426             } else {
4427                 /* Exact zero */
4428                 zSign = signflip;
4429                 if (status->float_rounding_mode == float_round_down) {
4430                     zSign ^= 1;
4431                 }
4432                 return packFloat64(zSign, 0, 0);
4433             }
4434         }
4435         --zExp;
4436         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4437          * starting with the significand in a pair of uint64_t.
4438          */
4439         if (zSig0) {
4440             shiftcount = countLeadingZeros64(zSig0) - 1;
4441             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4442             if (zSig1) {
4443                 zSig0 |= 1;
4444             }
4445             zExp -= shiftcount;
4446         } else {
4447             shiftcount = countLeadingZeros64(zSig1);
4448             if (shiftcount == 0) {
4449                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4450                 zExp -= 63;
4451             } else {
4452                 shiftcount--;
4453                 zSig0 = zSig1 << shiftcount;
4454                 zExp -= (shiftcount + 64);
4455             }
4456         }
4457         if (flags & float_muladd_halve_result) {
4458             zExp--;
4459         }
4460         return roundAndPackFloat64(zSign, zExp, zSig0, status);
4461     }
4462 }
4463
4464 /*----------------------------------------------------------------------------
4465 | Returns the square root of the double-precision floating-point value `a'.
4466 | The operation is performed according to the IEC/IEEE Standard for Binary
4467 | Floating-Point Arithmetic.
4468 *----------------------------------------------------------------------------*/
4469
4470 float64 float64_sqrt(float64 a, float_status *status)
4471 {
4472     flag aSign;
4473     int_fast16_t aExp, zExp;
4474     uint64_t aSig, zSig, doubleZSig;
4475     uint64_t rem0, rem1, term0, term1;
4476     a = float64_squash_input_denormal(a, status);
4477
4478     aSig = extractFloat64Frac( a );
4479     aExp = extractFloat64Exp( a );
4480     aSign = extractFloat64Sign( a );
4481     if ( aExp == 0x7FF ) {
4482         if (aSig) {
4483             return propagateFloat64NaN(a, a, status);
4484         }
4485         if ( ! aSign ) return a;
4486         float_raise(float_flag_invalid, status);
4487         return float64_default_nan;
4488     }
4489     if ( aSign ) {
4490         if ( ( aExp | aSig ) == 0 ) return a;
4491         float_raise(float_flag_invalid, status);
4492         return float64_default_nan;
4493     }
4494     if ( aExp == 0 ) {
4495         if ( aSig == 0 ) return float64_zero;
4496         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4497     }
4498     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4499     aSig |= LIT64( 0x0010000000000000 );
4500     zSig = estimateSqrt32( aExp, aSig>>21 );
4501     aSig <<= 9 - ( aExp & 1 );
4502     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4503     if ( ( zSig & 0x1FF ) <= 5 ) {
4504         doubleZSig = zSig<<1;
4505         mul64To128( zSig, zSig, &term0, &term1 );
4506         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4507         while ( (int64_t) rem0 < 0 ) {
4508             --zSig;
4509             doubleZSig -= 2;
4510             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4511         }
4512         zSig |= ( ( rem0 | rem1 ) != 0 );
4513     }
4514     return roundAndPackFloat64(0, zExp, zSig, status);
4515
4516 }
4517
4518 /*----------------------------------------------------------------------------
4519 | Returns the binary log of the double-precision floating-point value `a'.
4520 | The operation is performed according to the IEC/IEEE Standard for Binary
4521 | Floating-Point Arithmetic.
4522 *----------------------------------------------------------------------------*/
4523 float64 float64_log2(float64 a, float_status *status)
4524 {
4525     flag aSign, zSign;
4526     int_fast16_t aExp;
4527     uint64_t aSig, aSig0, aSig1, zSig, i;
4528     a = float64_squash_input_denormal(a, status);
4529
4530     aSig = extractFloat64Frac( a );
4531     aExp = extractFloat64Exp( a );
4532     aSign = extractFloat64Sign( a );
4533
4534     if ( aExp == 0 ) {
4535         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4536         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4537     }
4538     if ( aSign ) {
4539         float_raise(float_flag_invalid, status);
4540         return float64_default_nan;
4541     }
4542     if ( aExp == 0x7FF ) {
4543         if (aSig) {
4544             return propagateFloat64NaN(a, float64_zero, status);
4545         }
4546         return a;
4547     }
4548
4549     aExp -= 0x3FF;
4550     aSig |= LIT64( 0x0010000000000000 );
4551     zSign = aExp < 0;
4552     zSig = (uint64_t)aExp << 52;
4553     for (i = 1LL << 51; i > 0; i >>= 1) {
4554         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4555         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4556         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4557             aSig >>= 1;
4558             zSig |= i;
4559         }
4560     }
4561
4562     if ( zSign )
4563         zSig = -zSig;
4564     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4565 }
4566
4567 /*----------------------------------------------------------------------------
4568 | Returns 1 if the double-precision floating-point value `a' is equal to the
4569 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4570 | if either operand is a NaN.  Otherwise, the comparison is performed
4571 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4572 *----------------------------------------------------------------------------*/
4573
4574 int float64_eq(float64 a, float64 b, float_status *status)
4575 {
4576     uint64_t av, bv;
4577     a = float64_squash_input_denormal(a, status);
4578     b = float64_squash_input_denormal(b, status);
4579
4580     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4581          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4582        ) {
4583         float_raise(float_flag_invalid, status);
4584         return 0;
4585     }
4586     av = float64_val(a);
4587     bv = float64_val(b);
4588     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4589
4590 }
4591
4592 /*----------------------------------------------------------------------------
4593 | Returns 1 if the double-precision floating-point value `a' is less than or
4594 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4595 | exception is raised if either operand is a NaN.  The comparison is performed
4596 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4597 *----------------------------------------------------------------------------*/
4598
4599 int float64_le(float64 a, float64 b, float_status *status)
4600 {
4601     flag aSign, bSign;
4602     uint64_t av, bv;
4603     a = float64_squash_input_denormal(a, status);
4604     b = float64_squash_input_denormal(b, status);
4605
4606     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4607          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4608        ) {
4609         float_raise(float_flag_invalid, status);
4610         return 0;
4611     }
4612     aSign = extractFloat64Sign( a );
4613     bSign = extractFloat64Sign( b );
4614     av = float64_val(a);
4615     bv = float64_val(b);
4616     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4617     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4618
4619 }
4620
4621 /*----------------------------------------------------------------------------
4622 | Returns 1 if the double-precision floating-point value `a' is less than
4623 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4624 | raised if either operand is a NaN.  The comparison is performed according
4625 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4626 *----------------------------------------------------------------------------*/
4627
4628 int float64_lt(float64 a, float64 b, float_status *status)
4629 {
4630     flag aSign, bSign;
4631     uint64_t av, bv;
4632
4633     a = float64_squash_input_denormal(a, status);
4634     b = float64_squash_input_denormal(b, status);
4635     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4636          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4637        ) {
4638         float_raise(float_flag_invalid, status);
4639         return 0;
4640     }
4641     aSign = extractFloat64Sign( a );
4642     bSign = extractFloat64Sign( b );
4643     av = float64_val(a);
4644     bv = float64_val(b);
4645     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4646     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4647
4648 }
4649
4650 /*----------------------------------------------------------------------------
4651 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4652 | be compared, and 0 otherwise.  The invalid exception is raised if either
4653 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4654 | Standard for Binary Floating-Point Arithmetic.
4655 *----------------------------------------------------------------------------*/
4656
4657 int float64_unordered(float64 a, float64 b, float_status *status)
4658 {
4659     a = float64_squash_input_denormal(a, status);
4660     b = float64_squash_input_denormal(b, status);
4661
4662     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4663          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4664        ) {
4665         float_raise(float_flag_invalid, status);
4666         return 1;
4667     }
4668     return 0;
4669 }
4670
4671 /*----------------------------------------------------------------------------
4672 | Returns 1 if the double-precision floating-point value `a' is equal to the
4673 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4674 | exception.The comparison is performed according to the IEC/IEEE Standard
4675 | for Binary Floating-Point Arithmetic.
4676 *----------------------------------------------------------------------------*/
4677
4678 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4679 {
4680     uint64_t av, bv;
4681     a = float64_squash_input_denormal(a, status);
4682     b = float64_squash_input_denormal(b, status);
4683
4684     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4685          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4686        ) {
4687         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4688             float_raise(float_flag_invalid, status);
4689         }
4690         return 0;
4691     }
4692     av = float64_val(a);
4693     bv = float64_val(b);
4694     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4695
4696 }
4697
4698 /*----------------------------------------------------------------------------
4699 | Returns 1 if the double-precision floating-point value `a' is less than or
4700 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4701 | cause an exception.  Otherwise, the comparison is performed according to the
4702 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4703 *----------------------------------------------------------------------------*/
4704
4705 int float64_le_quiet(float64 a, float64 b, float_status *status)
4706 {
4707     flag aSign, bSign;
4708     uint64_t av, bv;
4709     a = float64_squash_input_denormal(a, status);
4710     b = float64_squash_input_denormal(b, status);
4711
4712     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4713          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4714        ) {
4715         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4716             float_raise(float_flag_invalid, status);
4717         }
4718         return 0;
4719     }
4720     aSign = extractFloat64Sign( a );
4721     bSign = extractFloat64Sign( b );
4722     av = float64_val(a);
4723     bv = float64_val(b);
4724     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4725     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4726
4727 }
4728
4729 /*----------------------------------------------------------------------------
4730 | Returns 1 if the double-precision floating-point value `a' is less than
4731 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4732 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4733 | Standard for Binary Floating-Point Arithmetic.
4734 *----------------------------------------------------------------------------*/
4735
4736 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4737 {
4738     flag aSign, bSign;
4739     uint64_t av, bv;
4740     a = float64_squash_input_denormal(a, status);
4741     b = float64_squash_input_denormal(b, status);
4742
4743     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4744          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4745        ) {
4746         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4747             float_raise(float_flag_invalid, status);
4748         }
4749         return 0;
4750     }
4751     aSign = extractFloat64Sign( a );
4752     bSign = extractFloat64Sign( b );
4753     av = float64_val(a);
4754     bv = float64_val(b);
4755     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4756     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4757
4758 }
4759
4760 /*----------------------------------------------------------------------------
4761 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4762 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4763 | comparison is performed according to the IEC/IEEE Standard for Binary
4764 | Floating-Point Arithmetic.
4765 *----------------------------------------------------------------------------*/
4766
4767 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4768 {
4769     a = float64_squash_input_denormal(a, status);
4770     b = float64_squash_input_denormal(b, status);
4771
4772     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4773          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4774        ) {
4775         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4776             float_raise(float_flag_invalid, status);
4777         }
4778         return 1;
4779     }
4780     return 0;
4781 }
4782
4783 /*----------------------------------------------------------------------------
4784 | Returns the result of converting the extended double-precision floating-
4785 | point value `a' to the 32-bit two's complement integer format.  The
4786 | conversion is performed according to the IEC/IEEE Standard for Binary
4787 | Floating-Point Arithmetic---which means in particular that the conversion
4788 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4789 | largest positive integer is returned.  Otherwise, if the conversion
4790 | overflows, the largest integer with the same sign as `a' is returned.
4791 *----------------------------------------------------------------------------*/
4792
4793 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4794 {
4795     flag aSign;
4796     int32_t aExp, shiftCount;
4797     uint64_t aSig;
4798
4799     aSig = extractFloatx80Frac( a );
4800     aExp = extractFloatx80Exp( a );
4801     aSign = extractFloatx80Sign( a );
4802     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4803     shiftCount = 0x4037 - aExp;
4804     if ( shiftCount <= 0 ) shiftCount = 1;
4805     shift64RightJamming( aSig, shiftCount, &aSig );
4806     return roundAndPackInt32(aSign, aSig, status);
4807
4808 }
4809
4810 /*----------------------------------------------------------------------------
4811 | Returns the result of converting the extended double-precision floating-
4812 | point value `a' to the 32-bit two's complement integer format.  The
4813 | conversion is performed according to the IEC/IEEE Standard for Binary
4814 | Floating-Point Arithmetic, except that the conversion is always rounded
4815 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4816 | Otherwise, if the conversion overflows, the largest integer with the same
4817 | sign as `a' is returned.
4818 *----------------------------------------------------------------------------*/
4819
4820 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4821 {
4822     flag aSign;
4823     int32_t aExp, shiftCount;
4824     uint64_t aSig, savedASig;
4825     int32_t z;
4826
4827     aSig = extractFloatx80Frac( a );
4828     aExp = extractFloatx80Exp( a );
4829     aSign = extractFloatx80Sign( a );
4830     if ( 0x401E < aExp ) {
4831         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4832         goto invalid;
4833     }
4834     else if ( aExp < 0x3FFF ) {
4835         if (aExp || aSig) {
4836             status->float_exception_flags |= float_flag_inexact;
4837         }
4838         return 0;
4839     }
4840     shiftCount = 0x403E - aExp;
4841     savedASig = aSig;
4842     aSig >>= shiftCount;
4843     z = aSig;
4844     if ( aSign ) z = - z;
4845     if ( ( z < 0 ) ^ aSign ) {
4846  invalid:
4847         float_raise(float_flag_invalid, status);
4848         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4849     }
4850     if ( ( aSig<<shiftCount ) != savedASig ) {
4851         status->float_exception_flags |= float_flag_inexact;
4852     }
4853     return z;
4854
4855 }
4856
4857 /*----------------------------------------------------------------------------
4858 | Returns the result of converting the extended double-precision floating-
4859 | point value `a' to the 64-bit two's complement integer format.  The
4860 | conversion is performed according to the IEC/IEEE Standard for Binary
4861 | Floating-Point Arithmetic---which means in particular that the conversion
4862 | is rounded according to the current rounding mode.  If `a' is a NaN,
4863 | the largest positive integer is returned.  Otherwise, if the conversion
4864 | overflows, the largest integer with the same sign as `a' is returned.
4865 *----------------------------------------------------------------------------*/
4866
4867 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4868 {
4869     flag aSign;
4870     int32_t aExp, shiftCount;
4871     uint64_t aSig, aSigExtra;
4872
4873     aSig = extractFloatx80Frac( a );
4874     aExp = extractFloatx80Exp( a );
4875     aSign = extractFloatx80Sign( a );
4876     shiftCount = 0x403E - aExp;
4877     if ( shiftCount <= 0 ) {
4878         if ( shiftCount ) {
4879             float_raise(float_flag_invalid, status);
4880             if (    ! aSign
4881                  || (    ( aExp == 0x7FFF )
4882                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4883                ) {
4884                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4885             }
4886             return (int64_t) LIT64( 0x8000000000000000 );
4887         }
4888         aSigExtra = 0;
4889     }
4890     else {
4891         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4892     }
4893     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4894
4895 }
4896
4897 /*----------------------------------------------------------------------------
4898 | Returns the result of converting the extended double-precision floating-
4899 | point value `a' to the 64-bit two's complement integer format.  The
4900 | conversion is performed according to the IEC/IEEE Standard for Binary
4901 | Floating-Point Arithmetic, except that the conversion is always rounded
4902 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4903 | Otherwise, if the conversion overflows, the largest integer with the same
4904 | sign as `a' is returned.
4905 *----------------------------------------------------------------------------*/
4906
4907 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4908 {
4909     flag aSign;
4910     int32_t aExp, shiftCount;
4911     uint64_t aSig;
4912     int64_t z;
4913
4914     aSig = extractFloatx80Frac( a );
4915     aExp = extractFloatx80Exp( a );
4916     aSign = extractFloatx80Sign( a );
4917     shiftCount = aExp - 0x403E;
4918     if ( 0 <= shiftCount ) {
4919         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4920         if ( ( a.high != 0xC03E ) || aSig ) {
4921             float_raise(float_flag_invalid, status);
4922             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4923                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4924             }
4925         }
4926         return (int64_t) LIT64( 0x8000000000000000 );
4927     }
4928     else if ( aExp < 0x3FFF ) {
4929         if (aExp | aSig) {
4930             status->float_exception_flags |= float_flag_inexact;
4931         }
4932         return 0;
4933     }
4934     z = aSig>>( - shiftCount );
4935     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4936         status->float_exception_flags |= float_flag_inexact;
4937     }
4938     if ( aSign ) z = - z;
4939     return z;
4940
4941 }
4942
4943 /*----------------------------------------------------------------------------
4944 | Returns the result of converting the extended double-precision floating-
4945 | point value `a' to the single-precision floating-point format.  The
4946 | conversion is performed according to the IEC/IEEE Standard for Binary
4947 | Floating-Point Arithmetic.
4948 *----------------------------------------------------------------------------*/
4949
4950 float32 floatx80_to_float32(floatx80 a, float_status *status)
4951 {
4952     flag aSign;
4953     int32_t aExp;
4954     uint64_t aSig;
4955
4956     aSig = extractFloatx80Frac( a );
4957     aExp = extractFloatx80Exp( a );
4958     aSign = extractFloatx80Sign( a );
4959     if ( aExp == 0x7FFF ) {
4960         if ( (uint64_t) ( aSig<<1 ) ) {
4961             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4962         }
4963         return packFloat32( aSign, 0xFF, 0 );
4964     }
4965     shift64RightJamming( aSig, 33, &aSig );
4966     if ( aExp || aSig ) aExp -= 0x3F81;
4967     return roundAndPackFloat32(aSign, aExp, aSig, status);
4968
4969 }
4970
4971 /*----------------------------------------------------------------------------
4972 | Returns the result of converting the extended double-precision floating-
4973 | point value `a' to the double-precision floating-point format.  The
4974 | conversion is performed according to the IEC/IEEE Standard for Binary
4975 | Floating-Point Arithmetic.
4976 *----------------------------------------------------------------------------*/
4977
4978 float64 floatx80_to_float64(floatx80 a, float_status *status)
4979 {
4980     flag aSign;
4981     int32_t aExp;
4982     uint64_t aSig, zSig;
4983
4984     aSig = extractFloatx80Frac( a );
4985     aExp = extractFloatx80Exp( a );
4986     aSign = extractFloatx80Sign( a );
4987     if ( aExp == 0x7FFF ) {
4988         if ( (uint64_t) ( aSig<<1 ) ) {
4989             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4990         }
4991         return packFloat64( aSign, 0x7FF, 0 );
4992     }
4993     shift64RightJamming( aSig, 1, &zSig );
4994     if ( aExp || aSig ) aExp -= 0x3C01;
4995     return roundAndPackFloat64(aSign, aExp, zSig, status);
4996
4997 }
4998
4999 /*----------------------------------------------------------------------------
5000 | Returns the result of converting the extended double-precision floating-
5001 | point value `a' to the quadruple-precision floating-point format.  The
5002 | conversion is performed according to the IEC/IEEE Standard for Binary
5003 | Floating-Point Arithmetic.
5004 *----------------------------------------------------------------------------*/
5005
5006 float128 floatx80_to_float128(floatx80 a, float_status *status)
5007 {
5008     flag aSign;
5009     int_fast16_t aExp;
5010     uint64_t aSig, zSig0, zSig1;
5011
5012     aSig = extractFloatx80Frac( a );
5013     aExp = extractFloatx80Exp( a );
5014     aSign = extractFloatx80Sign( a );
5015     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5016         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5017     }
5018     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5019     return packFloat128( aSign, aExp, zSig0, zSig1 );
5020
5021 }
5022
5023 /*----------------------------------------------------------------------------
5024 | Rounds the extended double-precision floating-point value `a' to an integer,
5025 | and returns the result as an extended quadruple-precision floating-point
5026 | value.  The operation is performed according to the IEC/IEEE Standard for
5027 | Binary Floating-Point Arithmetic.
5028 *----------------------------------------------------------------------------*/
5029
5030 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5031 {
5032     flag aSign;
5033     int32_t aExp;
5034     uint64_t lastBitMask, roundBitsMask;
5035     floatx80 z;
5036
5037     aExp = extractFloatx80Exp( a );
5038     if ( 0x403E <= aExp ) {
5039         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5040             return propagateFloatx80NaN(a, a, status);
5041         }
5042         return a;
5043     }
5044     if ( aExp < 0x3FFF ) {
5045         if (    ( aExp == 0 )
5046              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5047             return a;
5048         }
5049         status->float_exception_flags |= float_flag_inexact;
5050         aSign = extractFloatx80Sign( a );
5051         switch (status->float_rounding_mode) {
5052          case float_round_nearest_even:
5053             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5054                ) {
5055                 return
5056                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5057             }
5058             break;
5059         case float_round_ties_away:
5060             if (aExp == 0x3FFE) {
5061                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5062             }
5063             break;
5064          case float_round_down:
5065             return
5066                   aSign ?
5067                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5068                 : packFloatx80( 0, 0, 0 );
5069          case float_round_up:
5070             return
5071                   aSign ? packFloatx80( 1, 0, 0 )
5072                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5073         }
5074         return packFloatx80( aSign, 0, 0 );
5075     }
5076     lastBitMask = 1;
5077     lastBitMask <<= 0x403E - aExp;
5078     roundBitsMask = lastBitMask - 1;
5079     z = a;
5080     switch (status->float_rounding_mode) {
5081     case float_round_nearest_even:
5082         z.low += lastBitMask>>1;
5083         if ((z.low & roundBitsMask) == 0) {
5084             z.low &= ~lastBitMask;
5085         }
5086         break;
5087     case float_round_ties_away:
5088         z.low += lastBitMask >> 1;
5089         break;
5090     case float_round_to_zero:
5091         break;
5092     case float_round_up:
5093         if (!extractFloatx80Sign(z)) {
5094             z.low += roundBitsMask;
5095         }
5096         break;
5097     case float_round_down:
5098         if (extractFloatx80Sign(z)) {
5099             z.low += roundBitsMask;
5100         }
5101         break;
5102     default:
5103         abort();
5104     }
5105     z.low &= ~ roundBitsMask;
5106     if ( z.low == 0 ) {
5107         ++z.high;
5108         z.low = LIT64( 0x8000000000000000 );
5109     }
5110     if (z.low != a.low) {
5111         status->float_exception_flags |= float_flag_inexact;
5112     }
5113     return z;
5114
5115 }
5116
5117 /*----------------------------------------------------------------------------
5118 | Returns the result of adding the absolute values of the extended double-
5119 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5120 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5121 | The addition is performed according to the IEC/IEEE Standard for Binary
5122 | Floating-Point Arithmetic.
5123 *----------------------------------------------------------------------------*/
5124
5125 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5126                                 float_status *status)
5127 {
5128     int32_t aExp, bExp, zExp;
5129     uint64_t aSig, bSig, zSig0, zSig1;
5130     int32_t expDiff;
5131
5132     aSig = extractFloatx80Frac( a );
5133     aExp = extractFloatx80Exp( a );
5134     bSig = extractFloatx80Frac( b );
5135     bExp = extractFloatx80Exp( b );
5136     expDiff = aExp - bExp;
5137     if ( 0 < expDiff ) {
5138         if ( aExp == 0x7FFF ) {
5139             if ((uint64_t)(aSig << 1)) {
5140                 return propagateFloatx80NaN(a, b, status);
5141             }
5142             return a;
5143         }
5144         if ( bExp == 0 ) --expDiff;
5145         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5146         zExp = aExp;
5147     }
5148     else if ( expDiff < 0 ) {
5149         if ( bExp == 0x7FFF ) {
5150             if ((uint64_t)(bSig << 1)) {
5151                 return propagateFloatx80NaN(a, b, status);
5152             }
5153             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5154         }
5155         if ( aExp == 0 ) ++expDiff;
5156         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5157         zExp = bExp;
5158     }
5159     else {
5160         if ( aExp == 0x7FFF ) {
5161             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5162                 return propagateFloatx80NaN(a, b, status);
5163             }
5164             return a;
5165         }
5166         zSig1 = 0;
5167         zSig0 = aSig + bSig;
5168         if ( aExp == 0 ) {
5169             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5170             goto roundAndPack;
5171         }
5172         zExp = aExp;
5173         goto shiftRight1;
5174     }
5175     zSig0 = aSig + bSig;
5176     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5177  shiftRight1:
5178     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5179     zSig0 |= LIT64( 0x8000000000000000 );
5180     ++zExp;
5181  roundAndPack:
5182     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5183                                 zSign, zExp, zSig0, zSig1, status);
5184 }
5185
5186 /*----------------------------------------------------------------------------
5187 | Returns the result of subtracting the absolute values of the extended
5188 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5189 | difference is negated before being returned.  `zSign' is ignored if the
5190 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5191 | Standard for Binary Floating-Point Arithmetic.
5192 *----------------------------------------------------------------------------*/
5193
5194 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5195                                 float_status *status)
5196 {
5197     int32_t aExp, bExp, zExp;
5198     uint64_t aSig, bSig, zSig0, zSig1;
5199     int32_t expDiff;
5200     floatx80 z;
5201
5202     aSig = extractFloatx80Frac( a );
5203     aExp = extractFloatx80Exp( a );
5204     bSig = extractFloatx80Frac( b );
5205     bExp = extractFloatx80Exp( b );
5206     expDiff = aExp - bExp;
5207     if ( 0 < expDiff ) goto aExpBigger;
5208     if ( expDiff < 0 ) goto bExpBigger;
5209     if ( aExp == 0x7FFF ) {
5210         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5211             return propagateFloatx80NaN(a, b, status);
5212         }
5213         float_raise(float_flag_invalid, status);
5214         z.low = floatx80_default_nan_low;
5215         z.high = floatx80_default_nan_high;
5216         return z;
5217     }
5218     if ( aExp == 0 ) {
5219         aExp = 1;
5220         bExp = 1;
5221     }
5222     zSig1 = 0;
5223     if ( bSig < aSig ) goto aBigger;
5224     if ( aSig < bSig ) goto bBigger;
5225     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5226  bExpBigger:
5227     if ( bExp == 0x7FFF ) {
5228         if ((uint64_t)(bSig << 1)) {
5229             return propagateFloatx80NaN(a, b, status);
5230         }
5231         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5232     }
5233     if ( aExp == 0 ) ++expDiff;
5234     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5235  bBigger:
5236     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5237     zExp = bExp;
5238     zSign ^= 1;
5239     goto normalizeRoundAndPack;
5240  aExpBigger:
5241     if ( aExp == 0x7FFF ) {
5242         if ((uint64_t)(aSig << 1)) {
5243             return propagateFloatx80NaN(a, b, status);
5244         }
5245         return a;
5246     }
5247     if ( bExp == 0 ) --expDiff;
5248     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5249  aBigger:
5250     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5251     zExp = aExp;
5252  normalizeRoundAndPack:
5253     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5254                                          zSign, zExp, zSig0, zSig1, status);
5255 }
5256
5257 /*----------------------------------------------------------------------------
5258 | Returns the result of adding the extended double-precision floating-point
5259 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5260 | Standard for Binary Floating-Point Arithmetic.
5261 *----------------------------------------------------------------------------*/
5262
5263 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5264 {
5265     flag aSign, bSign;
5266
5267     aSign = extractFloatx80Sign( a );
5268     bSign = extractFloatx80Sign( b );
5269     if ( aSign == bSign ) {
5270         return addFloatx80Sigs(a, b, aSign, status);
5271     }
5272     else {
5273         return subFloatx80Sigs(a, b, aSign, status);
5274     }
5275
5276 }
5277
5278 /*----------------------------------------------------------------------------
5279 | Returns the result of subtracting the extended double-precision floating-
5280 | point values `a' and `b'.  The operation is performed according to the
5281 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5282 *----------------------------------------------------------------------------*/
5283
5284 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5285 {
5286     flag aSign, bSign;
5287
5288     aSign = extractFloatx80Sign( a );
5289     bSign = extractFloatx80Sign( b );
5290     if ( aSign == bSign ) {
5291         return subFloatx80Sigs(a, b, aSign, status);
5292     }
5293     else {
5294         return addFloatx80Sigs(a, b, aSign, status);
5295     }
5296
5297 }
5298
5299 /*----------------------------------------------------------------------------
5300 | Returns the result of multiplying the extended double-precision floating-
5301 | point values `a' and `b'.  The operation is performed according to the
5302 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5303 *----------------------------------------------------------------------------*/
5304
5305 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5306 {
5307     flag aSign, bSign, zSign;
5308     int32_t aExp, bExp, zExp;
5309     uint64_t aSig, bSig, zSig0, zSig1;
5310     floatx80 z;
5311
5312     aSig = extractFloatx80Frac( a );
5313     aExp = extractFloatx80Exp( a );
5314     aSign = extractFloatx80Sign( a );
5315     bSig = extractFloatx80Frac( b );
5316     bExp = extractFloatx80Exp( b );
5317     bSign = extractFloatx80Sign( b );
5318     zSign = aSign ^ bSign;
5319     if ( aExp == 0x7FFF ) {
5320         if (    (uint64_t) ( aSig<<1 )
5321              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5322             return propagateFloatx80NaN(a, b, status);
5323         }
5324         if ( ( bExp | bSig ) == 0 ) goto invalid;
5325         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5326     }
5327     if ( bExp == 0x7FFF ) {
5328         if ((uint64_t)(bSig << 1)) {
5329             return propagateFloatx80NaN(a, b, status);
5330         }
5331         if ( ( aExp | aSig ) == 0 ) {
5332  invalid:
5333             float_raise(float_flag_invalid, status);
5334             z.low = floatx80_default_nan_low;
5335             z.high = floatx80_default_nan_high;
5336             return z;
5337         }
5338         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5339     }
5340     if ( aExp == 0 ) {
5341         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5342         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5343     }
5344     if ( bExp == 0 ) {
5345         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5346         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5347     }
5348     zExp = aExp + bExp - 0x3FFE;
5349     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5350     if ( 0 < (int64_t) zSig0 ) {
5351         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5352         --zExp;
5353     }
5354     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5355                                 zSign, zExp, zSig0, zSig1, status);
5356 }
5357
5358 /*----------------------------------------------------------------------------
5359 | Returns the result of dividing the extended double-precision floating-point
5360 | value `a' by the corresponding value `b'.  The operation is performed
5361 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5362 *----------------------------------------------------------------------------*/
5363
5364 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5365 {
5366     flag aSign, bSign, zSign;
5367     int32_t aExp, bExp, zExp;
5368     uint64_t aSig, bSig, zSig0, zSig1;
5369     uint64_t rem0, rem1, rem2, term0, term1, term2;
5370     floatx80 z;
5371
5372     aSig = extractFloatx80Frac( a );
5373     aExp = extractFloatx80Exp( a );
5374     aSign = extractFloatx80Sign( a );
5375     bSig = extractFloatx80Frac( b );
5376     bExp = extractFloatx80Exp( b );
5377     bSign = extractFloatx80Sign( b );
5378     zSign = aSign ^ bSign;
5379     if ( aExp == 0x7FFF ) {
5380         if ((uint64_t)(aSig << 1)) {
5381             return propagateFloatx80NaN(a, b, status);
5382         }
5383         if ( bExp == 0x7FFF ) {
5384             if ((uint64_t)(bSig << 1)) {
5385                 return propagateFloatx80NaN(a, b, status);
5386             }
5387             goto invalid;
5388         }
5389         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5390     }
5391     if ( bExp == 0x7FFF ) {
5392         if ((uint64_t)(bSig << 1)) {
5393             return propagateFloatx80NaN(a, b, status);
5394         }
5395         return packFloatx80( zSign, 0, 0 );
5396     }
5397     if ( bExp == 0 ) {
5398         if ( bSig == 0 ) {
5399             if ( ( aExp | aSig ) == 0 ) {
5400  invalid:
5401                 float_raise(float_flag_invalid, status);
5402                 z.low = floatx80_default_nan_low;
5403                 z.high = floatx80_default_nan_high;
5404                 return z;
5405             }
5406             float_raise(float_flag_divbyzero, status);
5407             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5408         }
5409         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5410     }
5411     if ( aExp == 0 ) {
5412         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5413         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5414     }
5415     zExp = aExp - bExp + 0x3FFE;
5416     rem1 = 0;
5417     if ( bSig <= aSig ) {
5418         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5419         ++zExp;
5420     }
5421     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5422     mul64To128( bSig, zSig0, &term0, &term1 );
5423     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5424     while ( (int64_t) rem0 < 0 ) {
5425         --zSig0;
5426         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5427     }
5428     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5429     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5430         mul64To128( bSig, zSig1, &term1, &term2 );
5431         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5432         while ( (int64_t) rem1 < 0 ) {
5433             --zSig1;
5434             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5435         }
5436         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5437     }
5438     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5439                                 zSign, zExp, zSig0, zSig1, status);
5440 }
5441
5442 /*----------------------------------------------------------------------------
5443 | Returns the remainder of the extended double-precision floating-point value
5444 | `a' with respect to the corresponding value `b'.  The operation is performed
5445 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5446 *----------------------------------------------------------------------------*/
5447
5448 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5449 {
5450     flag aSign, zSign;
5451     int32_t aExp, bExp, expDiff;
5452     uint64_t aSig0, aSig1, bSig;
5453     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5454     floatx80 z;
5455
5456     aSig0 = extractFloatx80Frac( a );
5457     aExp = extractFloatx80Exp( a );
5458     aSign = extractFloatx80Sign( a );
5459     bSig = extractFloatx80Frac( b );
5460     bExp = extractFloatx80Exp( b );
5461     if ( aExp == 0x7FFF ) {
5462         if (    (uint64_t) ( aSig0<<1 )
5463              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5464             return propagateFloatx80NaN(a, b, status);
5465         }
5466         goto invalid;
5467     }
5468     if ( bExp == 0x7FFF ) {
5469         if ((uint64_t)(bSig << 1)) {
5470             return propagateFloatx80NaN(a, b, status);
5471         }
5472         return a;
5473     }
5474     if ( bExp == 0 ) {
5475         if ( bSig == 0 ) {
5476  invalid:
5477             float_raise(float_flag_invalid, status);
5478             z.low = floatx80_default_nan_low;
5479             z.high = floatx80_default_nan_high;
5480             return z;
5481         }
5482         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5483     }
5484     if ( aExp == 0 ) {
5485         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5486         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5487     }
5488     bSig |= LIT64( 0x8000000000000000 );
5489     zSign = aSign;
5490     expDiff = aExp - bExp;
5491     aSig1 = 0;
5492     if ( expDiff < 0 ) {
5493         if ( expDiff < -1 ) return a;
5494         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5495         expDiff = 0;
5496     }
5497     q = ( bSig <= aSig0 );
5498     if ( q ) aSig0 -= bSig;
5499     expDiff -= 64;
5500     while ( 0 < expDiff ) {
5501         q = estimateDiv128To64( aSig0, aSig1, bSig );
5502         q = ( 2 < q ) ? q - 2 : 0;
5503         mul64To128( bSig, q, &term0, &term1 );
5504         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5505         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5506         expDiff -= 62;
5507     }
5508     expDiff += 64;
5509     if ( 0 < expDiff ) {
5510         q = estimateDiv128To64( aSig0, aSig1, bSig );
5511         q = ( 2 < q ) ? q - 2 : 0;
5512         q >>= 64 - expDiff;
5513         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5514         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5515         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5516         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5517             ++q;
5518             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5519         }
5520     }
5521     else {
5522         term1 = 0;
5523         term0 = bSig;
5524     }
5525     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5526     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5527          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5528               && ( q & 1 ) )
5529        ) {
5530         aSig0 = alternateASig0;
5531         aSig1 = alternateASig1;
5532         zSign = ! zSign;
5533     }
5534     return
5535         normalizeRoundAndPackFloatx80(
5536             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5537
5538 }
5539
5540 /*----------------------------------------------------------------------------
5541 | Returns the square root of the extended double-precision floating-point
5542 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5543 | for Binary Floating-Point Arithmetic.
5544 *----------------------------------------------------------------------------*/
5545
5546 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5547 {
5548     flag aSign;
5549     int32_t aExp, zExp;
5550     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5551     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5552     floatx80 z;
5553
5554     aSig0 = extractFloatx80Frac( a );
5555     aExp = extractFloatx80Exp( a );
5556     aSign = extractFloatx80Sign( a );
5557     if ( aExp == 0x7FFF ) {
5558         if ((uint64_t)(aSig0 << 1)) {
5559             return propagateFloatx80NaN(a, a, status);
5560         }
5561         if ( ! aSign ) return a;
5562         goto invalid;
5563     }
5564     if ( aSign ) {
5565         if ( ( aExp | aSig0 ) == 0 ) return a;
5566  invalid:
5567         float_raise(float_flag_invalid, status);
5568         z.low = floatx80_default_nan_low;
5569         z.high = floatx80_default_nan_high;
5570         return z;
5571     }
5572     if ( aExp == 0 ) {
5573         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5574         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5575     }
5576     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5577     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5578     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5579     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5580     doubleZSig0 = zSig0<<1;
5581     mul64To128( zSig0, zSig0, &term0, &term1 );
5582     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5583     while ( (int64_t) rem0 < 0 ) {
5584         --zSig0;
5585         doubleZSig0 -= 2;
5586         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5587     }
5588     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5589     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5590         if ( zSig1 == 0 ) zSig1 = 1;
5591         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5592         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5593         mul64To128( zSig1, zSig1, &term2, &term3 );
5594         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5595         while ( (int64_t) rem1 < 0 ) {
5596             --zSig1;
5597             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5598             term3 |= 1;
5599             term2 |= doubleZSig0;
5600             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5601         }
5602         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5603     }
5604     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5605     zSig0 |= doubleZSig0;
5606     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5607                                 0, zExp, zSig0, zSig1, status);
5608 }
5609
5610 /*----------------------------------------------------------------------------
5611 | Returns 1 if the extended double-precision floating-point value `a' is equal
5612 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5613 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5614 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5615 *----------------------------------------------------------------------------*/
5616
5617 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5618 {
5619
5620     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5621               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5622          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5623               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5624        ) {
5625         float_raise(float_flag_invalid, status);
5626         return 0;
5627     }
5628     return
5629            ( a.low == b.low )
5630         && (    ( a.high == b.high )
5631              || (    ( a.low == 0 )
5632                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5633            );
5634
5635 }
5636
5637 /*----------------------------------------------------------------------------
5638 | Returns 1 if the extended double-precision floating-point value `a' is
5639 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5640 | invalid exception is raised if either operand is a NaN.  The comparison is
5641 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5642 | Arithmetic.
5643 *----------------------------------------------------------------------------*/
5644
5645 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5646 {
5647     flag aSign, bSign;
5648
5649     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5650               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5651          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5652               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5653        ) {
5654         float_raise(float_flag_invalid, status);
5655         return 0;
5656     }
5657     aSign = extractFloatx80Sign( a );
5658     bSign = extractFloatx80Sign( b );
5659     if ( aSign != bSign ) {
5660         return
5661                aSign
5662             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5663                  == 0 );
5664     }
5665     return
5666           aSign ? le128( b.high, b.low, a.high, a.low )
5667         : le128( a.high, a.low, b.high, b.low );
5668
5669 }
5670
5671 /*----------------------------------------------------------------------------
5672 | Returns 1 if the extended double-precision floating-point value `a' is
5673 | less than the corresponding value `b', and 0 otherwise.  The invalid
5674 | exception is raised if either operand is a NaN.  The comparison is performed
5675 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5676 *----------------------------------------------------------------------------*/
5677
5678 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5679 {
5680     flag aSign, bSign;
5681
5682     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5683               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5684          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5685               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5686        ) {
5687         float_raise(float_flag_invalid, status);
5688         return 0;
5689     }
5690     aSign = extractFloatx80Sign( a );
5691     bSign = extractFloatx80Sign( b );
5692     if ( aSign != bSign ) {
5693         return
5694                aSign
5695             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5696                  != 0 );
5697     }
5698     return
5699           aSign ? lt128( b.high, b.low, a.high, a.low )
5700         : lt128( a.high, a.low, b.high, b.low );
5701
5702 }
5703
5704 /*----------------------------------------------------------------------------
5705 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5706 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5707 | either operand is a NaN.   The comparison is performed according to the
5708 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5709 *----------------------------------------------------------------------------*/
5710 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5711 {
5712     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5713               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5714          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5715               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5716        ) {
5717         float_raise(float_flag_invalid, status);
5718         return 1;
5719     }
5720     return 0;
5721 }
5722
5723 /*----------------------------------------------------------------------------
5724 | Returns 1 if the extended double-precision floating-point value `a' is
5725 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5726 | cause an exception.  The comparison is performed according to the IEC/IEEE
5727 | Standard for Binary Floating-Point Arithmetic.
5728 *----------------------------------------------------------------------------*/
5729
5730 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5731 {
5732
5733     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5734               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5735          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5736               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5737        ) {
5738         if (    floatx80_is_signaling_nan( a )
5739              || floatx80_is_signaling_nan( b ) ) {
5740             float_raise(float_flag_invalid, status);
5741         }
5742         return 0;
5743     }
5744     return
5745            ( a.low == b.low )
5746         && (    ( a.high == b.high )
5747              || (    ( a.low == 0 )
5748                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5749            );
5750
5751 }
5752
5753 /*----------------------------------------------------------------------------
5754 | Returns 1 if the extended double-precision floating-point value `a' is less
5755 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5756 | do not cause an exception.  Otherwise, the comparison is performed according
5757 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5758 *----------------------------------------------------------------------------*/
5759
5760 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5761 {
5762     flag aSign, bSign;
5763
5764     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5765               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5766          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5767               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5768        ) {
5769         if (    floatx80_is_signaling_nan( a )
5770              || floatx80_is_signaling_nan( b ) ) {
5771             float_raise(float_flag_invalid, status);
5772         }
5773         return 0;
5774     }
5775     aSign = extractFloatx80Sign( a );
5776     bSign = extractFloatx80Sign( b );
5777     if ( aSign != bSign ) {
5778         return
5779                aSign
5780             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5781                  == 0 );
5782     }
5783     return
5784           aSign ? le128( b.high, b.low, a.high, a.low )
5785         : le128( a.high, a.low, b.high, b.low );
5786
5787 }
5788
5789 /*----------------------------------------------------------------------------
5790 | Returns 1 if the extended double-precision floating-point value `a' is less
5791 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5792 | an exception.  Otherwise, the comparison is performed according to the
5793 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5794 *----------------------------------------------------------------------------*/
5795
5796 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5797 {
5798     flag aSign, bSign;
5799
5800     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5801               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5802          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5803               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5804        ) {
5805         if (    floatx80_is_signaling_nan( a )
5806              || floatx80_is_signaling_nan( b ) ) {
5807             float_raise(float_flag_invalid, status);
5808         }
5809         return 0;
5810     }
5811     aSign = extractFloatx80Sign( a );
5812     bSign = extractFloatx80Sign( b );
5813     if ( aSign != bSign ) {
5814         return
5815                aSign
5816             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5817                  != 0 );
5818     }
5819     return
5820           aSign ? lt128( b.high, b.low, a.high, a.low )
5821         : lt128( a.high, a.low, b.high, b.low );
5822
5823 }
5824
5825 /*----------------------------------------------------------------------------
5826 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5827 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5828 | The comparison is performed according to the IEC/IEEE Standard for Binary
5829 | Floating-Point Arithmetic.
5830 *----------------------------------------------------------------------------*/
5831 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5832 {
5833     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5834               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5835          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5836               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5837        ) {
5838         if (    floatx80_is_signaling_nan( a )
5839              || floatx80_is_signaling_nan( b ) ) {
5840             float_raise(float_flag_invalid, status);
5841         }
5842         return 1;
5843     }
5844     return 0;
5845 }
5846
5847 /*----------------------------------------------------------------------------
5848 | Returns the result of converting the quadruple-precision floating-point
5849 | value `a' to the 32-bit two's complement integer format.  The conversion
5850 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5851 | Arithmetic---which means in particular that the conversion is rounded
5852 | according to the current rounding mode.  If `a' is a NaN, the largest
5853 | positive integer is returned.  Otherwise, if the conversion overflows, the
5854 | largest integer with the same sign as `a' is returned.
5855 *----------------------------------------------------------------------------*/
5856
5857 int32_t float128_to_int32(float128 a, float_status *status)
5858 {
5859     flag aSign;
5860     int32_t aExp, shiftCount;
5861     uint64_t aSig0, aSig1;
5862
5863     aSig1 = extractFloat128Frac1( a );
5864     aSig0 = extractFloat128Frac0( a );
5865     aExp = extractFloat128Exp( a );
5866     aSign = extractFloat128Sign( a );
5867     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5868     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5869     aSig0 |= ( aSig1 != 0 );
5870     shiftCount = 0x4028 - aExp;
5871     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5872     return roundAndPackInt32(aSign, aSig0, status);
5873
5874 }
5875
5876 /*----------------------------------------------------------------------------
5877 | Returns the result of converting the quadruple-precision floating-point
5878 | value `a' to the 32-bit two's complement integer format.  The conversion
5879 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5880 | Arithmetic, except that the conversion is always rounded toward zero.  If
5881 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5882 | conversion overflows, the largest integer with the same sign as `a' is
5883 | returned.
5884 *----------------------------------------------------------------------------*/
5885
5886 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5887 {
5888     flag aSign;
5889     int32_t aExp, shiftCount;
5890     uint64_t aSig0, aSig1, savedASig;
5891     int32_t z;
5892
5893     aSig1 = extractFloat128Frac1( a );
5894     aSig0 = extractFloat128Frac0( a );
5895     aExp = extractFloat128Exp( a );
5896     aSign = extractFloat128Sign( a );
5897     aSig0 |= ( aSig1 != 0 );
5898     if ( 0x401E < aExp ) {
5899         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5900         goto invalid;
5901     }
5902     else if ( aExp < 0x3FFF ) {
5903         if (aExp || aSig0) {
5904             status->float_exception_flags |= float_flag_inexact;
5905         }
5906         return 0;
5907     }
5908     aSig0 |= LIT64( 0x0001000000000000 );
5909     shiftCount = 0x402F - aExp;
5910     savedASig = aSig0;
5911     aSig0 >>= shiftCount;
5912     z = aSig0;
5913     if ( aSign ) z = - z;
5914     if ( ( z < 0 ) ^ aSign ) {
5915  invalid:
5916         float_raise(float_flag_invalid, status);
5917         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5918     }
5919     if ( ( aSig0<<shiftCount ) != savedASig ) {
5920         status->float_exception_flags |= float_flag_inexact;
5921     }
5922     return z;
5923
5924 }
5925
5926 /*----------------------------------------------------------------------------
5927 | Returns the result of converting the quadruple-precision floating-point
5928 | value `a' to the 64-bit two's complement integer format.  The conversion
5929 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5930 | Arithmetic---which means in particular that the conversion is rounded
5931 | according to the current rounding mode.  If `a' is a NaN, the largest
5932 | positive integer is returned.  Otherwise, if the conversion overflows, the
5933 | largest integer with the same sign as `a' is returned.
5934 *----------------------------------------------------------------------------*/
5935
5936 int64_t float128_to_int64(float128 a, float_status *status)
5937 {
5938     flag aSign;
5939     int32_t aExp, shiftCount;
5940     uint64_t aSig0, aSig1;
5941
5942     aSig1 = extractFloat128Frac1( a );
5943     aSig0 = extractFloat128Frac0( a );
5944     aExp = extractFloat128Exp( a );
5945     aSign = extractFloat128Sign( a );
5946     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5947     shiftCount = 0x402F - aExp;
5948     if ( shiftCount <= 0 ) {
5949         if ( 0x403E < aExp ) {
5950             float_raise(float_flag_invalid, status);
5951             if (    ! aSign
5952                  || (    ( aExp == 0x7FFF )
5953                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5954                     )
5955                ) {
5956                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5957             }
5958             return (int64_t) LIT64( 0x8000000000000000 );
5959         }
5960         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5961     }
5962     else {
5963         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5964     }
5965     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5966
5967 }
5968
5969 /*----------------------------------------------------------------------------
5970 | Returns the result of converting the quadruple-precision floating-point
5971 | value `a' to the 64-bit two's complement integer format.  The conversion
5972 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5973 | Arithmetic, except that the conversion is always rounded toward zero.
5974 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5975 | the conversion overflows, the largest integer with the same sign as `a' is
5976 | returned.
5977 *----------------------------------------------------------------------------*/
5978
5979 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5980 {
5981     flag aSign;
5982     int32_t aExp, shiftCount;
5983     uint64_t aSig0, aSig1;
5984     int64_t z;
5985
5986     aSig1 = extractFloat128Frac1( a );
5987     aSig0 = extractFloat128Frac0( a );
5988     aExp = extractFloat128Exp( a );
5989     aSign = extractFloat128Sign( a );
5990     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5991     shiftCount = aExp - 0x402F;
5992     if ( 0 < shiftCount ) {
5993         if ( 0x403E <= aExp ) {
5994             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5995             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5996                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5997                 if (aSig1) {
5998                     status->float_exception_flags |= float_flag_inexact;
5999                 }
6000             }
6001             else {
6002                 float_raise(float_flag_invalid, status);
6003                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6004                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6005                 }
6006             }
6007             return (int64_t) LIT64( 0x8000000000000000 );
6008         }
6009         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6010         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6011             status->float_exception_flags |= float_flag_inexact;
6012         }
6013     }
6014     else {
6015         if ( aExp < 0x3FFF ) {
6016             if ( aExp | aSig0 | aSig1 ) {
6017                 status->float_exception_flags |= float_flag_inexact;
6018             }
6019             return 0;
6020         }
6021         z = aSig0>>( - shiftCount );
6022         if (    aSig1
6023              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6024             status->float_exception_flags |= float_flag_inexact;
6025         }
6026     }
6027     if ( aSign ) z = - z;
6028     return z;
6029
6030 }
6031
6032 /*----------------------------------------------------------------------------
6033 | Returns the result of converting the quadruple-precision floating-point
6034 | value `a' to the single-precision floating-point format.  The conversion
6035 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6036 | Arithmetic.
6037 *----------------------------------------------------------------------------*/
6038
6039 float32 float128_to_float32(float128 a, float_status *status)
6040 {
6041     flag aSign;
6042     int32_t aExp;
6043     uint64_t aSig0, aSig1;
6044     uint32_t zSig;
6045
6046     aSig1 = extractFloat128Frac1( a );
6047     aSig0 = extractFloat128Frac0( a );
6048     aExp = extractFloat128Exp( a );
6049     aSign = extractFloat128Sign( a );
6050     if ( aExp == 0x7FFF ) {
6051         if ( aSig0 | aSig1 ) {
6052             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6053         }
6054         return packFloat32( aSign, 0xFF, 0 );
6055     }
6056     aSig0 |= ( aSig1 != 0 );
6057     shift64RightJamming( aSig0, 18, &aSig0 );
6058     zSig = aSig0;
6059     if ( aExp || zSig ) {
6060         zSig |= 0x40000000;
6061         aExp -= 0x3F81;
6062     }
6063     return roundAndPackFloat32(aSign, aExp, zSig, status);
6064
6065 }
6066
6067 /*----------------------------------------------------------------------------
6068 | Returns the result of converting the quadruple-precision floating-point
6069 | value `a' to the double-precision floating-point format.  The conversion
6070 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6071 | Arithmetic.
6072 *----------------------------------------------------------------------------*/
6073
6074 float64 float128_to_float64(float128 a, float_status *status)
6075 {
6076     flag aSign;
6077     int32_t aExp;
6078     uint64_t aSig0, aSig1;
6079
6080     aSig1 = extractFloat128Frac1( a );
6081     aSig0 = extractFloat128Frac0( a );
6082     aExp = extractFloat128Exp( a );
6083     aSign = extractFloat128Sign( a );
6084     if ( aExp == 0x7FFF ) {
6085         if ( aSig0 | aSig1 ) {
6086             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6087         }
6088         return packFloat64( aSign, 0x7FF, 0 );
6089     }
6090     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6091     aSig0 |= ( aSig1 != 0 );
6092     if ( aExp || aSig0 ) {
6093         aSig0 |= LIT64( 0x4000000000000000 );
6094         aExp -= 0x3C01;
6095     }
6096     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6097
6098 }
6099
6100 /*----------------------------------------------------------------------------
6101 | Returns the result of converting the quadruple-precision floating-point
6102 | value `a' to the extended double-precision floating-point format.  The
6103 | conversion is performed according to the IEC/IEEE Standard for Binary
6104 | Floating-Point Arithmetic.
6105 *----------------------------------------------------------------------------*/
6106
6107 floatx80 float128_to_floatx80(float128 a, float_status *status)
6108 {
6109     flag aSign;
6110     int32_t aExp;
6111     uint64_t aSig0, aSig1;
6112
6113     aSig1 = extractFloat128Frac1( a );
6114     aSig0 = extractFloat128Frac0( a );
6115     aExp = extractFloat128Exp( a );
6116     aSign = extractFloat128Sign( a );
6117     if ( aExp == 0x7FFF ) {
6118         if ( aSig0 | aSig1 ) {
6119             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6120         }
6121         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6122     }
6123     if ( aExp == 0 ) {
6124         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6125         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6126     }
6127     else {
6128         aSig0 |= LIT64( 0x0001000000000000 );
6129     }
6130     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6131     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6132
6133 }
6134
6135 /*----------------------------------------------------------------------------
6136 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6137 | returns the result as a quadruple-precision floating-point value.  The
6138 | operation is performed according to the IEC/IEEE Standard for Binary
6139 | Floating-Point Arithmetic.
6140 *----------------------------------------------------------------------------*/
6141
6142 float128 float128_round_to_int(float128 a, float_status *status)
6143 {
6144     flag aSign;
6145     int32_t aExp;
6146     uint64_t lastBitMask, roundBitsMask;
6147     float128 z;
6148
6149     aExp = extractFloat128Exp( a );
6150     if ( 0x402F <= aExp ) {
6151         if ( 0x406F <= aExp ) {
6152             if (    ( aExp == 0x7FFF )
6153                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6154                ) {
6155                 return propagateFloat128NaN(a, a, status);
6156             }
6157             return a;
6158         }
6159         lastBitMask = 1;
6160         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6161         roundBitsMask = lastBitMask - 1;
6162         z = a;
6163         switch (status->float_rounding_mode) {
6164         case float_round_nearest_even:
6165             if ( lastBitMask ) {
6166                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6167                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6168             }
6169             else {
6170                 if ( (int64_t) z.low < 0 ) {
6171                     ++z.high;
6172                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6173                 }
6174             }
6175             break;
6176         case float_round_ties_away:
6177             if (lastBitMask) {
6178                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6179             } else {
6180                 if ((int64_t) z.low < 0) {
6181                     ++z.high;
6182                 }
6183             }
6184             break;
6185         case float_round_to_zero:
6186             break;
6187         case float_round_up:
6188             if (!extractFloat128Sign(z)) {
6189                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6190             }
6191             break;
6192         case float_round_down:
6193             if (extractFloat128Sign(z)) {
6194                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6195             }
6196             break;
6197         default:
6198             abort();
6199         }
6200         z.low &= ~ roundBitsMask;
6201     }
6202     else {
6203         if ( aExp < 0x3FFF ) {
6204             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6205             status->float_exception_flags |= float_flag_inexact;
6206             aSign = extractFloat128Sign( a );
6207             switch (status->float_rounding_mode) {
6208              case float_round_nearest_even:
6209                 if (    ( aExp == 0x3FFE )
6210                      && (   extractFloat128Frac0( a )
6211                           | extractFloat128Frac1( a ) )
6212                    ) {
6213                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6214                 }
6215                 break;
6216             case float_round_ties_away:
6217                 if (aExp == 0x3FFE) {
6218                     return packFloat128(aSign, 0x3FFF, 0, 0);
6219                 }
6220                 break;
6221              case float_round_down:
6222                 return
6223                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6224                     : packFloat128( 0, 0, 0, 0 );
6225              case float_round_up:
6226                 return
6227                       aSign ? packFloat128( 1, 0, 0, 0 )
6228                     : packFloat128( 0, 0x3FFF, 0, 0 );
6229             }
6230             return packFloat128( aSign, 0, 0, 0 );
6231         }
6232         lastBitMask = 1;
6233         lastBitMask <<= 0x402F - aExp;
6234         roundBitsMask = lastBitMask - 1;
6235         z.low = 0;
6236         z.high = a.high;
6237         switch (status->float_rounding_mode) {
6238         case float_round_nearest_even:
6239             z.high += lastBitMask>>1;
6240             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6241                 z.high &= ~ lastBitMask;
6242             }
6243             break;
6244         case float_round_ties_away:
6245             z.high += lastBitMask>>1;
6246             break;
6247         case float_round_to_zero:
6248             break;
6249         case float_round_up:
6250             if (!extractFloat128Sign(z)) {
6251                 z.high |= ( a.low != 0 );
6252                 z.high += roundBitsMask;
6253             }
6254             break;
6255         case float_round_down:
6256             if (extractFloat128Sign(z)) {
6257                 z.high |= (a.low != 0);
6258                 z.high += roundBitsMask;
6259             }
6260             break;
6261         default:
6262             abort();
6263         }
6264         z.high &= ~ roundBitsMask;
6265     }
6266     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6267         status->float_exception_flags |= float_flag_inexact;
6268     }
6269     return z;
6270
6271 }
6272
6273 /*----------------------------------------------------------------------------
6274 | Returns the result of adding the absolute values of the quadruple-precision
6275 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6276 | before being returned.  `zSign' is ignored if the result is a NaN.
6277 | The addition is performed according to the IEC/IEEE Standard for Binary
6278 | Floating-Point Arithmetic.
6279 *----------------------------------------------------------------------------*/
6280
6281 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6282                                 float_status *status)
6283 {
6284     int32_t aExp, bExp, zExp;
6285     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6286     int32_t expDiff;
6287
6288     aSig1 = extractFloat128Frac1( a );
6289     aSig0 = extractFloat128Frac0( a );
6290     aExp = extractFloat128Exp( a );
6291     bSig1 = extractFloat128Frac1( b );
6292     bSig0 = extractFloat128Frac0( b );
6293     bExp = extractFloat128Exp( b );
6294     expDiff = aExp - bExp;
6295     if ( 0 < expDiff ) {
6296         if ( aExp == 0x7FFF ) {
6297             if (aSig0 | aSig1) {
6298                 return propagateFloat128NaN(a, b, status);
6299             }
6300             return a;
6301         }
6302         if ( bExp == 0 ) {
6303             --expDiff;
6304         }
6305         else {
6306             bSig0 |= LIT64( 0x0001000000000000 );
6307         }
6308         shift128ExtraRightJamming(
6309             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6310         zExp = aExp;
6311     }
6312     else if ( expDiff < 0 ) {
6313         if ( bExp == 0x7FFF ) {
6314             if (bSig0 | bSig1) {
6315                 return propagateFloat128NaN(a, b, status);
6316             }
6317             return packFloat128( zSign, 0x7FFF, 0, 0 );
6318         }
6319         if ( aExp == 0 ) {
6320             ++expDiff;
6321         }
6322         else {
6323             aSig0 |= LIT64( 0x0001000000000000 );
6324         }
6325         shift128ExtraRightJamming(
6326             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6327         zExp = bExp;
6328     }
6329     else {
6330         if ( aExp == 0x7FFF ) {
6331             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6332                 return propagateFloat128NaN(a, b, status);
6333             }
6334             return a;
6335         }
6336         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6337         if ( aExp == 0 ) {
6338             if (status->flush_to_zero) {
6339                 if (zSig0 | zSig1) {
6340                     float_raise(float_flag_output_denormal, status);
6341                 }
6342                 return packFloat128(zSign, 0, 0, 0);
6343             }
6344             return packFloat128( zSign, 0, zSig0, zSig1 );
6345         }
6346         zSig2 = 0;
6347         zSig0 |= LIT64( 0x0002000000000000 );
6348         zExp = aExp;
6349         goto shiftRight1;
6350     }
6351     aSig0 |= LIT64( 0x0001000000000000 );
6352     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6353     --zExp;
6354     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6355     ++zExp;
6356  shiftRight1:
6357     shift128ExtraRightJamming(
6358         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6359  roundAndPack:
6360     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6361
6362 }
6363
6364 /*----------------------------------------------------------------------------
6365 | Returns the result of subtracting the absolute values of the quadruple-
6366 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6367 | difference is negated before being returned.  `zSign' is ignored if the
6368 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6369 | Standard for Binary Floating-Point Arithmetic.
6370 *----------------------------------------------------------------------------*/
6371
6372 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6373                                 float_status *status)
6374 {
6375     int32_t aExp, bExp, zExp;
6376     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6377     int32_t expDiff;
6378     float128 z;
6379
6380     aSig1 = extractFloat128Frac1( a );
6381     aSig0 = extractFloat128Frac0( a );
6382     aExp = extractFloat128Exp( a );
6383     bSig1 = extractFloat128Frac1( b );
6384     bSig0 = extractFloat128Frac0( b );
6385     bExp = extractFloat128Exp( b );
6386     expDiff = aExp - bExp;
6387     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6388     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6389     if ( 0 < expDiff ) goto aExpBigger;
6390     if ( expDiff < 0 ) goto bExpBigger;
6391     if ( aExp == 0x7FFF ) {
6392         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6393             return propagateFloat128NaN(a, b, status);
6394         }
6395         float_raise(float_flag_invalid, status);
6396         z.low = float128_default_nan_low;
6397         z.high = float128_default_nan_high;
6398         return z;
6399     }
6400     if ( aExp == 0 ) {
6401         aExp = 1;
6402         bExp = 1;
6403     }
6404     if ( bSig0 < aSig0 ) goto aBigger;
6405     if ( aSig0 < bSig0 ) goto bBigger;
6406     if ( bSig1 < aSig1 ) goto aBigger;
6407     if ( aSig1 < bSig1 ) goto bBigger;
6408     return packFloat128(status->float_rounding_mode == float_round_down,
6409                         0, 0, 0);
6410  bExpBigger:
6411     if ( bExp == 0x7FFF ) {
6412         if (bSig0 | bSig1) {
6413             return propagateFloat128NaN(a, b, status);
6414         }
6415         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6416     }
6417     if ( aExp == 0 ) {
6418         ++expDiff;
6419     }
6420     else {
6421         aSig0 |= LIT64( 0x4000000000000000 );
6422     }
6423     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6424     bSig0 |= LIT64( 0x4000000000000000 );
6425  bBigger:
6426     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6427     zExp = bExp;
6428     zSign ^= 1;
6429     goto normalizeRoundAndPack;
6430  aExpBigger:
6431     if ( aExp == 0x7FFF ) {
6432         if (aSig0 | aSig1) {
6433             return propagateFloat128NaN(a, b, status);
6434         }
6435         return a;
6436     }
6437     if ( bExp == 0 ) {
6438         --expDiff;
6439     }
6440     else {
6441         bSig0 |= LIT64( 0x4000000000000000 );
6442     }
6443     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6444     aSig0 |= LIT64( 0x4000000000000000 );
6445  aBigger:
6446     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6447     zExp = aExp;
6448  normalizeRoundAndPack:
6449     --zExp;
6450     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6451                                          status);
6452
6453 }
6454
6455 /*----------------------------------------------------------------------------
6456 | Returns the result of adding the quadruple-precision floating-point values
6457 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6458 | for Binary Floating-Point Arithmetic.
6459 *----------------------------------------------------------------------------*/
6460
6461 float128 float128_add(float128 a, float128 b, float_status *status)
6462 {
6463     flag aSign, bSign;
6464
6465     aSign = extractFloat128Sign( a );
6466     bSign = extractFloat128Sign( b );
6467     if ( aSign == bSign ) {
6468         return addFloat128Sigs(a, b, aSign, status);
6469     }
6470     else {
6471         return subFloat128Sigs(a, b, aSign, status);
6472     }
6473
6474 }
6475
6476 /*----------------------------------------------------------------------------
6477 | Returns the result of subtracting the quadruple-precision floating-point
6478 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6479 | Standard for Binary Floating-Point Arithmetic.
6480 *----------------------------------------------------------------------------*/
6481
6482 float128 float128_sub(float128 a, float128 b, float_status *status)
6483 {
6484     flag aSign, bSign;
6485
6486     aSign = extractFloat128Sign( a );
6487     bSign = extractFloat128Sign( b );
6488     if ( aSign == bSign ) {
6489         return subFloat128Sigs(a, b, aSign, status);
6490     }
6491     else {
6492         return addFloat128Sigs(a, b, aSign, status);
6493     }
6494
6495 }
6496
6497 /*----------------------------------------------------------------------------
6498 | Returns the result of multiplying the quadruple-precision floating-point
6499 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6500 | Standard for Binary Floating-Point Arithmetic.
6501 *----------------------------------------------------------------------------*/
6502
6503 float128 float128_mul(float128 a, float128 b, float_status *status)
6504 {
6505     flag aSign, bSign, zSign;
6506     int32_t aExp, bExp, zExp;
6507     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6508     float128 z;
6509
6510     aSig1 = extractFloat128Frac1( a );
6511     aSig0 = extractFloat128Frac0( a );
6512     aExp = extractFloat128Exp( a );
6513     aSign = extractFloat128Sign( a );
6514     bSig1 = extractFloat128Frac1( b );
6515     bSig0 = extractFloat128Frac0( b );
6516     bExp = extractFloat128Exp( b );
6517     bSign = extractFloat128Sign( b );
6518     zSign = aSign ^ bSign;
6519     if ( aExp == 0x7FFF ) {
6520         if (    ( aSig0 | aSig1 )
6521              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6522             return propagateFloat128NaN(a, b, status);
6523         }
6524         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6525         return packFloat128( zSign, 0x7FFF, 0, 0 );
6526     }
6527     if ( bExp == 0x7FFF ) {
6528         if (bSig0 | bSig1) {
6529             return propagateFloat128NaN(a, b, status);
6530         }
6531         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6532  invalid:
6533             float_raise(float_flag_invalid, status);
6534             z.low = float128_default_nan_low;
6535             z.high = float128_default_nan_high;
6536             return z;
6537         }
6538         return packFloat128( zSign, 0x7FFF, 0, 0 );
6539     }
6540     if ( aExp == 0 ) {
6541         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6542         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6543     }
6544     if ( bExp == 0 ) {
6545         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6546         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6547     }
6548     zExp = aExp + bExp - 0x4000;
6549     aSig0 |= LIT64( 0x0001000000000000 );
6550     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6551     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6552     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6553     zSig2 |= ( zSig3 != 0 );
6554     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6555         shift128ExtraRightJamming(
6556             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6557         ++zExp;
6558     }
6559     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6560
6561 }
6562
6563 /*----------------------------------------------------------------------------
6564 | Returns the result of dividing the quadruple-precision floating-point value
6565 | `a' by the corresponding value `b'.  The operation is performed according to
6566 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6567 *----------------------------------------------------------------------------*/
6568
6569 float128 float128_div(float128 a, float128 b, float_status *status)
6570 {
6571     flag aSign, bSign, zSign;
6572     int32_t aExp, bExp, zExp;
6573     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6574     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6575     float128 z;
6576
6577     aSig1 = extractFloat128Frac1( a );
6578     aSig0 = extractFloat128Frac0( a );
6579     aExp = extractFloat128Exp( a );
6580     aSign = extractFloat128Sign( a );
6581     bSig1 = extractFloat128Frac1( b );
6582     bSig0 = extractFloat128Frac0( b );
6583     bExp = extractFloat128Exp( b );
6584     bSign = extractFloat128Sign( b );
6585     zSign = aSign ^ bSign;
6586     if ( aExp == 0x7FFF ) {
6587         if (aSig0 | aSig1) {
6588             return propagateFloat128NaN(a, b, status);
6589         }
6590         if ( bExp == 0x7FFF ) {
6591             if (bSig0 | bSig1) {
6592                 return propagateFloat128NaN(a, b, status);
6593             }
6594             goto invalid;
6595         }
6596         return packFloat128( zSign, 0x7FFF, 0, 0 );
6597     }
6598     if ( bExp == 0x7FFF ) {
6599         if (bSig0 | bSig1) {
6600             return propagateFloat128NaN(a, b, status);
6601         }
6602         return packFloat128( zSign, 0, 0, 0 );
6603     }
6604     if ( bExp == 0 ) {
6605         if ( ( bSig0 | bSig1 ) == 0 ) {
6606             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6607  invalid:
6608                 float_raise(float_flag_invalid, status);
6609                 z.low = float128_default_nan_low;
6610                 z.high = float128_default_nan_high;
6611                 return z;
6612             }
6613             float_raise(float_flag_divbyzero, status);
6614             return packFloat128( zSign, 0x7FFF, 0, 0 );
6615         }
6616         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6617     }
6618     if ( aExp == 0 ) {
6619         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6620         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6621     }
6622     zExp = aExp - bExp + 0x3FFD;
6623     shortShift128Left(
6624         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6625     shortShift128Left(
6626         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6627     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6628         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6629         ++zExp;
6630     }
6631     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6632     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6633     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6634     while ( (int64_t) rem0 < 0 ) {
6635         --zSig0;
6636         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6637     }
6638     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6639     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6640         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6641         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6642         while ( (int64_t) rem1 < 0 ) {
6643             --zSig1;
6644             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6645         }
6646         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6647     }
6648     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6649     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6650
6651 }
6652
6653 /*----------------------------------------------------------------------------
6654 | Returns the remainder of the quadruple-precision floating-point value `a'
6655 | with respect to the corresponding value `b'.  The operation is performed
6656 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6657 *----------------------------------------------------------------------------*/
6658
6659 float128 float128_rem(float128 a, float128 b, float_status *status)
6660 {
6661     flag aSign, zSign;
6662     int32_t aExp, bExp, expDiff;
6663     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6664     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6665     int64_t sigMean0;
6666     float128 z;
6667
6668     aSig1 = extractFloat128Frac1( a );
6669     aSig0 = extractFloat128Frac0( a );
6670     aExp = extractFloat128Exp( a );
6671     aSign = extractFloat128Sign( a );
6672     bSig1 = extractFloat128Frac1( b );
6673     bSig0 = extractFloat128Frac0( b );
6674     bExp = extractFloat128Exp( b );
6675     if ( aExp == 0x7FFF ) {
6676         if (    ( aSig0 | aSig1 )
6677              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6678             return propagateFloat128NaN(a, b, status);
6679         }
6680         goto invalid;
6681     }
6682     if ( bExp == 0x7FFF ) {
6683         if (bSig0 | bSig1) {
6684             return propagateFloat128NaN(a, b, status);
6685         }
6686         return a;
6687     }
6688     if ( bExp == 0 ) {
6689         if ( ( bSig0 | bSig1 ) == 0 ) {
6690  invalid:
6691             float_raise(float_flag_invalid, status);
6692             z.low = float128_default_nan_low;
6693             z.high = float128_default_nan_high;
6694             return z;
6695         }
6696         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6697     }
6698     if ( aExp == 0 ) {
6699         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6700         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6701     }
6702     expDiff = aExp - bExp;
6703     if ( expDiff < -1 ) return a;
6704     shortShift128Left(
6705         aSig0 | LIT64( 0x0001000000000000 ),
6706         aSig1,
6707         15 - ( expDiff < 0 ),
6708         &aSig0,
6709         &aSig1
6710     );
6711     shortShift128Left(
6712         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6713     q = le128( bSig0, bSig1, aSig0, aSig1 );
6714     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6715     expDiff -= 64;
6716     while ( 0 < expDiff ) {
6717         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6718         q = ( 4 < q ) ? q - 4 : 0;
6719         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6720         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6721         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6722         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6723         expDiff -= 61;
6724     }
6725     if ( -64 < expDiff ) {
6726         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6727         q = ( 4 < q ) ? q - 4 : 0;
6728         q >>= - expDiff;
6729         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6730         expDiff += 52;
6731         if ( expDiff < 0 ) {
6732             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6733         }
6734         else {
6735             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6736         }
6737         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6738         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6739     }
6740     else {
6741         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6742         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6743     }
6744     do {
6745         alternateASig0 = aSig0;
6746         alternateASig1 = aSig1;
6747         ++q;
6748         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6749     } while ( 0 <= (int64_t) aSig0 );
6750     add128(
6751         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6752     if (    ( sigMean0 < 0 )
6753          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6754         aSig0 = alternateASig0;
6755         aSig1 = alternateASig1;
6756     }
6757     zSign = ( (int64_t) aSig0 < 0 );
6758     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6759     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6760                                          status);
6761 }
6762
6763 /*----------------------------------------------------------------------------
6764 | Returns the square root of the quadruple-precision floating-point value `a'.
6765 | The operation is performed according to the IEC/IEEE Standard for Binary
6766 | Floating-Point Arithmetic.
6767 *----------------------------------------------------------------------------*/
6768
6769 float128 float128_sqrt(float128 a, float_status *status)
6770 {
6771     flag aSign;
6772     int32_t aExp, zExp;
6773     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6774     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6775     float128 z;
6776
6777     aSig1 = extractFloat128Frac1( a );
6778     aSig0 = extractFloat128Frac0( a );
6779     aExp = extractFloat128Exp( a );
6780     aSign = extractFloat128Sign( a );
6781     if ( aExp == 0x7FFF ) {
6782         if (aSig0 | aSig1) {
6783             return propagateFloat128NaN(a, a, status);
6784         }
6785         if ( ! aSign ) return a;
6786         goto invalid;
6787     }
6788     if ( aSign ) {
6789         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6790  invalid:
6791         float_raise(float_flag_invalid, status);
6792         z.low = float128_default_nan_low;
6793         z.high = float128_default_nan_high;
6794         return z;
6795     }
6796     if ( aExp == 0 ) {
6797         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6798         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6799     }
6800     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6801     aSig0 |= LIT64( 0x0001000000000000 );
6802     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6803     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6804     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6805     doubleZSig0 = zSig0<<1;
6806     mul64To128( zSig0, zSig0, &term0, &term1 );
6807     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6808     while ( (int64_t) rem0 < 0 ) {
6809         --zSig0;
6810         doubleZSig0 -= 2;
6811         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6812     }
6813     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6814     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6815         if ( zSig1 == 0 ) zSig1 = 1;
6816         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6817         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6818         mul64To128( zSig1, zSig1, &term2, &term3 );
6819         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6820         while ( (int64_t) rem1 < 0 ) {
6821             --zSig1;
6822             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6823             term3 |= 1;
6824             term2 |= doubleZSig0;
6825             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6826         }
6827         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6828     }
6829     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6830     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6831
6832 }
6833
6834 /*----------------------------------------------------------------------------
6835 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6836 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6837 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6838 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6839 *----------------------------------------------------------------------------*/
6840
6841 int float128_eq(float128 a, float128 b, float_status *status)
6842 {
6843
6844     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6845               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6846          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6847               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6848        ) {
6849         float_raise(float_flag_invalid, status);
6850         return 0;
6851     }
6852     return
6853            ( a.low == b.low )
6854         && (    ( a.high == b.high )
6855              || (    ( a.low == 0 )
6856                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6857            );
6858
6859 }
6860
6861 /*----------------------------------------------------------------------------
6862 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6863 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6864 | exception is raised if either operand is a NaN.  The comparison is performed
6865 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6866 *----------------------------------------------------------------------------*/
6867
6868 int float128_le(float128 a, float128 b, float_status *status)
6869 {
6870     flag aSign, bSign;
6871
6872     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6873               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6874          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6875               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6876        ) {
6877         float_raise(float_flag_invalid, status);
6878         return 0;
6879     }
6880     aSign = extractFloat128Sign( a );
6881     bSign = extractFloat128Sign( b );
6882     if ( aSign != bSign ) {
6883         return
6884                aSign
6885             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6886                  == 0 );
6887     }
6888     return
6889           aSign ? le128( b.high, b.low, a.high, a.low )
6890         : le128( a.high, a.low, b.high, b.low );
6891
6892 }
6893
6894 /*----------------------------------------------------------------------------
6895 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6896 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6897 | raised if either operand is a NaN.  The comparison is performed according
6898 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6899 *----------------------------------------------------------------------------*/
6900
6901 int float128_lt(float128 a, float128 b, float_status *status)
6902 {
6903     flag aSign, bSign;
6904
6905     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6906               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6907          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6908               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6909        ) {
6910         float_raise(float_flag_invalid, status);
6911         return 0;
6912     }
6913     aSign = extractFloat128Sign( a );
6914     bSign = extractFloat128Sign( b );
6915     if ( aSign != bSign ) {
6916         return
6917                aSign
6918             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6919                  != 0 );
6920     }
6921     return
6922           aSign ? lt128( b.high, b.low, a.high, a.low )
6923         : lt128( a.high, a.low, b.high, b.low );
6924
6925 }
6926
6927 /*----------------------------------------------------------------------------
6928 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6929 | be compared, and 0 otherwise.  The invalid exception is raised if either
6930 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6931 | Standard for Binary Floating-Point Arithmetic.
6932 *----------------------------------------------------------------------------*/
6933
6934 int float128_unordered(float128 a, float128 b, float_status *status)
6935 {
6936     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6937               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6938          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6939               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6940        ) {
6941         float_raise(float_flag_invalid, status);
6942         return 1;
6943     }
6944     return 0;
6945 }
6946
6947 /*----------------------------------------------------------------------------
6948 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6949 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6950 | exception.  The comparison is performed according to the IEC/IEEE Standard
6951 | for Binary Floating-Point Arithmetic.
6952 *----------------------------------------------------------------------------*/
6953
6954 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6955 {
6956
6957     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6958               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6959          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6960               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6961        ) {
6962         if (    float128_is_signaling_nan( a )
6963              || float128_is_signaling_nan( b ) ) {
6964             float_raise(float_flag_invalid, status);
6965         }
6966         return 0;
6967     }
6968     return
6969            ( a.low == b.low )
6970         && (    ( a.high == b.high )
6971              || (    ( a.low == 0 )
6972                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6973            );
6974
6975 }
6976
6977 /*----------------------------------------------------------------------------
6978 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6979 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6980 | cause an exception.  Otherwise, the comparison is performed according to the
6981 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6982 *----------------------------------------------------------------------------*/
6983
6984 int float128_le_quiet(float128 a, float128 b, float_status *status)
6985 {
6986     flag aSign, bSign;
6987
6988     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6989               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6990          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6991               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6992        ) {
6993         if (    float128_is_signaling_nan( a )
6994              || float128_is_signaling_nan( b ) ) {
6995             float_raise(float_flag_invalid, status);
6996         }
6997         return 0;
6998     }
6999     aSign = extractFloat128Sign( a );
7000     bSign = extractFloat128Sign( b );
7001     if ( aSign != bSign ) {
7002         return
7003                aSign
7004             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7005                  == 0 );
7006     }
7007     return
7008           aSign ? le128( b.high, b.low, a.high, a.low )
7009         : le128( a.high, a.low, b.high, b.low );
7010
7011 }
7012
7013 /*----------------------------------------------------------------------------
7014 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7015 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7016 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7017 | Standard for Binary Floating-Point Arithmetic.
7018 *----------------------------------------------------------------------------*/
7019
7020 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7021 {
7022     flag aSign, bSign;
7023
7024     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7025               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7026          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7027               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7028        ) {
7029         if (    float128_is_signaling_nan( a )
7030              || float128_is_signaling_nan( b ) ) {
7031             float_raise(float_flag_invalid, status);
7032         }
7033         return 0;
7034     }
7035     aSign = extractFloat128Sign( a );
7036     bSign = extractFloat128Sign( b );
7037     if ( aSign != bSign ) {
7038         return
7039                aSign
7040             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7041                  != 0 );
7042     }
7043     return
7044           aSign ? lt128( b.high, b.low, a.high, a.low )
7045         : lt128( a.high, a.low, b.high, b.low );
7046
7047 }
7048
7049 /*----------------------------------------------------------------------------
7050 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7051 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7052 | comparison is performed according to the IEC/IEEE Standard for Binary
7053 | Floating-Point Arithmetic.
7054 *----------------------------------------------------------------------------*/
7055
7056 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7057 {
7058     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7059               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7060          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7061               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7062        ) {
7063         if (    float128_is_signaling_nan( a )
7064              || float128_is_signaling_nan( b ) ) {
7065             float_raise(float_flag_invalid, status);
7066         }
7067         return 1;
7068     }
7069     return 0;
7070 }
7071
7072 /* misc functions */
7073 float32 uint32_to_float32(uint32_t a, float_status *status)
7074 {
7075     return int64_to_float32(a, status);
7076 }
7077
7078 float64 uint32_to_float64(uint32_t a, float_status *status)
7079 {
7080     return int64_to_float64(a, status);
7081 }
7082
7083 uint32_t float32_to_uint32(float32 a, float_status *status)
7084 {
7085     int64_t v;
7086     uint32_t res;
7087     int old_exc_flags = get_float_exception_flags(status);
7088
7089     v = float32_to_int64(a, status);
7090     if (v < 0) {
7091         res = 0;
7092     } else if (v > 0xffffffff) {
7093         res = 0xffffffff;
7094     } else {
7095         return v;
7096     }
7097     set_float_exception_flags(old_exc_flags, status);
7098     float_raise(float_flag_invalid, status);
7099     return res;
7100 }
7101
7102 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7103 {
7104     int64_t v;
7105     uint32_t res;
7106     int old_exc_flags = get_float_exception_flags(status);
7107
7108     v = float32_to_int64_round_to_zero(a, status);
7109     if (v < 0) {
7110         res = 0;
7111     } else if (v > 0xffffffff) {
7112         res = 0xffffffff;
7113     } else {
7114         return v;
7115     }
7116     set_float_exception_flags(old_exc_flags, status);
7117     float_raise(float_flag_invalid, status);
7118     return res;
7119 }
7120
7121 int_fast16_t float32_to_int16(float32 a, float_status *status)
7122 {
7123     int32_t v;
7124     int_fast16_t res;
7125     int old_exc_flags = get_float_exception_flags(status);
7126
7127     v = float32_to_int32(a, status);
7128     if (v < -0x8000) {
7129         res = -0x8000;
7130     } else if (v > 0x7fff) {
7131         res = 0x7fff;
7132     } else {
7133         return v;
7134     }
7135
7136     set_float_exception_flags(old_exc_flags, status);
7137     float_raise(float_flag_invalid, status);
7138     return res;
7139 }
7140
7141 uint_fast16_t float32_to_uint16(float32 a, float_status *status)
7142 {
7143     int32_t v;
7144     uint_fast16_t res;
7145     int old_exc_flags = get_float_exception_flags(status);
7146
7147     v = float32_to_int32(a, status);
7148     if (v < 0) {
7149         res = 0;
7150     } else if (v > 0xffff) {
7151         res = 0xffff;
7152     } else {
7153         return v;
7154     }
7155
7156     set_float_exception_flags(old_exc_flags, status);
7157     float_raise(float_flag_invalid, status);
7158     return res;
7159 }
7160
7161 uint_fast16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7162 {
7163     int64_t v;
7164     uint_fast16_t res;
7165     int old_exc_flags = get_float_exception_flags(status);
7166
7167     v = float32_to_int64_round_to_zero(a, status);
7168     if (v < 0) {
7169         res = 0;
7170     } else if (v > 0xffff) {
7171         res = 0xffff;
7172     } else {
7173         return v;
7174     }
7175     set_float_exception_flags(old_exc_flags, status);
7176     float_raise(float_flag_invalid, status);
7177     return res;
7178 }
7179
7180 uint32_t float64_to_uint32(float64 a, float_status *status)
7181 {
7182     uint64_t v;
7183     uint32_t res;
7184     int old_exc_flags = get_float_exception_flags(status);
7185
7186     v = float64_to_uint64(a, status);
7187     if (v > 0xffffffff) {
7188         res = 0xffffffff;
7189     } else {
7190         return v;
7191     }
7192     set_float_exception_flags(old_exc_flags, status);
7193     float_raise(float_flag_invalid, status);
7194     return res;
7195 }
7196
7197 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7198 {
7199     uint64_t v;
7200     uint32_t res;
7201     int old_exc_flags = get_float_exception_flags(status);
7202
7203     v = float64_to_uint64_round_to_zero(a, status);
7204     if (v > 0xffffffff) {
7205         res = 0xffffffff;
7206     } else {
7207         return v;
7208     }
7209     set_float_exception_flags(old_exc_flags, status);
7210     float_raise(float_flag_invalid, status);
7211     return res;
7212 }
7213
7214 int_fast16_t float64_to_int16(float64 a, float_status *status)
7215 {
7216     int64_t v;
7217     int_fast16_t res;
7218     int old_exc_flags = get_float_exception_flags(status);
7219
7220     v = float64_to_int32(a, status);
7221     if (v < -0x8000) {
7222         res = -0x8000;
7223     } else if (v > 0x7fff) {
7224         res = 0x7fff;
7225     } else {
7226         return v;
7227     }
7228
7229     set_float_exception_flags(old_exc_flags, status);
7230     float_raise(float_flag_invalid, status);
7231     return res;
7232 }
7233
7234 uint_fast16_t float64_to_uint16(float64 a, float_status *status)
7235 {
7236     int64_t v;
7237     uint_fast16_t res;
7238     int old_exc_flags = get_float_exception_flags(status);
7239
7240     v = float64_to_int32(a, status);
7241     if (v < 0) {
7242         res = 0;
7243     } else if (v > 0xffff) {
7244         res = 0xffff;
7245     } else {
7246         return v;
7247     }
7248
7249     set_float_exception_flags(old_exc_flags, status);
7250     float_raise(float_flag_invalid, status);
7251     return res;
7252 }
7253
7254 uint_fast16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7255 {
7256     int64_t v;
7257     uint_fast16_t res;
7258     int old_exc_flags = get_float_exception_flags(status);
7259
7260     v = float64_to_int64_round_to_zero(a, status);
7261     if (v < 0) {
7262         res = 0;
7263     } else if (v > 0xffff) {
7264         res = 0xffff;
7265     } else {
7266         return v;
7267     }
7268     set_float_exception_flags(old_exc_flags, status);
7269     float_raise(float_flag_invalid, status);
7270     return res;
7271 }
7272
7273 /*----------------------------------------------------------------------------
7274 | Returns the result of converting the double-precision floating-point value
7275 | `a' to the 64-bit unsigned integer format.  The conversion is
7276 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7277 | Arithmetic---which means in particular that the conversion is rounded
7278 | according to the current rounding mode.  If `a' is a NaN, the largest
7279 | positive integer is returned.  If the conversion overflows, the
7280 | largest unsigned integer is returned.  If 'a' is negative, the value is
7281 | rounded and zero is returned; negative values that do not round to zero
7282 | will raise the inexact exception.
7283 *----------------------------------------------------------------------------*/
7284
7285 uint64_t float64_to_uint64(float64 a, float_status *status)
7286 {
7287     flag aSign;
7288     int_fast16_t aExp, shiftCount;
7289     uint64_t aSig, aSigExtra;
7290     a = float64_squash_input_denormal(a, status);
7291
7292     aSig = extractFloat64Frac(a);
7293     aExp = extractFloat64Exp(a);
7294     aSign = extractFloat64Sign(a);
7295     if (aSign && (aExp > 1022)) {
7296         float_raise(float_flag_invalid, status);
7297         if (float64_is_any_nan(a)) {
7298             return LIT64(0xFFFFFFFFFFFFFFFF);
7299         } else {
7300             return 0;
7301         }
7302     }
7303     if (aExp) {
7304         aSig |= LIT64(0x0010000000000000);
7305     }
7306     shiftCount = 0x433 - aExp;
7307     if (shiftCount <= 0) {
7308         if (0x43E < aExp) {
7309             float_raise(float_flag_invalid, status);
7310             return LIT64(0xFFFFFFFFFFFFFFFF);
7311         }
7312         aSigExtra = 0;
7313         aSig <<= -shiftCount;
7314     } else {
7315         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7316     }
7317     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7318 }
7319
7320 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7321 {
7322     signed char current_rounding_mode = status->float_rounding_mode;
7323     set_float_rounding_mode(float_round_to_zero, status);
7324     int64_t v = float64_to_uint64(a, status);
7325     set_float_rounding_mode(current_rounding_mode, status);
7326     return v;
7327 }
7328
7329 #define COMPARE(s, nan_exp)                                                  \
7330 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7331                                       int is_quiet, float_status *status)    \
7332 {                                                                            \
7333     flag aSign, bSign;                                                       \
7334     uint ## s ## _t av, bv;                                                  \
7335     a = float ## s ## _squash_input_denormal(a, status);                     \
7336     b = float ## s ## _squash_input_denormal(b, status);                     \
7337                                                                              \
7338     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7339          extractFloat ## s ## Frac( a ) ) ||                                 \
7340         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7341           extractFloat ## s ## Frac( b ) )) {                                \
7342         if (!is_quiet ||                                                     \
7343             float ## s ## _is_signaling_nan( a ) ||                          \
7344             float ## s ## _is_signaling_nan( b ) ) {                         \
7345             float_raise(float_flag_invalid, status);                         \
7346         }                                                                    \
7347         return float_relation_unordered;                                     \
7348     }                                                                        \
7349     aSign = extractFloat ## s ## Sign( a );                                  \
7350     bSign = extractFloat ## s ## Sign( b );                                  \
7351     av = float ## s ## _val(a);                                              \
7352     bv = float ## s ## _val(b);                                              \
7353     if ( aSign != bSign ) {                                                  \
7354         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7355             /* zero case */                                                  \
7356             return float_relation_equal;                                     \
7357         } else {                                                             \
7358             return 1 - (2 * aSign);                                          \
7359         }                                                                    \
7360     } else {                                                                 \
7361         if (av == bv) {                                                      \
7362             return float_relation_equal;                                     \
7363         } else {                                                             \
7364             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7365         }                                                                    \
7366     }                                                                        \
7367 }                                                                            \
7368                                                                              \
7369 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7370 {                                                                            \
7371     return float ## s ## _compare_internal(a, b, 0, status);                 \
7372 }                                                                            \
7373                                                                              \
7374 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7375                                  float_status *status)                       \
7376 {                                                                            \
7377     return float ## s ## _compare_internal(a, b, 1, status);                 \
7378 }
7379
7380 COMPARE(32, 0xff)
7381 COMPARE(64, 0x7ff)
7382
7383 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7384                                             int is_quiet, float_status *status)
7385 {
7386     flag aSign, bSign;
7387
7388     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7389           ( extractFloatx80Frac( a )<<1 ) ) ||
7390         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7391           ( extractFloatx80Frac( b )<<1 ) )) {
7392         if (!is_quiet ||
7393             floatx80_is_signaling_nan( a ) ||
7394             floatx80_is_signaling_nan( b ) ) {
7395             float_raise(float_flag_invalid, status);
7396         }
7397         return float_relation_unordered;
7398     }
7399     aSign = extractFloatx80Sign( a );
7400     bSign = extractFloatx80Sign( b );
7401     if ( aSign != bSign ) {
7402
7403         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7404              ( ( a.low | b.low ) == 0 ) ) {
7405             /* zero case */
7406             return float_relation_equal;
7407         } else {
7408             return 1 - (2 * aSign);
7409         }
7410     } else {
7411         if (a.low == b.low && a.high == b.high) {
7412             return float_relation_equal;
7413         } else {
7414             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7415         }
7416     }
7417 }
7418
7419 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7420 {
7421     return floatx80_compare_internal(a, b, 0, status);
7422 }
7423
7424 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7425 {
7426     return floatx80_compare_internal(a, b, 1, status);
7427 }
7428
7429 static inline int float128_compare_internal(float128 a, float128 b,
7430                                             int is_quiet, float_status *status)
7431 {
7432     flag aSign, bSign;
7433
7434     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7435           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7436         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7437           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7438         if (!is_quiet ||
7439             float128_is_signaling_nan( a ) ||
7440             float128_is_signaling_nan( b ) ) {
7441             float_raise(float_flag_invalid, status);
7442         }
7443         return float_relation_unordered;
7444     }
7445     aSign = extractFloat128Sign( a );
7446     bSign = extractFloat128Sign( b );
7447     if ( aSign != bSign ) {
7448         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7449             /* zero case */
7450             return float_relation_equal;
7451         } else {
7452             return 1 - (2 * aSign);
7453         }
7454     } else {
7455         if (a.low == b.low && a.high == b.high) {
7456             return float_relation_equal;
7457         } else {
7458             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7459         }
7460     }
7461 }
7462
7463 int float128_compare(float128 a, float128 b, float_status *status)
7464 {
7465     return float128_compare_internal(a, b, 0, status);
7466 }
7467
7468 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7469 {
7470     return float128_compare_internal(a, b, 1, status);
7471 }
7472
7473 /* min() and max() functions. These can't be implemented as
7474  * 'compare and pick one input' because that would mishandle
7475  * NaNs and +0 vs -0.
7476  *
7477  * minnum() and maxnum() functions. These are similar to the min()
7478  * and max() functions but if one of the arguments is a QNaN and
7479  * the other is numerical then the numerical argument is returned.
7480  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7481  * and maxNum() operations. min() and max() are the typical min/max
7482  * semantics provided by many CPUs which predate that specification.
7483  *
7484  * minnummag() and maxnummag() functions correspond to minNumMag()
7485  * and minNumMag() from the IEEE-754 2008.
7486  */
7487 #define MINMAX(s)                                                       \
7488 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7489                                                int ismin, int isieee,   \
7490                                                int ismag,               \
7491                                                float_status *status)    \
7492 {                                                                       \
7493     flag aSign, bSign;                                                  \
7494     uint ## s ## _t av, bv, aav, abv;                                   \
7495     a = float ## s ## _squash_input_denormal(a, status);                \
7496     b = float ## s ## _squash_input_denormal(b, status);                \
7497     if (float ## s ## _is_any_nan(a) ||                                 \
7498         float ## s ## _is_any_nan(b)) {                                 \
7499         if (isieee) {                                                   \
7500             if (float ## s ## _is_quiet_nan(a) &&                       \
7501                 !float ## s ##_is_any_nan(b)) {                         \
7502                 return b;                                               \
7503             } else if (float ## s ## _is_quiet_nan(b) &&                \
7504                        !float ## s ## _is_any_nan(a)) {                 \
7505                 return a;                                               \
7506             }                                                           \
7507         }                                                               \
7508         return propagateFloat ## s ## NaN(a, b, status);                \
7509     }                                                                   \
7510     aSign = extractFloat ## s ## Sign(a);                               \
7511     bSign = extractFloat ## s ## Sign(b);                               \
7512     av = float ## s ## _val(a);                                         \
7513     bv = float ## s ## _val(b);                                         \
7514     if (ismag) {                                                        \
7515         aav = float ## s ## _abs(av);                                   \
7516         abv = float ## s ## _abs(bv);                                   \
7517         if (aav != abv) {                                               \
7518             if (ismin) {                                                \
7519                 return (aav < abv) ? a : b;                             \
7520             } else {                                                    \
7521                 return (aav < abv) ? b : a;                             \
7522             }                                                           \
7523         }                                                               \
7524     }                                                                   \
7525     if (aSign != bSign) {                                               \
7526         if (ismin) {                                                    \
7527             return aSign ? a : b;                                       \
7528         } else {                                                        \
7529             return aSign ? b : a;                                       \
7530         }                                                               \
7531     } else {                                                            \
7532         if (ismin) {                                                    \
7533             return (aSign ^ (av < bv)) ? a : b;                         \
7534         } else {                                                        \
7535             return (aSign ^ (av < bv)) ? b : a;                         \
7536         }                                                               \
7537     }                                                                   \
7538 }                                                                       \
7539                                                                         \
7540 float ## s float ## s ## _min(float ## s a, float ## s b,               \
7541                               float_status *status)                     \
7542 {                                                                       \
7543     return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7544 }                                                                       \
7545                                                                         \
7546 float ## s float ## s ## _max(float ## s a, float ## s b,               \
7547                               float_status *status)                     \
7548 {                                                                       \
7549     return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7550 }                                                                       \
7551                                                                         \
7552 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7553                                  float_status *status)                  \
7554 {                                                                       \
7555     return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7556 }                                                                       \
7557                                                                         \
7558 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7559                                  float_status *status)                  \
7560 {                                                                       \
7561     return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7562 }                                                                       \
7563                                                                         \
7564 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7565                                     float_status *status)               \
7566 {                                                                       \
7567     return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7568 }                                                                       \
7569                                                                         \
7570 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7571                                     float_status *status)               \
7572 {                                                                       \
7573     return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7574 }
7575
7576 MINMAX(32)
7577 MINMAX(64)
7578
7579
7580 /* Multiply A by 2 raised to the power N.  */
7581 float32 float32_scalbn(float32 a, int n, float_status *status)
7582 {
7583     flag aSign;
7584     int16_t aExp;
7585     uint32_t aSig;
7586
7587     a = float32_squash_input_denormal(a, status);
7588     aSig = extractFloat32Frac( a );
7589     aExp = extractFloat32Exp( a );
7590     aSign = extractFloat32Sign( a );
7591
7592     if ( aExp == 0xFF ) {
7593         if ( aSig ) {
7594             return propagateFloat32NaN(a, a, status);
7595         }
7596         return a;
7597     }
7598     if (aExp != 0) {
7599         aSig |= 0x00800000;
7600     } else if (aSig == 0) {
7601         return a;
7602     } else {
7603         aExp++;
7604     }
7605
7606     if (n > 0x200) {
7607         n = 0x200;
7608     } else if (n < -0x200) {
7609         n = -0x200;
7610     }
7611
7612     aExp += n - 1;
7613     aSig <<= 7;
7614     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7615 }
7616
7617 float64 float64_scalbn(float64 a, int n, float_status *status)
7618 {
7619     flag aSign;
7620     int16_t aExp;
7621     uint64_t aSig;
7622
7623     a = float64_squash_input_denormal(a, status);
7624     aSig = extractFloat64Frac( a );
7625     aExp = extractFloat64Exp( a );
7626     aSign = extractFloat64Sign( a );
7627
7628     if ( aExp == 0x7FF ) {
7629         if ( aSig ) {
7630             return propagateFloat64NaN(a, a, status);
7631         }
7632         return a;
7633     }
7634     if (aExp != 0) {
7635         aSig |= LIT64( 0x0010000000000000 );
7636     } else if (aSig == 0) {
7637         return a;
7638     } else {
7639         aExp++;
7640     }
7641
7642     if (n > 0x1000) {
7643         n = 0x1000;
7644     } else if (n < -0x1000) {
7645         n = -0x1000;
7646     }
7647
7648     aExp += n - 1;
7649     aSig <<= 10;
7650     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7651 }
7652
7653 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7654 {
7655     flag aSign;
7656     int32_t aExp;
7657     uint64_t aSig;
7658
7659     aSig = extractFloatx80Frac( a );
7660     aExp = extractFloatx80Exp( a );
7661     aSign = extractFloatx80Sign( a );
7662
7663     if ( aExp == 0x7FFF ) {
7664         if ( aSig<<1 ) {
7665             return propagateFloatx80NaN(a, a, status);
7666         }
7667         return a;
7668     }
7669
7670     if (aExp == 0) {
7671         if (aSig == 0) {
7672             return a;
7673         }
7674         aExp++;
7675     }
7676
7677     if (n > 0x10000) {
7678         n = 0x10000;
7679     } else if (n < -0x10000) {
7680         n = -0x10000;
7681     }
7682
7683     aExp += n;
7684     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7685                                          aSign, aExp, aSig, 0, status);
7686 }
7687
7688 float128 float128_scalbn(float128 a, int n, float_status *status)
7689 {
7690     flag aSign;
7691     int32_t aExp;
7692     uint64_t aSig0, aSig1;
7693
7694     aSig1 = extractFloat128Frac1( a );
7695     aSig0 = extractFloat128Frac0( a );
7696     aExp = extractFloat128Exp( a );
7697     aSign = extractFloat128Sign( a );
7698     if ( aExp == 0x7FFF ) {
7699         if ( aSig0 | aSig1 ) {
7700             return propagateFloat128NaN(a, a, status);
7701         }
7702         return a;
7703     }
7704     if (aExp != 0) {
7705         aSig0 |= LIT64( 0x0001000000000000 );
7706     } else if (aSig0 == 0 && aSig1 == 0) {
7707         return a;
7708     } else {
7709         aExp++;
7710     }
7711
7712     if (n > 0x10000) {
7713         n = 0x10000;
7714     } else if (n < -0x10000) {
7715         n = -0x10000;
7716     }
7717
7718     aExp += n - 1;
7719     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7720                                          , status);
7721
7722 }