fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "config.h"
  86
  87 #include "fpu/softfloat.h"
  88
  89 /* We only need stdlib for abort() */
  90 #include <stdlib.h>
  91
  92 /*----------------------------------------------------------------------------
  93 | Primitive arithmetic functions, including multi-word arithmetic, and
  94 | division and square root approximations.  (Can be specialized to target if
  95 | desired.)
  96 *----------------------------------------------------------------------------*/
  97 #include "softfloat-macros.h"
  98
  99 /*----------------------------------------------------------------------------
 100 | Functions and definitions to determine:  (1) whether tininess for underflow
 101 | is detected before or after rounding by default, (2) what (if anything)
 102 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 103 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 104 | are propagated from function inputs to output.  These details are target-
 105 | specific.
 106 *----------------------------------------------------------------------------*/
 107 #include "softfloat-specialize.h"
 108
 109 /*----------------------------------------------------------------------------
 110 | Returns the fraction bits of the half-precision floating-point value `a'.
 111 *----------------------------------------------------------------------------*/
 112
 113 static inline uint32_t extractFloat16Frac(float16 a)
 114 {
 115     return float16_val(a) & 0x3ff;
 116 }
 117
 118 /*----------------------------------------------------------------------------
 119 | Returns the exponent bits of the half-precision floating-point value `a'.
 120 *----------------------------------------------------------------------------*/
 121
 122 static inline int_fast16_t extractFloat16Exp(float16 a)
 123 {
 124     return (float16_val(a) >> 10) & 0x1f;
 125 }
 126
 127 /*----------------------------------------------------------------------------
  128 | Returns the sign bit of the half-precision floating-point value `a'.
 129 *----------------------------------------------------------------------------*/
 130
 131 static inline flag extractFloat16Sign(float16 a)
 132 {
 133     return float16_val(a)>>15;
 134 }
 135
 136 /*----------------------------------------------------------------------------
 137 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 138 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 139 | input.  If `zSign' is 1, the input is negated before being converted to an
 140 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 141 | is simply rounded to an integer, with the inexact exception raised if the
 142 | input cannot be represented exactly as an integer.  However, if the fixed-
 143 | point input is too large, the invalid exception is raised and the largest
 144 | positive or negative integer is returned.
 145 *----------------------------------------------------------------------------*/
 146
 147 static int32 roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
 148 {
 149     int8 roundingMode;
 150     flag roundNearestEven;
 151     int8 roundIncrement, roundBits;
 152     int32_t z;
 153
 154     roundingMode = STATUS(float_rounding_mode);
 155     roundNearestEven = ( roundingMode == float_round_nearest_even );
 156     switch (roundingMode) {
 157     case float_round_nearest_even:
 158     case float_round_ties_away:
 159         roundIncrement = 0x40;
 160         break;
 161     case float_round_to_zero:
 162         roundIncrement = 0;
 163         break;
 164     case float_round_up:
 165         roundIncrement = zSign ? 0 : 0x7f;
 166         break;
 167     case float_round_down:
 168         roundIncrement = zSign ? 0x7f : 0;
 169         break;
 170     default:
 171         abort();
 172     }
 173     roundBits = absZ & 0x7F;
 174     absZ = ( absZ + roundIncrement )>>7;
 175     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 176     z = absZ;
 177     if ( zSign ) z = - z;
 178     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 179         float_raise( float_flag_invalid STATUS_VAR);
 180         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 181     }
 182     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 183     return z;
 184
 185 }
 186
 187 /*----------------------------------------------------------------------------
 188 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 189 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 190 | and returns the properly rounded 64-bit integer corresponding to the input.
 191 | If `zSign' is 1, the input is negated before being converted to an integer.
 192 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 193 | the inexact exception raised if the input cannot be represented exactly as
 194 | an integer.  However, if the fixed-point input is too large, the invalid
 195 | exception is raised and the largest positive or negative integer is
 196 | returned.
 197 *----------------------------------------------------------------------------*/
 198
 199 static int64 roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
 200                                float_status *status)
 201 {
 202     int8 roundingMode;
 203     flag roundNearestEven, increment;
 204     int64_t z;
 205
 206     roundingMode = STATUS(float_rounding_mode);
 207     roundNearestEven = ( roundingMode == float_round_nearest_even );
 208     switch (roundingMode) {
 209     case float_round_nearest_even:
 210     case float_round_ties_away:
 211         increment = ((int64_t) absZ1 < 0);
 212         break;
 213     case float_round_to_zero:
 214         increment = 0;
 215         break;
 216     case float_round_up:
 217         increment = !zSign && absZ1;
 218         break;
 219     case float_round_down:
 220         increment = zSign && absZ1;
 221         break;
 222     default:
 223         abort();
 224     }
 225     if ( increment ) {
 226         ++absZ0;
 227         if ( absZ0 == 0 ) goto overflow;
 228         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 229     }
 230     z = absZ0;
 231     if ( zSign ) z = - z;
 232     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 233  overflow:
 234         float_raise( float_flag_invalid STATUS_VAR);
 235         return
 236               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 237             : LIT64( 0x7FFFFFFFFFFFFFFF );
 238     }
 239     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 240     return z;
 241
 242 }
 243
 244 /*----------------------------------------------------------------------------
 245 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 246 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 247 | and returns the properly rounded 64-bit unsigned integer corresponding to the
 248 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
 249 | with the inexact exception raised if the input cannot be represented exactly
 250 | as an integer.  However, if the fixed-point input is too large, the invalid
 251 | exception is raised and the largest unsigned integer is returned.
 252 *----------------------------------------------------------------------------*/
 253
 254 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
 255                                 uint64_t absZ1, float_status *status)
 256 {
 257     int8 roundingMode;
 258     flag roundNearestEven, increment;
 259
 260     roundingMode = STATUS(float_rounding_mode);
 261     roundNearestEven = (roundingMode == float_round_nearest_even);
 262     switch (roundingMode) {
 263     case float_round_nearest_even:
 264     case float_round_ties_away:
 265         increment = ((int64_t)absZ1 < 0);
 266         break;
 267     case float_round_to_zero:
 268         increment = 0;
 269         break;
 270     case float_round_up:
 271         increment = !zSign && absZ1;
 272         break;
 273     case float_round_down:
 274         increment = zSign && absZ1;
 275         break;
 276     default:
 277         abort();
 278     }
 279     if (increment) {
 280         ++absZ0;
 281         if (absZ0 == 0) {
 282             float_raise(float_flag_invalid STATUS_VAR);
 283             return LIT64(0xFFFFFFFFFFFFFFFF);
 284         }
 285         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
 286     }
 287
 288     if (zSign && absZ0) {
 289         float_raise(float_flag_invalid STATUS_VAR);
 290         return 0;
 291     }
 292
 293     if (absZ1) {
 294         STATUS(float_exception_flags) |= float_flag_inexact;
 295     }
 296     return absZ0;
 297 }
 298
 299 /*----------------------------------------------------------------------------
 300 | Returns the fraction bits of the single-precision floating-point value `a'.
 301 *----------------------------------------------------------------------------*/
 302
 303 static inline uint32_t extractFloat32Frac( float32 a )
 304 {
 305
 306     return float32_val(a) & 0x007FFFFF;
 307
 308 }
 309
 310 /*----------------------------------------------------------------------------
 311 | Returns the exponent bits of the single-precision floating-point value `a'.
 312 *----------------------------------------------------------------------------*/
 313
 314 static inline int_fast16_t extractFloat32Exp(float32 a)
 315 {
 316
 317     return ( float32_val(a)>>23 ) & 0xFF;
 318
 319 }
 320
 321 /*----------------------------------------------------------------------------
 322 | Returns the sign bit of the single-precision floating-point value `a'.
 323 *----------------------------------------------------------------------------*/
 324
 325 static inline flag extractFloat32Sign( float32 a )
 326 {
 327
 328     return float32_val(a)>>31;
 329
 330 }
 331
 332 /*----------------------------------------------------------------------------
 333 | If `a' is denormal and we are in flush-to-zero mode then set the
 334 | input-denormal exception and return zero. Otherwise just return the value.
 335 *----------------------------------------------------------------------------*/
 336 float32 float32_squash_input_denormal(float32 a, float_status *status)
 337 {
 338     if (STATUS(flush_inputs_to_zero)) {
 339         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 340             float_raise(float_flag_input_denormal STATUS_VAR);
 341             return make_float32(float32_val(a) & 0x80000000);
 342         }
 343     }
 344     return a;
 345 }
 346
 347 /*----------------------------------------------------------------------------
 348 | Normalizes the subnormal single-precision floating-point value represented
 349 | by the denormalized significand `aSig'.  The normalized exponent and
 350 | significand are stored at the locations pointed to by `zExpPtr' and
 351 | `zSigPtr', respectively.
 352 *----------------------------------------------------------------------------*/
 353
 354 static void
 355  normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
 356 {
 357     int8 shiftCount;
 358
 359     shiftCount = countLeadingZeros32( aSig ) - 8;
 360     *zSigPtr = aSig<<shiftCount;
 361     *zExpPtr = 1 - shiftCount;
 362
 363 }
 364
 365 /*----------------------------------------------------------------------------
 366 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 367 | single-precision floating-point value, returning the result.  After being
 368 | shifted into the proper positions, the three fields are simply added
 369 | together to form the result.  This means that any integer portion of `zSig'
 370 | will be added into the exponent.  Since a properly normalized significand
 371 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 372 | than the desired result exponent whenever `zSig' is a complete, normalized
 373 | significand.
 374 *----------------------------------------------------------------------------*/
 375
 376 static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
 377 {
 378
 379     return make_float32(
 380           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 381
 382 }
 383
 384 /*----------------------------------------------------------------------------
 385 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 386 | and significand `zSig', and returns the proper single-precision floating-
 387 | point value corresponding to the abstract input.  Ordinarily, the abstract
 388 | value is simply rounded and packed into the single-precision format, with
 389 | the inexact exception raised if the abstract input cannot be represented
 390 | exactly.  However, if the abstract value is too large, the overflow and
 391 | inexact exceptions are raised and an infinity or maximal finite value is
 392 | returned.  If the abstract value is too small, the input value is rounded to
 393 | a subnormal number, and the underflow and inexact exceptions are raised if
 394 | the abstract input cannot be represented exactly as a subnormal single-
 395 | precision floating-point number.
 396 |     The input significand `zSig' has its binary point between bits 30
 397 | and 29, which is 7 bits to the left of the usual location.  This shifted
 398 | significand must be normalized or smaller.  If `zSig' is not normalized,
 399 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 400 | and it must not require rounding.  In the usual case that `zSig' is
 401 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 402 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 403 | Binary Floating-Point Arithmetic.
 404 *----------------------------------------------------------------------------*/
 405
 406 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
 407                                    float_status *status)
 408 {
 409     int8 roundingMode;
 410     flag roundNearestEven;
 411     int8 roundIncrement, roundBits;
 412     flag isTiny;
 413
 414     roundingMode = STATUS(float_rounding_mode);
 415     roundNearestEven = ( roundingMode == float_round_nearest_even );
 416     switch (roundingMode) {
 417     case float_round_nearest_even:
 418     case float_round_ties_away:
 419         roundIncrement = 0x40;
 420         break;
 421     case float_round_to_zero:
 422         roundIncrement = 0;
 423         break;
 424     case float_round_up:
 425         roundIncrement = zSign ? 0 : 0x7f;
 426         break;
 427     case float_round_down:
 428         roundIncrement = zSign ? 0x7f : 0;
 429         break;
 430     default:
 431         abort();
 432         break;
 433     }
 434     roundBits = zSig & 0x7F;
 435     if ( 0xFD <= (uint16_t) zExp ) {
 436         if (    ( 0xFD < zExp )
 437              || (    ( zExp == 0xFD )
 438                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 439            ) {
 440             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 441             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 442         }
 443         if ( zExp < 0 ) {
 444             if (STATUS(flush_to_zero)) {
 445                 float_raise(float_flag_output_denormal STATUS_VAR);
 446                 return packFloat32(zSign, 0, 0);
 447             }
 448             isTiny =
 449                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 450                 || ( zExp < -1 )
 451                 || ( zSig + roundIncrement < 0x80000000 );
 452             shift32RightJamming( zSig, - zExp, &zSig );
 453             zExp = 0;
 454             roundBits = zSig & 0x7F;
 455             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 456         }
 457     }
 458     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 459     zSig = ( zSig + roundIncrement )>>7;
 460     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 461     if ( zSig == 0 ) zExp = 0;
 462     return packFloat32( zSign, zExp, zSig );
 463
 464 }
 465
 466 /*----------------------------------------------------------------------------
 467 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 468 | and significand `zSig', and returns the proper single-precision floating-
 469 | point value corresponding to the abstract input.  This routine is just like
 470 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 471 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 472 | floating-point exponent.
 473 *----------------------------------------------------------------------------*/
 474
 475 static float32
 476  normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
 477                               float_status *status)
 478 {
 479     int8 shiftCount;
 480
 481     shiftCount = countLeadingZeros32( zSig ) - 1;
 482     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 483
 484 }
 485
 486 /*----------------------------------------------------------------------------
 487 | Returns the fraction bits of the double-precision floating-point value `a'.
 488 *----------------------------------------------------------------------------*/
 489
 490 static inline uint64_t extractFloat64Frac( float64 a )
 491 {
 492
 493     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 494
 495 }
 496
 497 /*----------------------------------------------------------------------------
 498 | Returns the exponent bits of the double-precision floating-point value `a'.
 499 *----------------------------------------------------------------------------*/
 500
 501 static inline int_fast16_t extractFloat64Exp(float64 a)
 502 {
 503
 504     return ( float64_val(a)>>52 ) & 0x7FF;
 505
 506 }
 507
 508 /*----------------------------------------------------------------------------
 509 | Returns the sign bit of the double-precision floating-point value `a'.
 510 *----------------------------------------------------------------------------*/
 511
 512 static inline flag extractFloat64Sign( float64 a )
 513 {
 514
 515     return float64_val(a)>>63;
 516
 517 }
 518
 519 /*----------------------------------------------------------------------------
 520 | If `a' is denormal and we are in flush-to-zero mode then set the
 521 | input-denormal exception and return zero. Otherwise just return the value.
 522 *----------------------------------------------------------------------------*/
 523 float64 float64_squash_input_denormal(float64 a, float_status *status)
 524 {
 525     if (STATUS(flush_inputs_to_zero)) {
 526         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 527             float_raise(float_flag_input_denormal STATUS_VAR);
 528             return make_float64(float64_val(a) & (1ULL << 63));
 529         }
 530     }
 531     return a;
 532 }
 533
 534 /*----------------------------------------------------------------------------
 535 | Normalizes the subnormal double-precision floating-point value represented
 536 | by the denormalized significand `aSig'.  The normalized exponent and
 537 | significand are stored at the locations pointed to by `zExpPtr' and
 538 | `zSigPtr', respectively.
 539 *----------------------------------------------------------------------------*/
 540
 541 static void
 542  normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
 543 {
 544     int8 shiftCount;
 545
 546     shiftCount = countLeadingZeros64( aSig ) - 11;
 547     *zSigPtr = aSig<<shiftCount;
 548     *zExpPtr = 1 - shiftCount;
 549
 550 }
 551
 552 /*----------------------------------------------------------------------------
 553 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 554 | double-precision floating-point value, returning the result.  After being
 555 | shifted into the proper positions, the three fields are simply added
 556 | together to form the result.  This means that any integer portion of `zSig'
 557 | will be added into the exponent.  Since a properly normalized significand
 558 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 559 | than the desired result exponent whenever `zSig' is a complete, normalized
 560 | significand.
 561 *----------------------------------------------------------------------------*/
 562
 563 static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
 564 {
 565
 566     return make_float64(
 567         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 568
 569 }
 570
 571 /*----------------------------------------------------------------------------
 572 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 573 | and significand `zSig', and returns the proper double-precision floating-
 574 | point value corresponding to the abstract input.  Ordinarily, the abstract
 575 | value is simply rounded and packed into the double-precision format, with
 576 | the inexact exception raised if the abstract input cannot be represented
 577 | exactly.  However, if the abstract value is too large, the overflow and
 578 | inexact exceptions are raised and an infinity or maximal finite value is
 579 | returned.  If the abstract value is too small, the input value is rounded to
 580 | a subnormal number, and the underflow and inexact exceptions are raised if
 581 | the abstract input cannot be represented exactly as a subnormal double-
 582 | precision floating-point number.
 583 |     The input significand `zSig' has its binary point between bits 62
 584 | and 61, which is 10 bits to the left of the usual location.  This shifted
 585 | significand must be normalized or smaller.  If `zSig' is not normalized,
 586 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 587 | and it must not require rounding.  In the usual case that `zSig' is
 588 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 589 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 590 | Binary Floating-Point Arithmetic.
 591 *----------------------------------------------------------------------------*/
 592
 593 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
 594                                    float_status *status)
 595 {
 596     int8 roundingMode;
 597     flag roundNearestEven;
 598     int_fast16_t roundIncrement, roundBits;
 599     flag isTiny;
 600
 601     roundingMode = STATUS(float_rounding_mode);
 602     roundNearestEven = ( roundingMode == float_round_nearest_even );
 603     switch (roundingMode) {
 604     case float_round_nearest_even:
 605     case float_round_ties_away:
 606         roundIncrement = 0x200;
 607         break;
 608     case float_round_to_zero:
 609         roundIncrement = 0;
 610         break;
 611     case float_round_up:
 612         roundIncrement = zSign ? 0 : 0x3ff;
 613         break;
 614     case float_round_down:
 615         roundIncrement = zSign ? 0x3ff : 0;
 616         break;
 617     default:
 618         abort();
 619     }
 620     roundBits = zSig & 0x3FF;
 621     if ( 0x7FD <= (uint16_t) zExp ) {
 622         if (    ( 0x7FD < zExp )
 623              || (    ( zExp == 0x7FD )
 624                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 625            ) {
 626             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 627             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 628         }
 629         if ( zExp < 0 ) {
 630             if (STATUS(flush_to_zero)) {
 631                 float_raise(float_flag_output_denormal STATUS_VAR);
 632                 return packFloat64(zSign, 0, 0);
 633             }
 634             isTiny =
 635                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 636                 || ( zExp < -1 )
 637                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 638             shift64RightJamming( zSig, - zExp, &zSig );
 639             zExp = 0;
 640             roundBits = zSig & 0x3FF;
 641             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 642         }
 643     }
 644     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 645     zSig = ( zSig + roundIncrement )>>10;
 646     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 647     if ( zSig == 0 ) zExp = 0;
 648     return packFloat64( zSign, zExp, zSig );
 649
 650 }
 651
 652 /*----------------------------------------------------------------------------
 653 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 654 | and significand `zSig', and returns the proper double-precision floating-
 655 | point value corresponding to the abstract input.  This routine is just like
 656 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 657 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 658 | floating-point exponent.
 659 *----------------------------------------------------------------------------*/
 660
 661 static float64
 662  normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
 663                               float_status *status)
 664 {
 665     int8 shiftCount;
 666
 667     shiftCount = countLeadingZeros64( zSig ) - 1;
 668     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 669
 670 }
 671
 672 /*----------------------------------------------------------------------------
 673 | Returns the fraction bits of the extended double-precision floating-point
 674 | value `a'.
 675 *----------------------------------------------------------------------------*/
 676
 677 static inline uint64_t extractFloatx80Frac( floatx80 a )
 678 {
 679
 680     return a.low;
 681
 682 }
 683
 684 /*----------------------------------------------------------------------------
 685 | Returns the exponent bits of the extended double-precision floating-point
 686 | value `a'.
 687 *----------------------------------------------------------------------------*/
 688
 689 static inline int32 extractFloatx80Exp( floatx80 a )
 690 {
 691
 692     return a.high & 0x7FFF;
 693
 694 }
 695
 696 /*----------------------------------------------------------------------------
 697 | Returns the sign bit of the extended double-precision floating-point value
 698 | `a'.
 699 *----------------------------------------------------------------------------*/
 700
 701 static inline flag extractFloatx80Sign( floatx80 a )
 702 {
 703
 704     return a.high>>15;
 705
 706 }
 707
 708 /*----------------------------------------------------------------------------
 709 | Normalizes the subnormal extended double-precision floating-point value
 710 | represented by the denormalized significand `aSig'.  The normalized exponent
 711 | and significand are stored at the locations pointed to by `zExpPtr' and
 712 | `zSigPtr', respectively.
 713 *----------------------------------------------------------------------------*/
 714
 715 static void
 716  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 717 {
 718     int8 shiftCount;
 719
 720     shiftCount = countLeadingZeros64( aSig );
 721     *zSigPtr = aSig<<shiftCount;
 722     *zExpPtr = 1 - shiftCount;
 723
 724 }
 725
 726 /*----------------------------------------------------------------------------
 727 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 728 | extended double-precision floating-point value, returning the result.
 729 *----------------------------------------------------------------------------*/
 730
 731 static inline floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 732 {
 733     floatx80 z;
 734
 735     z.low = zSig;
 736     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 737     return z;
 738
 739 }
 740
 741 /*----------------------------------------------------------------------------
 742 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 743 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 744 | and returns the proper extended double-precision floating-point value
 745 | corresponding to the abstract input.  Ordinarily, the abstract value is
 746 | rounded and packed into the extended double-precision format, with the
 747 | inexact exception raised if the abstract input cannot be represented
 748 | exactly.  However, if the abstract value is too large, the overflow and
 749 | inexact exceptions are raised and an infinity or maximal finite value is
 750 | returned.  If the abstract value is too small, the input value is rounded to
 751 | a subnormal number, and the underflow and inexact exceptions are raised if
 752 | the abstract input cannot be represented exactly as a subnormal extended
 753 | double-precision floating-point number.
 754 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 755 | number of bits as single or double precision, respectively.  Otherwise, the
 756 | result is rounded to the full precision of the extended double-precision
 757 | format.
 758 |     The input significand must be normalized or smaller.  If the input
 759 | significand is not normalized, `zExp' must be 0; in that case, the result
 760 | returned is a subnormal number, and it must not require rounding.  The
 761 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 762 | Floating-Point Arithmetic.
 763 *----------------------------------------------------------------------------*/
 764
static floatx80 roundAndPackFloatx80(int8 roundingPrecision, flag zSign,
                                     int32 zExp, uint64_t zSig0, uint64_t zSig1,
                                     float_status *status)
{
    int8 roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64 roundIncrement, roundMask, roundBits;

    roundingMode = STATUS(float_rounding_mode);
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Only 64 and 32 select reduced-precision rounding; 80 and every other
     * value take the full-precision path at `precision80'.
     */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        /* Low 11 bits of zSig0 are discarded; the increment is half of
         * the lowest kept bit.
         */
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        /* Low 40 bits of zSig0 are discarded; the increment is half of
         * the lowest kept bit.
         */
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        goto precision80;
    }
    /* Fold any nonzero bits of zSig1 into bit 0 of zSig0 as a sticky bit. */
    zSig0 |= ( zSig1 != 0 );
    /* The nearest modes keep the half-way increment chosen above; the
     * directed modes replace it with either 0 or the full mask depending
     * on the sign.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* The unsigned comparison is true when zExp >= 0x7FFE or when
     * zExp <= 0 (zExp - 1 then converts to a large unsigned value).
     */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /* Overflow: exponent too large, or the increment carries out of
         * zSig0 while the exponent is already 0x7FFE.
         */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tiny if tininess is detected before rounding, if the
             * exponent is below zero, or if adding the increment does
             * not carry out of zSig0.
             */
            isTiny =
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            /* Recompute the discarded bits after the shift. */
            roundBits = zSig0 & roundMask;
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
            zSig0 += roundIncrement;
            /* Rounding set bit 63 of the significand: pack with exponent 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /* roundIncrement now holds the lowest kept bit. */
            roundIncrement = roundMask + 1;
            /* Exact tie in nearest-even mode: widen the mask by one bit so
             * the lowest kept bit is cleared as well.
             */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    zSig0 += roundIncrement;
    /* The addition wrapped around: bump the exponent and restart the
     * significand at 0x8000000000000000.
     */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    /* roundIncrement now holds the lowest kept bit. */
    roundIncrement = roundMask + 1;
    /* Exact tie in nearest-even mode: also clear the lowest kept bit. */
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full precision: zSig1 holds the bits below the significand.  In the
     * nearest modes round up when its top bit is set; in the directed
     * modes round up when it is nonzero and the sign matches the direction.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    /* Same range test as above: zExp >= 0x7FFE or zExp <= 0. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* Full-precision overflow: clear the mask so ~roundMask below
             * is an all-ones significand.
             */
            roundMask = 0;
 overflow:
            /* Also reached by `goto overflow' from the reduced-precision
             * path, where roundMask still holds the discarded-bit mask.
             */
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
            /* Rounding toward zero (for this sign) packs exponent 0x7FFE
             * with significand ~roundMask; otherwise exponent 0x7FFF with
             * significand 0x8000000000000000 is packed.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( zExp <= 0 ) {
            /* Tiny if tininess is detected before rounding, if the
             * exponent is below zero, if no increment is pending, or if
             * zSig0 is not all ones.
             */
            isTiny =
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
            /* Re-derive the increment from the shifted zSig1. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Nearest-even tie (all bits of zSig1 below the top one
                 * are zero): clear bit 0 of the result.
                 */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Rounding set bit 63: pack with exponent 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* The increment wrapped the significand: bump the exponent and
             * restart at 0x8000000000000000.
             */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Nearest-even tie: clear bit 0 of the result. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
 937
 938 /*----------------------------------------------------------------------------
 939 | Takes an abstract floating-point value having sign `zSign', exponent
 940 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 941 | and returns the proper extended double-precision floating-point value
 942 | corresponding to the abstract input.  This routine is just like
 943 | `roundAndPackFloatx80' except that the input significand does not have to be
 944 | normalized.
 945 *----------------------------------------------------------------------------*/
 946
 947 static floatx80 normalizeRoundAndPackFloatx80(int8 roundingPrecision,
 948                                               flag zSign, int32 zExp,
 949                                               uint64_t zSig0, uint64_t zSig1,
 950                                               float_status *status)
 951 {
 952     int8 shiftCount;
 953
 954     if ( zSig0 == 0 ) {
 955         zSig0 = zSig1;
 956         zSig1 = 0;
 957         zExp -= 64;
 958     }
 959     shiftCount = countLeadingZeros64( zSig0 );
 960     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 961     zExp -= shiftCount;
 962     return
 963         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 964
 965 }
 966
 967 /*----------------------------------------------------------------------------
 968 | Returns the least-significant 64 fraction bits of the quadruple-precision
 969 | floating-point value `a'.
 970 *----------------------------------------------------------------------------*/
 971
 972 static inline uint64_t extractFloat128Frac1( float128 a )
 973 {
 974
 975     return a.low;
 976
 977 }
 978
 979 /*----------------------------------------------------------------------------
 980 | Returns the most-significant 48 fraction bits of the quadruple-precision
 981 | floating-point value `a'.
 982 *----------------------------------------------------------------------------*/
 983
 984 static inline uint64_t extractFloat128Frac0( float128 a )
 985 {
 986
 987     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 988
 989 }
 990
 991 /*----------------------------------------------------------------------------
 992 | Returns the exponent bits of the quadruple-precision floating-point value
 993 | `a'.
 994 *----------------------------------------------------------------------------*/
 995
 996 static inline int32 extractFloat128Exp( float128 a )
 997 {
 998
 999     return ( a.high>>48 ) & 0x7FFF;
1000
1001 }
1002
1003 /*----------------------------------------------------------------------------
1004 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1005 *----------------------------------------------------------------------------*/
1006
1007 static inline flag extractFloat128Sign( float128 a )
1008 {
1009
1010     return a.high>>63;
1011
1012 }
1013
1014 /*----------------------------------------------------------------------------
1015 | Normalizes the subnormal quadruple-precision floating-point value
1016 | represented by the denormalized significand formed by the concatenation of
1017 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
1018 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
1019 | significand are stored at the location pointed to by `zSig0Ptr', and the
1020 | least significant 64 bits of the normalized significand are stored at the
1021 | location pointed to by `zSig1Ptr'.
1022 *----------------------------------------------------------------------------*/
1023
1024 static void
1025  normalizeFloat128Subnormal(
1026      uint64_t aSig0,
1027      uint64_t aSig1,
1028      int32 *zExpPtr,
1029      uint64_t *zSig0Ptr,
1030      uint64_t *zSig1Ptr
1031  )
1032 {
1033     int8 shiftCount;
1034
1035     if ( aSig0 == 0 ) {
1036         shiftCount = countLeadingZeros64( aSig1 ) - 15;
1037         if ( shiftCount < 0 ) {
1038             *zSig0Ptr = aSig1>>( - shiftCount );
1039             *zSig1Ptr = aSig1<<( shiftCount & 63 );
1040         }
1041         else {
1042             *zSig0Ptr = aSig1<<shiftCount;
1043             *zSig1Ptr = 0;
1044         }
1045         *zExpPtr = - shiftCount - 63;
1046     }
1047     else {
1048         shiftCount = countLeadingZeros64( aSig0 ) - 15;
1049         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1050         *zExpPtr = 1 - shiftCount;
1051     }
1052
1053 }
1054
1055 /*----------------------------------------------------------------------------
1056 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1057 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1058 | floating-point value, returning the result.  After being shifted into the
1059 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
| added together to form the most significant 64 bits of the result.  This
1061 | means that any integer portion of `zSig0' will be added into the exponent.
1062 | Since a properly normalized significand will have an integer portion equal
1063 | to 1, the `zExp' input should be 1 less than the desired result exponent
1064 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1065 | significand.
1066 *----------------------------------------------------------------------------*/
1067
1068 static inline float128
1069  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1070 {
1071     float128 z;
1072
1073     z.low = zSig1;
1074     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1075     return z;
1076
1077 }
1078
1079 /*----------------------------------------------------------------------------
1080 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1081 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1082 | and `zSig2', and returns the proper quadruple-precision floating-point value
1083 | corresponding to the abstract input.  Ordinarily, the abstract value is
1084 | simply rounded and packed into the quadruple-precision format, with the
1085 | inexact exception raised if the abstract input cannot be represented
1086 | exactly.  However, if the abstract value is too large, the overflow and
1087 | inexact exceptions are raised and an infinity or maximal finite value is
1088 | returned.  If the abstract value is too small, the input value is rounded to
1089 | a subnormal number, and the underflow and inexact exceptions are raised if
1090 | the abstract input cannot be represented exactly as a subnormal quadruple-
1091 | precision floating-point number.
1092 |     The input significand must be normalized or smaller.  If the input
1093 | significand is not normalized, `zExp' must be 0; in that case, the result
1094 | returned is a subnormal number, and it must not require rounding.  In the
1095 | usual case that the input significand is normalized, `zExp' must be 1 less
1096 | than the ``true'' floating-point exponent.  The handling of underflow and
1097 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1098 *----------------------------------------------------------------------------*/
1099
static float128 roundAndPackFloat128(flag zSign, int32 zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8 roundingMode;
    flag roundNearestEven, increment, isTiny;

    roundingMode = STATUS(float_rounding_mode);
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the bits below the significand.  In the nearest modes
     * round up when its top bit is set; in the directed modes round up
     * when it is nonzero and the sign matches the direction.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    default:
        abort();
    }
    /* The unsigned comparison is true when zExp >= 0x7FFD or when zExp is
     * negative (it then converts to a large unsigned value).
     */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /* Overflow: exponent too large, or exponent is 0x7FFD with an
         * all-ones significand that is about to be incremented.
         */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
            /* Rounding toward zero (for this sign) packs exponent 0x7FFE
             * with an all-ones fraction; otherwise exponent 0x7FFF with a
             * zero fraction is packed.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tiny if tininess is detected before rounding, if the
             * exponent is below -1, if no increment is pending, or if the
             * significand is below the all-ones pattern.
             */
            isTiny =
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
            /* Re-derive the increment from the shifted zSig2. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Nearest-even tie (all bits of zSig2 below the top one are
         * zero): clear bit 0 of the result.
         */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
1202
1203 /*----------------------------------------------------------------------------
1204 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1205 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1206 | returns the proper quadruple-precision floating-point value corresponding
1207 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1208 | except that the input significand has fewer bits and does not have to be
1209 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1210 | point exponent.
1211 *----------------------------------------------------------------------------*/
1212
1213 static float128 normalizeRoundAndPackFloat128(flag zSign, int32 zExp,
1214                                               uint64_t zSig0, uint64_t zSig1,
1215                                               float_status *status)
1216 {
1217     int8 shiftCount;
1218     uint64_t zSig2;
1219
1220     if ( zSig0 == 0 ) {
1221         zSig0 = zSig1;
1222         zSig1 = 0;
1223         zExp -= 64;
1224     }
1225     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1226     if ( 0 <= shiftCount ) {
1227         zSig2 = 0;
1228         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1229     }
1230     else {
1231         shift128ExtraRightJamming(
1232             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1233     }
1234     zExp -= shiftCount;
1235     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1236
1237 }
1238
1239 /*----------------------------------------------------------------------------
1240 | Returns the result of converting the 32-bit two's complement integer `a'
1241 | to the single-precision floating-point format.  The conversion is performed
1242 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1243 *----------------------------------------------------------------------------*/
1244
1245 float32 int32_to_float32(int32_t a, float_status *status)
1246 {
1247     flag zSign;
1248
1249     if ( a == 0 ) return float32_zero;
1250     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1251     zSign = ( a < 0 );
1252     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1253
1254 }
1255
1256 /*----------------------------------------------------------------------------
1257 | Returns the result of converting the 32-bit two's complement integer `a'
1258 | to the double-precision floating-point format.  The conversion is performed
1259 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1260 *----------------------------------------------------------------------------*/
1261
1262 float64 int32_to_float64(int32_t a, float_status *status)
1263 {
1264     flag zSign;
1265     uint32 absA;
1266     int8 shiftCount;
1267     uint64_t zSig;
1268
1269     if ( a == 0 ) return float64_zero;
1270     zSign = ( a < 0 );
1271     absA = zSign ? - a : a;
1272     shiftCount = countLeadingZeros32( absA ) + 21;
1273     zSig = absA;
1274     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1275
1276 }
1277
1278 /*----------------------------------------------------------------------------
1279 | Returns the result of converting the 32-bit two's complement integer `a'
1280 | to the extended double-precision floating-point format.  The conversion
1281 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1282 | Arithmetic.
1283 *----------------------------------------------------------------------------*/
1284
1285 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1286 {
1287     flag zSign;
1288     uint32 absA;
1289     int8 shiftCount;
1290     uint64_t zSig;
1291
1292     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1293     zSign = ( a < 0 );
1294     absA = zSign ? - a : a;
1295     shiftCount = countLeadingZeros32( absA ) + 32;
1296     zSig = absA;
1297     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1298
1299 }
1300
1301 /*----------------------------------------------------------------------------
1302 | Returns the result of converting the 32-bit two's complement integer `a' to
1303 | the quadruple-precision floating-point format.  The conversion is performed
1304 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1305 *----------------------------------------------------------------------------*/
1306
1307 float128 int32_to_float128(int32_t a, float_status *status)
1308 {
1309     flag zSign;
1310     uint32 absA;
1311     int8 shiftCount;
1312     uint64_t zSig0;
1313
1314     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1315     zSign = ( a < 0 );
1316     absA = zSign ? - a : a;
1317     shiftCount = countLeadingZeros32( absA ) + 17;
1318     zSig0 = absA;
1319     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1320
1321 }
1322
1323 /*----------------------------------------------------------------------------
1324 | Returns the result of converting the 64-bit two's complement integer `a'
1325 | to the single-precision floating-point format.  The conversion is performed
1326 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1327 *----------------------------------------------------------------------------*/
1328
1329 float32 int64_to_float32(int64_t a, float_status *status)
1330 {
1331     flag zSign;
1332     uint64 absA;
1333     int8 shiftCount;
1334
1335     if ( a == 0 ) return float32_zero;
1336     zSign = ( a < 0 );
1337     absA = zSign ? - a : a;
1338     shiftCount = countLeadingZeros64( absA ) - 40;
1339     if ( 0 <= shiftCount ) {
1340         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1341     }
1342     else {
1343         shiftCount += 7;
1344         if ( shiftCount < 0 ) {
1345             shift64RightJamming( absA, - shiftCount, &absA );
1346         }
1347         else {
1348             absA <<= shiftCount;
1349         }
1350         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1351     }
1352
1353 }
1354
1355 /*----------------------------------------------------------------------------
1356 | Returns the result of converting the 64-bit two's complement integer `a'
1357 | to the double-precision floating-point format.  The conversion is performed
1358 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1359 *----------------------------------------------------------------------------*/
1360
1361 float64 int64_to_float64(int64_t a, float_status *status)
1362 {
1363     flag zSign;
1364
1365     if ( a == 0 ) return float64_zero;
1366     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1367         return packFloat64( 1, 0x43E, 0 );
1368     }
1369     zSign = ( a < 0 );
1370     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1371
1372 }
1373
1374 /*----------------------------------------------------------------------------
1375 | Returns the result of converting the 64-bit two's complement integer `a'
1376 | to the extended double-precision floating-point format.  The conversion
1377 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1378 | Arithmetic.
1379 *----------------------------------------------------------------------------*/
1380
1381 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1382 {
1383     flag zSign;
1384     uint64 absA;
1385     int8 shiftCount;
1386
1387     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1388     zSign = ( a < 0 );
1389     absA = zSign ? - a : a;
1390     shiftCount = countLeadingZeros64( absA );
1391     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1392
1393 }
1394
1395 /*----------------------------------------------------------------------------
1396 | Returns the result of converting the 64-bit two's complement integer `a' to
1397 | the quadruple-precision floating-point format.  The conversion is performed
1398 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1399 *----------------------------------------------------------------------------*/
1400
1401 float128 int64_to_float128(int64_t a, float_status *status)
1402 {
1403     flag zSign;
1404     uint64 absA;
1405     int8 shiftCount;
1406     int32 zExp;
1407     uint64_t zSig0, zSig1;
1408
1409     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1410     zSign = ( a < 0 );
1411     absA = zSign ? - a : a;
1412     shiftCount = countLeadingZeros64( absA ) + 49;
1413     zExp = 0x406E - shiftCount;
1414     if ( 64 <= shiftCount ) {
1415         zSig1 = 0;
1416         zSig0 = absA;
1417         shiftCount -= 64;
1418     }
1419     else {
1420         zSig1 = absA;
1421         zSig0 = 0;
1422     }
1423     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1424     return packFloat128( zSign, zExp, zSig0, zSig1 );
1425
1426 }
1427
1428 /*----------------------------------------------------------------------------
1429 | Returns the result of converting the 64-bit unsigned integer `a'
1430 | to the single-precision floating-point format.  The conversion is performed
1431 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1432 *----------------------------------------------------------------------------*/
1433
1434 float32 uint64_to_float32(uint64_t a, float_status *status)
1435 {
1436     int shiftcount;
1437
1438     if (a == 0) {
1439         return float32_zero;
1440     }
1441
1442     /* Determine (left) shift needed to put first set bit into bit posn 23
1443      * (since packFloat32() expects the binary point between bits 23 and 22);
1444      * this is the fast case for smallish numbers.
1445      */
1446     shiftcount = countLeadingZeros64(a) - 40;
1447     if (shiftcount >= 0) {
1448         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1449     }
1450     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1451      * expects the binary point between bits 30 and 29, hence the + 7.
1452      */
1453     shiftcount += 7;
1454     if (shiftcount < 0) {
1455         shift64RightJamming(a, -shiftcount, &a);
1456     } else {
1457         a <<= shiftcount;
1458     }
1459
1460     return roundAndPackFloat32(0, 0x9c - shiftcount, a STATUS_VAR);
1461 }
1462
1463 /*----------------------------------------------------------------------------
1464 | Returns the result of converting the 64-bit unsigned integer `a'
1465 | to the double-precision floating-point format.  The conversion is performed
1466 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1467 *----------------------------------------------------------------------------*/
1468
1469 float64 uint64_to_float64(uint64_t a, float_status *status)
1470 {
1471     int exp = 0x43C;
1472     int shiftcount;
1473
1474     if (a == 0) {
1475         return float64_zero;
1476     }
1477
1478     shiftcount = countLeadingZeros64(a) - 1;
1479     if (shiftcount < 0) {
1480         shift64RightJamming(a, -shiftcount, &a);
1481     } else {
1482         a <<= shiftcount;
1483     }
1484     return roundAndPackFloat64(0, exp - shiftcount, a STATUS_VAR);
1485 }
1486
1487 /*----------------------------------------------------------------------------
1488 | Returns the result of converting the 64-bit unsigned integer `a'
1489 | to the quadruple-precision floating-point format.  The conversion is performed
1490 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1491 *----------------------------------------------------------------------------*/
1492
1493 float128 uint64_to_float128(uint64_t a, float_status *status)
1494 {
1495     if (a == 0) {
1496         return float128_zero;
1497     }
1498     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1499 }
1500
1501 /*----------------------------------------------------------------------------
1502 | Returns the result of converting the single-precision floating-point value
1503 | `a' to the 32-bit two's complement integer format.  The conversion is
1504 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1505 | Arithmetic---which means in particular that the conversion is rounded
1506 | according to the current rounding mode.  If `a' is a NaN, the largest
1507 | positive integer is returned.  Otherwise, if the conversion overflows, the
1508 | largest integer with the same sign as `a' is returned.
1509 *----------------------------------------------------------------------------*/
1510
1511 int32 float32_to_int32(float32 a, float_status *status)
1512 {
1513     flag aSign;
1514     int_fast16_t aExp, shiftCount;
1515     uint32_t aSig;
1516     uint64_t aSig64;
1517
1518     a = float32_squash_input_denormal(a STATUS_VAR);
1519     aSig = extractFloat32Frac( a );
1520     aExp = extractFloat32Exp( a );
1521     aSign = extractFloat32Sign( a );
1522     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1523     if ( aExp ) aSig |= 0x00800000;
1524     shiftCount = 0xAF - aExp;
1525     aSig64 = aSig;
1526     aSig64 <<= 32;
1527     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1528     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1529
1530 }
1531
1532 /*----------------------------------------------------------------------------
1533 | Returns the result of converting the single-precision floating-point value
1534 | `a' to the 32-bit two's complement integer format.  The conversion is
1535 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1536 | Arithmetic, except that the conversion is always rounded toward zero.
1537 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1538 | the conversion overflows, the largest integer with the same sign as `a' is
1539 | returned.
1540 *----------------------------------------------------------------------------*/
1541
1542 int32 float32_to_int32_round_to_zero(float32 a, float_status *status)
1543 {
1544     flag aSign;
1545     int_fast16_t aExp, shiftCount;
1546     uint32_t aSig;
1547     int32_t z;
1548     a = float32_squash_input_denormal(a STATUS_VAR);
1549
1550     aSig = extractFloat32Frac( a );
1551     aExp = extractFloat32Exp( a );
1552     aSign = extractFloat32Sign( a );
1553     shiftCount = aExp - 0x9E;
1554     if ( 0 <= shiftCount ) {
1555         if ( float32_val(a) != 0xCF000000 ) {
1556             float_raise( float_flag_invalid STATUS_VAR);
1557             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1558         }
1559         return (int32_t) 0x80000000;
1560     }
1561     else if ( aExp <= 0x7E ) {
1562         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1563         return 0;
1564     }
1565     aSig = ( aSig | 0x00800000 )<<8;
1566     z = aSig>>( - shiftCount );
1567     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1568         STATUS(float_exception_flags) |= float_flag_inexact;
1569     }
1570     if ( aSign ) z = - z;
1571     return z;
1572
1573 }
1574
1575 /*----------------------------------------------------------------------------
1576 | Returns the result of converting the single-precision floating-point value
1577 | `a' to the 16-bit two's complement integer format.  The conversion is
1578 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1579 | Arithmetic, except that the conversion is always rounded toward zero.
1580 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1581 | the conversion overflows, the largest integer with the same sign as `a' is
1582 | returned.
1583 *----------------------------------------------------------------------------*/
1584
1585 int_fast16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1586 {
1587     flag aSign;
1588     int_fast16_t aExp, shiftCount;
1589     uint32_t aSig;
1590     int32 z;
1591
1592     aSig = extractFloat32Frac( a );
1593     aExp = extractFloat32Exp( a );
1594     aSign = extractFloat32Sign( a );
1595     shiftCount = aExp - 0x8E;
1596     if ( 0 <= shiftCount ) {
1597         if ( float32_val(a) != 0xC7000000 ) {
1598             float_raise( float_flag_invalid STATUS_VAR);
1599             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1600                 return 0x7FFF;
1601             }
1602         }
1603         return (int32_t) 0xffff8000;
1604     }
1605     else if ( aExp <= 0x7E ) {
1606         if ( aExp | aSig ) {
1607             STATUS(float_exception_flags) |= float_flag_inexact;
1608         }
1609         return 0;
1610     }
1611     shiftCount -= 0x10;
1612     aSig = ( aSig | 0x00800000 )<<8;
1613     z = aSig>>( - shiftCount );
1614     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1615         STATUS(float_exception_flags) |= float_flag_inexact;
1616     }
1617     if ( aSign ) {
1618         z = - z;
1619     }
1620     return z;
1621
1622 }
1623
1624 /*----------------------------------------------------------------------------
1625 | Returns the result of converting the single-precision floating-point value
1626 | `a' to the 64-bit two's complement integer format.  The conversion is
1627 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1628 | Arithmetic---which means in particular that the conversion is rounded
1629 | according to the current rounding mode.  If `a' is a NaN, the largest
1630 | positive integer is returned.  Otherwise, if the conversion overflows, the
1631 | largest integer with the same sign as `a' is returned.
1632 *----------------------------------------------------------------------------*/
1633
1634 int64 float32_to_int64(float32 a, float_status *status)
1635 {
1636     flag aSign;
1637     int_fast16_t aExp, shiftCount;
1638     uint32_t aSig;
1639     uint64_t aSig64, aSigExtra;
1640     a = float32_squash_input_denormal(a STATUS_VAR);
1641
1642     aSig = extractFloat32Frac( a );
1643     aExp = extractFloat32Exp( a );
1644     aSign = extractFloat32Sign( a );
1645     shiftCount = 0xBE - aExp;
1646     if ( shiftCount < 0 ) {
1647         float_raise( float_flag_invalid STATUS_VAR);
1648         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1649             return LIT64( 0x7FFFFFFFFFFFFFFF );
1650         }
1651         return (int64_t) LIT64( 0x8000000000000000 );
1652     }
1653     if ( aExp ) aSig |= 0x00800000;
1654     aSig64 = aSig;
1655     aSig64 <<= 40;
1656     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1657     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1658
1659 }
1660
1661 /*----------------------------------------------------------------------------
1662 | Returns the result of converting the single-precision floating-point value
1663 | `a' to the 64-bit unsigned integer format.  The conversion is
1664 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1665 | Arithmetic---which means in particular that the conversion is rounded
1666 | according to the current rounding mode.  If `a' is a NaN, the largest
1667 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1668 | largest unsigned integer is returned.  If the 'a' is negative, the result
1669 | is rounded and zero is returned; values that do not round to zero will
1670 | raise the inexact exception flag.
1671 *----------------------------------------------------------------------------*/
1672
1673 uint64 float32_to_uint64(float32 a, float_status *status)
1674 {
1675     flag aSign;
1676     int_fast16_t aExp, shiftCount;
1677     uint32_t aSig;
1678     uint64_t aSig64, aSigExtra;
1679     a = float32_squash_input_denormal(a STATUS_VAR);
1680
1681     aSig = extractFloat32Frac(a);
1682     aExp = extractFloat32Exp(a);
1683     aSign = extractFloat32Sign(a);
1684     if ((aSign) && (aExp > 126)) {
1685         float_raise(float_flag_invalid STATUS_VAR);
1686         if (float32_is_any_nan(a)) {
1687             return LIT64(0xFFFFFFFFFFFFFFFF);
1688         } else {
1689             return 0;
1690         }
1691     }
1692     shiftCount = 0xBE - aExp;
1693     if (aExp) {
1694         aSig |= 0x00800000;
1695     }
1696     if (shiftCount < 0) {
1697         float_raise(float_flag_invalid STATUS_VAR);
1698         return LIT64(0xFFFFFFFFFFFFFFFF);
1699     }
1700
1701     aSig64 = aSig;
1702     aSig64 <<= 40;
1703     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1704     return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1705 }
1706
1707 /*----------------------------------------------------------------------------
1708 | Returns the result of converting the single-precision floating-point value
1709 | `a' to the 64-bit unsigned integer format.  The conversion is
1710 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1711 | Arithmetic, except that the conversion is always rounded toward zero.  If
1712 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1713 | conversion overflows, the largest unsigned integer is returned.  If the
1714 | 'a' is negative, the result is rounded and zero is returned; values that do
1715 | not round to zero will raise the inexact flag.
1716 *----------------------------------------------------------------------------*/
1717
1718 uint64 float32_to_uint64_round_to_zero(float32 a, float_status *status)
1719 {
1720     signed char current_rounding_mode = STATUS(float_rounding_mode);
1721     set_float_rounding_mode(float_round_to_zero STATUS_VAR);
1722     int64_t v = float32_to_uint64(a STATUS_VAR);
1723     set_float_rounding_mode(current_rounding_mode STATUS_VAR);
1724     return v;
1725 }
1726
1727 /*----------------------------------------------------------------------------
1728 | Returns the result of converting the single-precision floating-point value
1729 | `a' to the 64-bit two's complement integer format.  The conversion is
1730 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1731 | Arithmetic, except that the conversion is always rounded toward zero.  If
1732 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1733 | conversion overflows, the largest integer with the same sign as `a' is
1734 | returned.
1735 *----------------------------------------------------------------------------*/
1736
1737 int64 float32_to_int64_round_to_zero(float32 a, float_status *status)
1738 {
1739     flag aSign;
1740     int_fast16_t aExp, shiftCount;
1741     uint32_t aSig;
1742     uint64_t aSig64;
1743     int64 z;
1744     a = float32_squash_input_denormal(a STATUS_VAR);
1745
1746     aSig = extractFloat32Frac( a );
1747     aExp = extractFloat32Exp( a );
1748     aSign = extractFloat32Sign( a );
1749     shiftCount = aExp - 0xBE;
1750     if ( 0 <= shiftCount ) {
1751         if ( float32_val(a) != 0xDF000000 ) {
1752             float_raise( float_flag_invalid STATUS_VAR);
1753             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1754                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1755             }
1756         }
1757         return (int64_t) LIT64( 0x8000000000000000 );
1758     }
1759     else if ( aExp <= 0x7E ) {
1760         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1761         return 0;
1762     }
1763     aSig64 = aSig | 0x00800000;
1764     aSig64 <<= 40;
1765     z = aSig64>>( - shiftCount );
1766     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1767         STATUS(float_exception_flags) |= float_flag_inexact;
1768     }
1769     if ( aSign ) z = - z;
1770     return z;
1771
1772 }
1773
1774 /*----------------------------------------------------------------------------
1775 | Returns the result of converting the single-precision floating-point value
1776 | `a' to the double-precision floating-point format.  The conversion is
1777 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1778 | Arithmetic.
1779 *----------------------------------------------------------------------------*/
1780
1781 float64 float32_to_float64(float32 a, float_status *status)
1782 {
1783     flag aSign;
1784     int_fast16_t aExp;
1785     uint32_t aSig;
1786     a = float32_squash_input_denormal(a STATUS_VAR);
1787
1788     aSig = extractFloat32Frac( a );
1789     aExp = extractFloat32Exp( a );
1790     aSign = extractFloat32Sign( a );
1791     if ( aExp == 0xFF ) {
1792         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1793         return packFloat64( aSign, 0x7FF, 0 );
1794     }
1795     if ( aExp == 0 ) {
1796         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1797         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1798         --aExp;
1799     }
1800     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1801
1802 }
1803
1804 /*----------------------------------------------------------------------------
1805 | Returns the result of converting the single-precision floating-point value
1806 | `a' to the extended double-precision floating-point format.  The conversion
1807 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1808 | Arithmetic.
1809 *----------------------------------------------------------------------------*/
1810
1811 floatx80 float32_to_floatx80(float32 a, float_status *status)
1812 {
1813     flag aSign;
1814     int_fast16_t aExp;
1815     uint32_t aSig;
1816
1817     a = float32_squash_input_denormal(a STATUS_VAR);
1818     aSig = extractFloat32Frac( a );
1819     aExp = extractFloat32Exp( a );
1820     aSign = extractFloat32Sign( a );
1821     if ( aExp == 0xFF ) {
1822         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1823         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1824     }
1825     if ( aExp == 0 ) {
1826         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1827         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1828     }
1829     aSig |= 0x00800000;
1830     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1831
1832 }
1833
1834 /*----------------------------------------------------------------------------
1835 | Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
1837 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1838 | Arithmetic.
1839 *----------------------------------------------------------------------------*/
1840
1841 float128 float32_to_float128(float32 a, float_status *status)
1842 {
1843     flag aSign;
1844     int_fast16_t aExp;
1845     uint32_t aSig;
1846
1847     a = float32_squash_input_denormal(a STATUS_VAR);
1848     aSig = extractFloat32Frac( a );
1849     aExp = extractFloat32Exp( a );
1850     aSign = extractFloat32Sign( a );
1851     if ( aExp == 0xFF ) {
1852         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1853         return packFloat128( aSign, 0x7FFF, 0, 0 );
1854     }
1855     if ( aExp == 0 ) {
1856         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1857         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1858         --aExp;
1859     }
1860     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1861
1862 }
1863
1864 /*----------------------------------------------------------------------------
1865 | Rounds the single-precision floating-point value `a' to an integer, and
1866 | returns the result as a single-precision floating-point value.  The
1867 | operation is performed according to the IEC/IEEE Standard for Binary
1868 | Floating-Point Arithmetic.
1869 *----------------------------------------------------------------------------*/
1870
1871 float32 float32_round_to_int(float32 a, float_status *status)
1872 {
1873     flag aSign;
1874     int_fast16_t aExp;
1875     uint32_t lastBitMask, roundBitsMask;
1876     uint32_t z;
1877     a = float32_squash_input_denormal(a STATUS_VAR);
1878
1879     aExp = extractFloat32Exp( a );
1880     if ( 0x96 <= aExp ) {
1881         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1882             return propagateFloat32NaN( a, a STATUS_VAR );
1883         }
1884         return a;
1885     }
1886     if ( aExp <= 0x7E ) {
1887         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1888         STATUS(float_exception_flags) |= float_flag_inexact;
1889         aSign = extractFloat32Sign( a );
1890         switch ( STATUS(float_rounding_mode) ) {
1891          case float_round_nearest_even:
1892             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1893                 return packFloat32( aSign, 0x7F, 0 );
1894             }
1895             break;
1896         case float_round_ties_away:
1897             if (aExp == 0x7E) {
1898                 return packFloat32(aSign, 0x7F, 0);
1899             }
1900             break;
1901          case float_round_down:
1902             return make_float32(aSign ? 0xBF800000 : 0);
1903          case float_round_up:
1904             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1905         }
1906         return packFloat32( aSign, 0, 0 );
1907     }
1908     lastBitMask = 1;
1909     lastBitMask <<= 0x96 - aExp;
1910     roundBitsMask = lastBitMask - 1;
1911     z = float32_val(a);
1912     switch (STATUS(float_rounding_mode)) {
1913     case float_round_nearest_even:
1914         z += lastBitMask>>1;
1915         if ((z & roundBitsMask) == 0) {
1916             z &= ~lastBitMask;
1917         }
1918         break;
1919     case float_round_ties_away:
1920         z += lastBitMask >> 1;
1921         break;
1922     case float_round_to_zero:
1923         break;
1924     case float_round_up:
1925         if (!extractFloat32Sign(make_float32(z))) {
1926             z += roundBitsMask;
1927         }
1928         break;
1929     case float_round_down:
1930         if (extractFloat32Sign(make_float32(z))) {
1931             z += roundBitsMask;
1932         }
1933         break;
1934     default:
1935         abort();
1936     }
1937     z &= ~ roundBitsMask;
1938     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1939     return make_float32(z);
1940
1941 }
1942
1943 /*----------------------------------------------------------------------------
1944 | Returns the result of adding the absolute values of the single-precision
1945 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1946 | before being returned.  `zSign' is ignored if the result is a NaN.
1947 | The addition is performed according to the IEC/IEEE Standard for Binary
1948 | Floating-Point Arithmetic.
1949 *----------------------------------------------------------------------------*/
1950
1951 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
1952                               float_status *status)
1953 {
1954     int_fast16_t aExp, bExp, zExp;
1955     uint32_t aSig, bSig, zSig;
1956     int_fast16_t expDiff;
1957
1958     aSig = extractFloat32Frac( a );
1959     aExp = extractFloat32Exp( a );
1960     bSig = extractFloat32Frac( b );
1961     bExp = extractFloat32Exp( b );
1962     expDiff = aExp - bExp;
1963     aSig <<= 6;
1964     bSig <<= 6;
1965     if ( 0 < expDiff ) {
1966         if ( aExp == 0xFF ) {
1967             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1968             return a;
1969         }
1970         if ( bExp == 0 ) {
1971             --expDiff;
1972         }
1973         else {
1974             bSig |= 0x20000000;
1975         }
1976         shift32RightJamming( bSig, expDiff, &bSig );
1977         zExp = aExp;
1978     }
1979     else if ( expDiff < 0 ) {
1980         if ( bExp == 0xFF ) {
1981             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1982             return packFloat32( zSign, 0xFF, 0 );
1983         }
1984         if ( aExp == 0 ) {
1985             ++expDiff;
1986         }
1987         else {
1988             aSig |= 0x20000000;
1989         }
1990         shift32RightJamming( aSig, - expDiff, &aSig );
1991         zExp = bExp;
1992     }
1993     else {
1994         if ( aExp == 0xFF ) {
1995             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1996             return a;
1997         }
1998         if ( aExp == 0 ) {
1999             if (STATUS(flush_to_zero)) {
2000                 if (aSig | bSig) {
2001                     float_raise(float_flag_output_denormal STATUS_VAR);
2002                 }
2003                 return packFloat32(zSign, 0, 0);
2004             }
2005             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2006         }
2007         zSig = 0x40000000 + aSig + bSig;
2008         zExp = aExp;
2009         goto roundAndPack;
2010     }
2011     aSig |= 0x20000000;
2012     zSig = ( aSig + bSig )<<1;
2013     --zExp;
2014     if ( (int32_t) zSig < 0 ) {
2015         zSig = aSig + bSig;
2016         ++zExp;
2017     }
2018  roundAndPack:
2019     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2020
2021 }
2022
2023 /*----------------------------------------------------------------------------
2024 | Returns the result of subtracting the absolute values of the single-
2025 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2026 | difference is negated before being returned.  `zSign' is ignored if the
2027 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2028 | Standard for Binary Floating-Point Arithmetic.
2029 *----------------------------------------------------------------------------*/
2030
2031 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2032                               float_status *status)
2033 {
2034     int_fast16_t aExp, bExp, zExp;
2035     uint32_t aSig, bSig, zSig;
2036     int_fast16_t expDiff;
2037
2038     aSig = extractFloat32Frac( a );
2039     aExp = extractFloat32Exp( a );
2040     bSig = extractFloat32Frac( b );
2041     bExp = extractFloat32Exp( b );
2042     expDiff = aExp - bExp;
2043     aSig <<= 7;
2044     bSig <<= 7;
2045     if ( 0 < expDiff ) goto aExpBigger;
2046     if ( expDiff < 0 ) goto bExpBigger;
2047     if ( aExp == 0xFF ) {
2048         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2049         float_raise( float_flag_invalid STATUS_VAR);
2050         return float32_default_nan;
2051     }
2052     if ( aExp == 0 ) {
2053         aExp = 1;
2054         bExp = 1;
2055     }
2056     if ( bSig < aSig ) goto aBigger;
2057     if ( aSig < bSig ) goto bBigger;
2058     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2059  bExpBigger:
2060     if ( bExp == 0xFF ) {
2061         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2062         return packFloat32( zSign ^ 1, 0xFF, 0 );
2063     }
2064     if ( aExp == 0 ) {
2065         ++expDiff;
2066     }
2067     else {
2068         aSig |= 0x40000000;
2069     }
2070     shift32RightJamming( aSig, - expDiff, &aSig );
2071     bSig |= 0x40000000;
2072  bBigger:
2073     zSig = bSig - aSig;
2074     zExp = bExp;
2075     zSign ^= 1;
2076     goto normalizeRoundAndPack;
2077  aExpBigger:
2078     if ( aExp == 0xFF ) {
2079         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2080         return a;
2081     }
2082     if ( bExp == 0 ) {
2083         --expDiff;
2084     }
2085     else {
2086         bSig |= 0x40000000;
2087     }
2088     shift32RightJamming( bSig, expDiff, &bSig );
2089     aSig |= 0x40000000;
2090  aBigger:
2091     zSig = aSig - bSig;
2092     zExp = aExp;
2093  normalizeRoundAndPack:
2094     --zExp;
2095     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2096
2097 }
2098
2099 /*----------------------------------------------------------------------------
2100 | Returns the result of adding the single-precision floating-point values `a'
2101 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2102 | Binary Floating-Point Arithmetic.
2103 *----------------------------------------------------------------------------*/
2104
2105 float32 float32_add(float32 a, float32 b, float_status *status)
2106 {
2107     flag aSign, bSign;
2108     a = float32_squash_input_denormal(a STATUS_VAR);
2109     b = float32_squash_input_denormal(b STATUS_VAR);
2110
2111     aSign = extractFloat32Sign( a );
2112     bSign = extractFloat32Sign( b );
2113     if ( aSign == bSign ) {
2114         return addFloat32Sigs( a, b, aSign STATUS_VAR);
2115     }
2116     else {
2117         return subFloat32Sigs( a, b, aSign STATUS_VAR );
2118     }
2119
2120 }
2121
2122 /*----------------------------------------------------------------------------
2123 | Returns the result of subtracting the single-precision floating-point values
2124 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2125 | for Binary Floating-Point Arithmetic.
2126 *----------------------------------------------------------------------------*/
2127
2128 float32 float32_sub(float32 a, float32 b, float_status *status)
2129 {
2130     flag aSign, bSign;
2131     a = float32_squash_input_denormal(a STATUS_VAR);
2132     b = float32_squash_input_denormal(b STATUS_VAR);
2133
2134     aSign = extractFloat32Sign( a );
2135     bSign = extractFloat32Sign( b );
2136     if ( aSign == bSign ) {
2137         return subFloat32Sigs( a, b, aSign STATUS_VAR );
2138     }
2139     else {
2140         return addFloat32Sigs( a, b, aSign STATUS_VAR );
2141     }
2142
2143 }
2144
2145 /*----------------------------------------------------------------------------
2146 | Returns the result of multiplying the single-precision floating-point values
2147 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2148 | for Binary Floating-Point Arithmetic.
2149 *----------------------------------------------------------------------------*/
2150
2151 float32 float32_mul(float32 a, float32 b, float_status *status)
2152 {
2153     flag aSign, bSign, zSign;
2154     int_fast16_t aExp, bExp, zExp;
2155     uint32_t aSig, bSig;
2156     uint64_t zSig64;
2157     uint32_t zSig;
2158
2159     a = float32_squash_input_denormal(a STATUS_VAR);
2160     b = float32_squash_input_denormal(b STATUS_VAR);
2161
2162     aSig = extractFloat32Frac( a );
2163     aExp = extractFloat32Exp( a );
2164     aSign = extractFloat32Sign( a );
2165     bSig = extractFloat32Frac( b );
2166     bExp = extractFloat32Exp( b );
2167     bSign = extractFloat32Sign( b );
2168     zSign = aSign ^ bSign;
2169     if ( aExp == 0xFF ) {
2170         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2171             return propagateFloat32NaN( a, b STATUS_VAR );
2172         }
2173         if ( ( bExp | bSig ) == 0 ) {
2174             float_raise( float_flag_invalid STATUS_VAR);
2175             return float32_default_nan;
2176         }
2177         return packFloat32( zSign, 0xFF, 0 );
2178     }
2179     if ( bExp == 0xFF ) {
2180         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2181         if ( ( aExp | aSig ) == 0 ) {
2182             float_raise( float_flag_invalid STATUS_VAR);
2183             return float32_default_nan;
2184         }
2185         return packFloat32( zSign, 0xFF, 0 );
2186     }
2187     if ( aExp == 0 ) {
2188         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2189         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2190     }
2191     if ( bExp == 0 ) {
2192         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2193         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2194     }
2195     zExp = aExp + bExp - 0x7F;
2196     aSig = ( aSig | 0x00800000 )<<7;
2197     bSig = ( bSig | 0x00800000 )<<8;
2198     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2199     zSig = zSig64;
2200     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2201         zSig <<= 1;
2202         --zExp;
2203     }
2204     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2205
2206 }
2207
2208 /*----------------------------------------------------------------------------
2209 | Returns the result of dividing the single-precision floating-point value `a'
2210 | by the corresponding value `b'.  The operation is performed according to the
2211 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2212 *----------------------------------------------------------------------------*/
2213
2214 float32 float32_div(float32 a, float32 b, float_status *status)
2215 {
2216     flag aSign, bSign, zSign;
2217     int_fast16_t aExp, bExp, zExp;
2218     uint32_t aSig, bSig, zSig;
2219     a = float32_squash_input_denormal(a STATUS_VAR);
2220     b = float32_squash_input_denormal(b STATUS_VAR);
2221
2222     aSig = extractFloat32Frac( a );
2223     aExp = extractFloat32Exp( a );
2224     aSign = extractFloat32Sign( a );
2225     bSig = extractFloat32Frac( b );
2226     bExp = extractFloat32Exp( b );
2227     bSign = extractFloat32Sign( b );
2228     zSign = aSign ^ bSign;
2229     if ( aExp == 0xFF ) {
2230         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2231         if ( bExp == 0xFF ) {
2232             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2233             float_raise( float_flag_invalid STATUS_VAR);
2234             return float32_default_nan;
2235         }
2236         return packFloat32( zSign, 0xFF, 0 );
2237     }
2238     if ( bExp == 0xFF ) {
2239         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2240         return packFloat32( zSign, 0, 0 );
2241     }
2242     if ( bExp == 0 ) {
2243         if ( bSig == 0 ) {
2244             if ( ( aExp | aSig ) == 0 ) {
2245                 float_raise( float_flag_invalid STATUS_VAR);
2246                 return float32_default_nan;
2247             }
2248             float_raise( float_flag_divbyzero STATUS_VAR);
2249             return packFloat32( zSign, 0xFF, 0 );
2250         }
2251         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2252     }
2253     if ( aExp == 0 ) {
2254         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2255         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2256     }
2257     zExp = aExp - bExp + 0x7D;
2258     aSig = ( aSig | 0x00800000 )<<7;
2259     bSig = ( bSig | 0x00800000 )<<8;
2260     if ( bSig <= ( aSig + aSig ) ) {
2261         aSig >>= 1;
2262         ++zExp;
2263     }
2264     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2265     if ( ( zSig & 0x3F ) == 0 ) {
2266         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2267     }
2268     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2269
2270 }
2271
2272 /*----------------------------------------------------------------------------
2273 | Returns the remainder of the single-precision floating-point value `a'
2274 | with respect to the corresponding value `b'.  The operation is performed
2275 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
     |
     | Special cases, in the order they are tested below:
     |   - `a' is a NaN or infinite: a NaN in either operand is propagated,
     |     otherwise the invalid exception is raised and the default NaN returned.
     |   - `b' is a NaN: propagated.  `b' is infinite: `a' is returned.
     |   - `b' is zero: invalid exception, default NaN.
     |   - `a' is zero: `a' is returned.
     | Both inputs first pass through float32_squash_input_denormal().
2276 *----------------------------------------------------------------------------*/
2277
2278 float32 float32_rem(float32 a, float32 b, float_status *status)
2279 {
2280     flag aSign, zSign;
2281     int_fast16_t aExp, bExp, expDiff;
2282     uint32_t aSig, bSig;
2283     uint32_t q;
2284     uint64_t aSig64, bSig64, q64;
2285     uint32_t alternateASig;
2286     int32_t sigMean;
2287     a = float32_squash_input_denormal(a STATUS_VAR);
2288     b = float32_squash_input_denormal(b STATUS_VAR);
2289
2290     aSig = extractFloat32Frac( a );
2291     aExp = extractFloat32Exp( a );
2292     aSign = extractFloat32Sign( a );
2293     bSig = extractFloat32Frac( b );
2294     bExp = extractFloat32Exp( b );
         /* The sign of `b' is never extracted: it does not affect the result. */
2295     if ( aExp == 0xFF ) {
2296         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2297             return propagateFloat32NaN( a, b STATUS_VAR );
2298         }
2299         float_raise( float_flag_invalid STATUS_VAR);
2300         return float32_default_nan;
2301     }
2302     if ( bExp == 0xFF ) {
2303         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2304         return a;
2305     }
2306     if ( bExp == 0 ) {
2307         if ( bSig == 0 ) {
2308             float_raise( float_flag_invalid STATUS_VAR);
2309             return float32_default_nan;
2310         }
2311         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2312     }
2313     if ( aExp == 0 ) {
2314         if ( aSig == 0 ) return a;
2315         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2316     }
2317     expDiff = aExp - bExp;
         /* Make the implicit leading one explicit in both significands. */
2318     aSig |= 0x00800000;
2319     bSig |= 0x00800000;
2320     if ( expDiff < 32 ) {
             /* Small exponent gap: one 64-by-32-bit division is enough. */
2321         aSig <<= 8;
2322         bSig <<= 8;
2323         if ( expDiff < 0 ) {
                 /* Exponent of `a' more than one below that of `b': the
                  * result is `a' itself.
                  */
2324             if ( expDiff < -1 ) return a;
2325             aSig >>= 1;
2326         }
2327         q = ( bSig <= aSig );
2328         if ( q ) aSig -= bSig;
2329         if ( 0 < expDiff ) {
2330             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2331             q >>= 32 - expDiff;
2332             bSig >>= 2;
2333             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2334         }
2335         else {
2336             aSig >>= 2;
2337             bSig >>= 2;
2338         }
2339     }
2340     else {
             /* Large exponent gap: reduce in steps of 62 exponent bits using
              * 64-bit quotient estimates.  Each estimate is lowered by 2
              * (clamped at 0) -- presumably so that it never exceeds the true
              * quotient; confirm against estimateDiv128To64().
              */
2341         if ( bSig <= aSig ) aSig -= bSig;
2342         aSig64 = ( (uint64_t) aSig )<<40;
2343         bSig64 = ( (uint64_t) bSig )<<40;
2344         expDiff -= 64;
2345         while ( 0 < expDiff ) {
2346             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2347             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2348             aSig64 = - ( ( bSig * q64 )<<38 );
2349             expDiff -= 62;
2350         }
2351         expDiff += 64;
2352         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2353         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2354         q = q64>>( 64 - expDiff );
2355         bSig <<= 6;
2356         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2357     }
         /* Keep subtracting bSig until the partial remainder turns negative.
          * alternateASig remembers the last non-negative value, so on exit
          * aSig and alternateASig are the two candidate remainders.
          */
2358     do {
2359         alternateASig = aSig;
2360         ++q;
2361         aSig -= bSig;
2362     } while ( 0 <= (int32_t) aSig );
         /* Select the candidate of smaller magnitude; on an exact tie, take
          * alternateASig when q is odd, i.e. the one paired with an even
          * quotient.
          */
2363     sigMean = aSig + alternateASig;
2364     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2365         aSig = alternateASig;
2366     }
         /* A negative remainder is stored as its magnitude and flips the
          * result sign relative to the sign of `a'.
          */
2367     zSign = ( (int32_t) aSig < 0 );
2368     if ( zSign ) aSig = - aSig;
2369     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2370
2371 }
2372
2373 /*----------------------------------------------------------------------------
2374 | Returns the result of multiplying the single-precision floating-point values
2375 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2376 | multiplication.  The operation is performed according to the IEC/IEEE
2377 | Standard for Binary Floating-Point Arithmetic 754-2008.
2378 | The flags argument allows the caller to select negation of the
2379 | addend, the intermediate product, or the final result. (The difference
2380 | between this and having the caller do a separate negation is that negating
2381 | externally will flip the sign bit on NaNs.)
     |
     | Flag bits tested in the body: float_muladd_negate_c,
     | float_muladd_negate_product, float_muladd_negate_result and
     | float_muladd_halve_result.  The last one lowers the exponent of a finite
     | non-zero result by one before it is rounded and packed.
2382 *----------------------------------------------------------------------------*/
2383
2384 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2385                        float_status *status)
2386 {
2387     flag aSign, bSign, cSign, zSign;
2388     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2389     uint32_t aSig, bSig, cSig;
2390     flag pInf, pZero, pSign;
2391     uint64_t pSig64, cSig64, zSig64;
2392     uint32_t pSig;
2393     int shiftcount;
2394     flag signflip, infzero;
2395
2396     a = float32_squash_input_denormal(a STATUS_VAR);
2397     b = float32_squash_input_denormal(b STATUS_VAR);
2398     c = float32_squash_input_denormal(c STATUS_VAR);
2399     aSig = extractFloat32Frac(a);
2400     aExp = extractFloat32Exp(a);
2401     aSign = extractFloat32Sign(a);
2402     bSig = extractFloat32Frac(b);
2403     bExp = extractFloat32Exp(b);
2404     bSign = extractFloat32Sign(b);
2405     cSig = extractFloat32Frac(c);
2406     cExp = extractFloat32Exp(c);
2407     cSign = extractFloat32Sign(c);
2408
         /* infzero: one multiplicand is a zero and the other an infinity */
2409     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2410                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2411
2412     /* It is implementation-defined whether the cases of (0,inf,qnan)
2413      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2414      * they return if they do), so we have to hand this information
2415      * off to the target-specific pick-a-NaN routine.
2416      */
2417     if (((aExp == 0xff) && aSig) ||
2418         ((bExp == 0xff) && bSig) ||
2419         ((cExp == 0xff) && cSig)) {
2420         return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2421     }
2422
         /* No NaN operand from here on: inf * 0 is an invalid operation */
2423     if (infzero) {
2424         float_raise(float_flag_invalid STATUS_VAR);
2425         return float32_default_nan;
2426     }
2427
2428     if (flags & float_muladd_negate_c) {
2429         cSign ^= 1;
2430     }
2431
         /* signflip is XORed into the sign of every non-NaN result below */
2432     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2433
2434     /* Work out the sign and type of the product */
2435     pSign = aSign ^ bSign;
2436     if (flags & float_muladd_negate_product) {
2437         pSign ^= 1;
2438     }
2439     pInf = (aExp == 0xff) || (bExp == 0xff);
2440     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2441
2442     if (cExp == 0xff) {
2443         if (pInf && (pSign ^ cSign)) {
2444             /* addition of opposite-signed infinities => InvalidOperation */
2445             float_raise(float_flag_invalid STATUS_VAR);
2446             return float32_default_nan;
2447         }
2448         /* Otherwise generate an infinity of the same sign */
2449         return packFloat32(cSign ^ signflip, 0xff, 0);
2450     }
2451
         /* c is finite here, so an infinite product decides the result */
2452     if (pInf) {
2453         return packFloat32(pSign ^ signflip, 0xff, 0);
2454     }
2455
2456     if (pZero) {
2457         if (cExp == 0) {
2458             if (cSig == 0) {
2459                 /* Adding two exact zeroes */
2460                 if (pSign == cSign) {
2461                     zSign = pSign;
2462                 } else if (STATUS(float_rounding_mode) == float_round_down) {
2463                     zSign = 1;
2464                 } else {
2465                     zSign = 0;
2466                 }
2467                 return packFloat32(zSign ^ signflip, 0, 0);
2468             }
2469             /* Exact zero plus a denorm */
2470             if (STATUS(flush_to_zero)) {
2471                 float_raise(float_flag_output_denormal STATUS_VAR);
2472                 return packFloat32(cSign ^ signflip, 0, 0);
2473             }
2474         }
2475         /* Zero plus something non-zero : just return the something */
2476         if (flags & float_muladd_halve_result) {
2477             if (cExp == 0) {
2478                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2479             }
2480             /* Subtract one to halve, and one again because roundAndPackFloat32
2481              * wants one less than the true exponent.
2482              */
2483             cExp -= 2;
2484             cSig = (cSig | 0x00800000) << 7;
2485             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig STATUS_VAR);
2486         }
2487         return packFloat32(cSign ^ signflip, cExp, cSig);
2488     }
2489
         /* Both multiplicands are finite and non-zero from here on */
2490     if (aExp == 0) {
2491         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2492     }
2493     if (bExp == 0) {
2494         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2495     }
2496
2497     /* Calculate the actual result a * b + c */
2498
2499     /* Multiply first; this is easy. */
2500     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2501      * because we want the true exponent, not the "one-less-than"
2502      * flavour that roundAndPackFloat32() takes.
2503      */
2504     pExp = aExp + bExp - 0x7e;
2505     aSig = (aSig | 0x00800000) << 7;
2506     bSig = (bSig | 0x00800000) << 8;
2507     pSig64 = (uint64_t)aSig * bSig;
         /* If bit 62 of the product is clear, shift left by one and adjust
          * the exponent so that the leading one always sits in bit 62.
          */
2508     if ((int64_t)(pSig64 << 1) >= 0) {
2509         pSig64 <<= 1;
2510         pExp--;
2511     }
2512
2513     zSign = pSign ^ signflip;
2514
2515     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2516      * position 62.
2517      */
2518     if (cExp == 0) {
2519         if (!cSig) {
2520             /* Throw out the special case of c being an exact zero now */
2521             shift64RightJamming(pSig64, 32, &pSig64);
2522             pSig = pSig64;
2523             if (flags & float_muladd_halve_result) {
2524                 pExp--;
2525             }
2526             return roundAndPackFloat32(zSign, pExp - 1,
2527                                        pSig STATUS_VAR);
2528         }
2529         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2530     }
2531
         /* Widen c to the same layout as the product: explicit bit in 62 */
2532     cSig64 = (uint64_t)cSig << (62 - 23);
2533     cSig64 |= LIT64(0x4000000000000000);
2534     expDiff = pExp - cExp;
2535
2536     if (pSign == cSign) {
2537         /* Addition */
2538         if (expDiff > 0) {
2539             /* scale c to match p */
2540             shift64RightJamming(cSig64, expDiff, &cSig64);
2541             zExp = pExp;
2542         } else if (expDiff < 0) {
2543             /* scale p to match c */
2544             shift64RightJamming(pSig64, -expDiff, &pSig64);
2545             zExp = cExp;
2546         } else {
2547             /* no scaling needed */
2548             zExp = cExp;
2549         }
2550         /* Add significands and make sure explicit bit ends up in posn 62 */
2551         zSig64 = pSig64 + cSig64;
2552         if ((int64_t)zSig64 < 0) {
2553             shift64RightJamming(zSig64, 1, &zSig64);
2554         } else {
2555             zExp--;
2556         }
2557     } else {
2558         /* Subtraction */
2559         if (expDiff > 0) {
2560             shift64RightJamming(cSig64, expDiff, &cSig64);
2561             zSig64 = pSig64 - cSig64;
2562             zExp = pExp;
2563         } else if (expDiff < 0) {
2564             shift64RightJamming(pSig64, -expDiff, &pSig64);
2565             zSig64 = cSig64 - pSig64;
2566             zExp = cExp;
                 /* c has the larger exponent: the result takes c's sign */
2567             zSign ^= 1;
2568         } else {
2569             zExp = pExp;
2570             if (cSig64 < pSig64) {
2571                 zSig64 = pSig64 - cSig64;
2572             } else if (pSig64 < cSig64) {
2573                 zSig64 = cSig64 - pSig64;
2574                 zSign ^= 1;
2575             } else {
2576                 /* Exact zero */
2577                 zSign = signflip;
2578                 if (STATUS(float_rounding_mode) == float_round_down) {
2579                     zSign ^= 1;
2580                 }
2581                 return packFloat32(zSign, 0, 0);
2582             }
2583         }
2584         --zExp;
2585         /* Normalize to put the explicit bit back into bit 62. */
2586         shiftcount = countLeadingZeros64(zSig64) - 1;
2587         zSig64 <<= shiftcount;
2588         zExp -= shiftcount;
2589     }
2590     if (flags & float_muladd_halve_result) {
2591         zExp--;
2592     }
2593
         /* Narrow the 64-bit significand to the 32 bits that are rounded */
2594     shift64RightJamming(zSig64, 32, &zSig64);
2595     return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2596 }
2597
2598
2599 /*----------------------------------------------------------------------------
2600 | Returns the square root of the single-precision floating-point value `a'.
2601 | The operation is performed according to the IEC/IEEE Standard for Binary
2602 | Floating-Point Arithmetic.
2603 *----------------------------------------------------------------------------*/
2604
2605 float32 float32_sqrt(float32 a, float_status *status)
2606 {
2607     flag aSign;
2608     int_fast16_t aExp, zExp;
2609     uint32_t aSig, zSig;
2610     uint64_t rem, term;
2611     a = float32_squash_input_denormal(a STATUS_VAR);
2612
2613     aSig = extractFloat32Frac( a );
2614     aExp = extractFloat32Exp( a );
2615     aSign = extractFloat32Sign( a );
2616     if ( aExp == 0xFF ) {
2617         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2618         if ( ! aSign ) return a;
2619         float_raise( float_flag_invalid STATUS_VAR);
2620         return float32_default_nan;
2621     }
2622     if ( aSign ) {
2623         if ( ( aExp | aSig ) == 0 ) return a;
2624         float_raise( float_flag_invalid STATUS_VAR);
2625         return float32_default_nan;
2626     }
2627     if ( aExp == 0 ) {
2628         if ( aSig == 0 ) return float32_zero;
2629         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2630     }
2631     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2632     aSig = ( aSig | 0x00800000 )<<8;
2633     zSig = estimateSqrt32( aExp, aSig ) + 2;
2634     if ( ( zSig & 0x7F ) <= 5 ) {
2635         if ( zSig < 2 ) {
2636             zSig = 0x7FFFFFFF;
2637             goto roundAndPack;
2638         }
2639         aSig >>= aExp & 1;
2640         term = ( (uint64_t) zSig ) * zSig;
2641         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2642         while ( (int64_t) rem < 0 ) {
2643             --zSig;
2644             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2645         }
2646         zSig |= ( rem != 0 );
2647     }
2648     shift32RightJamming( zSig, 1, &zSig );
2649  roundAndPack:
2650     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2651
2652 }
2653
2654 /*----------------------------------------------------------------------------
2655 | Returns the binary exponential of the single-precision floating-point value
2656 | `a'. The operation is performed according to the IEC/IEEE Standard for
2657 | Binary Floating-Point Arithmetic.
2658 |
2659 | Uses the following identities:
2660 |
2661 | 1. -------------------------------------------------------------------------
2662 |      x    x*ln(2)
2663 |     2  = e
2664 |
2665 | 2. -------------------------------------------------------------------------
2666 |                      2     3     4     5           n
2667 |      x        x     x     x     x     x           x
2668 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2669 |               1!    2!    3!    4!    5!          n!
2670 *----------------------------------------------------------------------------*/
2671
     /* Coefficients 1/n! of the exponential power series for n = 1..15, given
      * as double-precision bit patterns.  Entry i holds 1/(i+1)!.
      */
2672 static const float64 float32_exp2_coefficients[15] =
2673 {
2674     const_float64( 0x3ff0000000000000ll ), /* 1/1!  */
2675     const_float64( 0x3fe0000000000000ll ), /* 1/2!  */
2676     const_float64( 0x3fc5555555555555ll ), /* 1/3!  */
2677     const_float64( 0x3fa5555555555555ll ), /* 1/4!  */
2678     const_float64( 0x3f81111111111111ll ), /* 1/5!  */
2679     const_float64( 0x3f56c16c16c16c17ll ), /* 1/6!  */
2680     const_float64( 0x3f2a01a01a01a01all ), /* 1/7!  */
2681     const_float64( 0x3efa01a01a01a01all ), /* 1/8!  */
2682     const_float64( 0x3ec71de3a556c734ll ), /* 1/9!  */
2683     const_float64( 0x3e927e4fb7789f5cll ), /* 1/10! */
2684     const_float64( 0x3e5ae64567f544e4ll ), /* 1/11! */
2685     const_float64( 0x3e21eed8eff8d898ll ), /* 1/12! */
2686     const_float64( 0x3de6124613a86d09ll ), /* 1/13! */
2687     const_float64( 0x3da93974a8c07c9dll ), /* 1/14! */
2688     const_float64( 0x3d6ae7f3e733b81fll ), /* 1/15! */
2689 };
2690
2691 float32 float32_exp2(float32 a, float_status *status)
2692 {
2693     flag aSign;
2694     int_fast16_t aExp;
2695     uint32_t aSig;
2696     float64 r, x, xn;
2697     int i;
2698     a = float32_squash_input_denormal(a STATUS_VAR);
2699
2700     aSig = extractFloat32Frac( a );
2701     aExp = extractFloat32Exp( a );
2702     aSign = extractFloat32Sign( a );
2703
2704     if ( aExp == 0xFF) {
2705         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2706         return (aSign) ? float32_zero : a;
2707     }
2708     if (aExp == 0) {
2709         if (aSig == 0) return float32_one;
2710     }
2711
2712     float_raise( float_flag_inexact STATUS_VAR);
2713
2714     /* ******************************* */
2715     /* using float64 for approximation */
2716     /* ******************************* */
2717     x = float32_to_float64(a STATUS_VAR);
2718     x = float64_mul(x, float64_ln2 STATUS_VAR);
2719
2720     xn = x;
2721     r = float64_one;
2722     for (i = 0 ; i < 15 ; i++) {
2723         float64 f;
2724
2725         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2726         r = float64_add(r, f STATUS_VAR);
2727
2728         xn = float64_mul(xn, x STATUS_VAR);
2729     }
2730
2731     return float64_to_float32(r, status);
2732 }
2733
2734 /*----------------------------------------------------------------------------
2735 | Returns the binary log of the single-precision floating-point value `a'.
2736 | The operation is performed according to the IEC/IEEE Standard for Binary
2737 | Floating-Point Arithmetic.
2738 *----------------------------------------------------------------------------*/
2739 float32 float32_log2(float32 a, float_status *status)
2740 {
2741     flag aSign, zSign;
2742     int_fast16_t aExp;
2743     uint32_t aSig, zSig, i;
2744
2745     a = float32_squash_input_denormal(a STATUS_VAR);
2746     aSig = extractFloat32Frac( a );
2747     aExp = extractFloat32Exp( a );
2748     aSign = extractFloat32Sign( a );
2749
2750     if ( aExp == 0 ) {
2751         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2752         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2753     }
2754     if ( aSign ) {
2755         float_raise( float_flag_invalid STATUS_VAR);
2756         return float32_default_nan;
2757     }
2758     if ( aExp == 0xFF ) {
2759         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2760         return a;
2761     }
2762
2763     aExp -= 0x7F;
2764     aSig |= 0x00800000;
2765     zSign = aExp < 0;
2766     zSig = aExp << 23;
2767
2768     for (i = 1 << 22; i > 0; i >>= 1) {
2769         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2770         if ( aSig & 0x01000000 ) {
2771             aSig >>= 1;
2772             zSig |= i;
2773         }
2774     }
2775
2776     if ( zSign )
2777         zSig = -zSig;
2778
2779     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2780 }
2781
2782 /*----------------------------------------------------------------------------
2783 | Returns 1 if the single-precision floating-point value `a' is equal to
2784 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2785 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2786 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2787 *----------------------------------------------------------------------------*/
2788
2789 int float32_eq(float32 a, float32 b, float_status *status)
2790 {
2791     uint32_t av, bv;
2792     a = float32_squash_input_denormal(a STATUS_VAR);
2793     b = float32_squash_input_denormal(b STATUS_VAR);
2794
2795     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2796          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2797        ) {
2798         float_raise( float_flag_invalid STATUS_VAR);
2799         return 0;
2800     }
2801     av = float32_val(a);
2802     bv = float32_val(b);
2803     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2804 }
2805
2806 /*----------------------------------------------------------------------------
2807 | Returns 1 if the single-precision floating-point value `a' is less than
2808 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2809 | exception is raised if either operand is a NaN.  The comparison is performed
2810 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2811 *----------------------------------------------------------------------------*/
2812
2813 int float32_le(float32 a, float32 b, float_status *status)
2814 {
2815     flag aSign, bSign;
2816     uint32_t av, bv;
2817     a = float32_squash_input_denormal(a STATUS_VAR);
2818     b = float32_squash_input_denormal(b STATUS_VAR);
2819
2820     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2821          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2822        ) {
2823         float_raise( float_flag_invalid STATUS_VAR);
2824         return 0;
2825     }
2826     aSign = extractFloat32Sign( a );
2827     bSign = extractFloat32Sign( b );
2828     av = float32_val(a);
2829     bv = float32_val(b);
2830     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2831     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2832
2833 }
2834
2835 /*----------------------------------------------------------------------------
2836 | Returns 1 if the single-precision floating-point value `a' is less than
2837 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2838 | raised if either operand is a NaN.  The comparison is performed according
2839 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2840 *----------------------------------------------------------------------------*/
2841
2842 int float32_lt(float32 a, float32 b, float_status *status)
2843 {
2844     flag aSign, bSign;
2845     uint32_t av, bv;
2846     a = float32_squash_input_denormal(a STATUS_VAR);
2847     b = float32_squash_input_denormal(b STATUS_VAR);
2848
2849     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2850          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2851        ) {
2852         float_raise( float_flag_invalid STATUS_VAR);
2853         return 0;
2854     }
2855     aSign = extractFloat32Sign( a );
2856     bSign = extractFloat32Sign( b );
2857     av = float32_val(a);
2858     bv = float32_val(b);
2859     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2860     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2861
2862 }
2863
2864 /*----------------------------------------------------------------------------
2865 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2866 | be compared, and 0 otherwise.  The invalid exception is raised if either
2867 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2868 | Standard for Binary Floating-Point Arithmetic.
2869 *----------------------------------------------------------------------------*/
2870
2871 int float32_unordered(float32 a, float32 b, float_status *status)
2872 {
2873     a = float32_squash_input_denormal(a STATUS_VAR);
2874     b = float32_squash_input_denormal(b STATUS_VAR);
2875
2876     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2877          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2878        ) {
2879         float_raise( float_flag_invalid STATUS_VAR);
2880         return 1;
2881     }
2882     return 0;
2883 }
2884
2885 /*----------------------------------------------------------------------------
2886 | Returns 1 if the single-precision floating-point value `a' is equal to
2887 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2888 | exception.  The comparison is performed according to the IEC/IEEE Standard
2889 | for Binary Floating-Point Arithmetic.
2890 *----------------------------------------------------------------------------*/
2891
2892 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2893 {
2894     a = float32_squash_input_denormal(a STATUS_VAR);
2895     b = float32_squash_input_denormal(b STATUS_VAR);
2896
2897     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2898          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2899        ) {
2900         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2901             float_raise( float_flag_invalid STATUS_VAR);
2902         }
2903         return 0;
2904     }
2905     return ( float32_val(a) == float32_val(b) ) ||
2906             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2907 }
2908
2909 /*----------------------------------------------------------------------------
2910 | Returns 1 if the single-precision floating-point value `a' is less than or
2911 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2912 | cause an exception.  Otherwise, the comparison is performed according to the
2913 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2914 *----------------------------------------------------------------------------*/
2915
2916 int float32_le_quiet(float32 a, float32 b, float_status *status)
2917 {
2918     flag aSign, bSign;
2919     uint32_t av, bv;
2920     a = float32_squash_input_denormal(a STATUS_VAR);
2921     b = float32_squash_input_denormal(b STATUS_VAR);
2922
2923     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2924          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2925        ) {
2926         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2927             float_raise( float_flag_invalid STATUS_VAR);
2928         }
2929         return 0;
2930     }
2931     aSign = extractFloat32Sign( a );
2932     bSign = extractFloat32Sign( b );
2933     av = float32_val(a);
2934     bv = float32_val(b);
2935     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2936     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2937
2938 }
2939
2940 /*----------------------------------------------------------------------------
2941 | Returns 1 if the single-precision floating-point value `a' is less than
2942 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2943 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2944 | Standard for Binary Floating-Point Arithmetic.
2945 *----------------------------------------------------------------------------*/
2946
2947 int float32_lt_quiet(float32 a, float32 b, float_status *status)
2948 {
2949     flag aSign, bSign;
2950     uint32_t av, bv;
2951     a = float32_squash_input_denormal(a STATUS_VAR);
2952     b = float32_squash_input_denormal(b STATUS_VAR);
2953
2954     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2955          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2956        ) {
2957         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2958             float_raise( float_flag_invalid STATUS_VAR);
2959         }
2960         return 0;
2961     }
2962     aSign = extractFloat32Sign( a );
2963     bSign = extractFloat32Sign( b );
2964     av = float32_val(a);
2965     bv = float32_val(b);
2966     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2967     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2968
2969 }
2970
2971 /*----------------------------------------------------------------------------
2972 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2973 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2974 | comparison is performed according to the IEC/IEEE Standard for Binary
2975 | Floating-Point Arithmetic.
2976 *----------------------------------------------------------------------------*/
2977
2978 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
2979 {
2980     a = float32_squash_input_denormal(a STATUS_VAR);
2981     b = float32_squash_input_denormal(b STATUS_VAR);
2982
2983     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2984          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2985        ) {
2986         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2987             float_raise( float_flag_invalid STATUS_VAR);
2988         }
2989         return 1;
2990     }
2991     return 0;
2992 }
2993
2994 /*----------------------------------------------------------------------------
2995 | Returns the result of converting the double-precision floating-point value
2996 | `a' to the 32-bit two's complement integer format.  The conversion is
2997 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2998 | Arithmetic---which means in particular that the conversion is rounded
2999 | according to the current rounding mode.  If `a' is a NaN, the largest
3000 | positive integer is returned.  Otherwise, if the conversion overflows, the
3001 | largest integer with the same sign as `a' is returned.
3002 *----------------------------------------------------------------------------*/
3003
3004 int32 float64_to_int32(float64 a, float_status *status)
3005 {
3006     flag aSign;
3007     int_fast16_t aExp, shiftCount;
3008     uint64_t aSig;
3009     a = float64_squash_input_denormal(a STATUS_VAR);
3010
3011     aSig = extractFloat64Frac( a );
3012     aExp = extractFloat64Exp( a );
3013     aSign = extractFloat64Sign( a );
3014     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3015     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3016     shiftCount = 0x42C - aExp;
3017     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3018     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3019
3020 }
3021
3022 /*----------------------------------------------------------------------------
3023 | Returns the result of converting the double-precision floating-point value
3024 | `a' to the 32-bit two's complement integer format.  The conversion is
3025 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3026 | Arithmetic, except that the conversion is always rounded toward zero.
3027 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3028 | the conversion overflows, the largest integer with the same sign as `a' is
3029 | returned.
3030 *----------------------------------------------------------------------------*/
3031
3032 int32 float64_to_int32_round_to_zero(float64 a, float_status *status)
3033 {
3034     flag aSign;
3035     int_fast16_t aExp, shiftCount;
3036     uint64_t aSig, savedASig;
3037     int32_t z;
3038     a = float64_squash_input_denormal(a STATUS_VAR);
3039
3040     aSig = extractFloat64Frac( a );
3041     aExp = extractFloat64Exp( a );
3042     aSign = extractFloat64Sign( a );
3043     if ( 0x41E < aExp ) {
3044         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3045         goto invalid;
3046     }
3047     else if ( aExp < 0x3FF ) {
3048         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3049         return 0;
3050     }
3051     aSig |= LIT64( 0x0010000000000000 );
3052     shiftCount = 0x433 - aExp;
3053     savedASig = aSig;
3054     aSig >>= shiftCount;
3055     z = aSig;
3056     if ( aSign ) z = - z;
3057     if ( ( z < 0 ) ^ aSign ) {
3058  invalid:
3059         float_raise( float_flag_invalid STATUS_VAR);
3060         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3061     }
3062     if ( ( aSig<<shiftCount ) != savedASig ) {
3063         STATUS(float_exception_flags) |= float_flag_inexact;
3064     }
3065     return z;
3066
3067 }
3068
3069 /*----------------------------------------------------------------------------
3070 | Returns the result of converting the double-precision floating-point value
3071 | `a' to the 16-bit two's complement integer format.  The conversion is
3072 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3073 | Arithmetic, except that the conversion is always rounded toward zero.
3074 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3075 | the conversion overflows, the largest integer with the same sign as `a' is
3076 | returned.
3077 *----------------------------------------------------------------------------*/
3078
3079 int_fast16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3080 {
3081     flag aSign;
3082     int_fast16_t aExp, shiftCount;
3083     uint64_t aSig, savedASig;
3084     int32 z;
3085
3086     aSig = extractFloat64Frac( a );
3087     aExp = extractFloat64Exp( a );
3088     aSign = extractFloat64Sign( a );
3089     if ( 0x40E < aExp ) {
3090         if ( ( aExp == 0x7FF ) && aSig ) {
3091             aSign = 0;
3092         }
3093         goto invalid;
3094     }
3095     else if ( aExp < 0x3FF ) {
3096         if ( aExp || aSig ) {
3097             STATUS(float_exception_flags) |= float_flag_inexact;
3098         }
3099         return 0;
3100     }
3101     aSig |= LIT64( 0x0010000000000000 );
3102     shiftCount = 0x433 - aExp;
3103     savedASig = aSig;
3104     aSig >>= shiftCount;
3105     z = aSig;
3106     if ( aSign ) {
3107         z = - z;
3108     }
3109     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3110  invalid:
3111         float_raise( float_flag_invalid STATUS_VAR);
3112         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3113     }
3114     if ( ( aSig<<shiftCount ) != savedASig ) {
3115         STATUS(float_exception_flags) |= float_flag_inexact;
3116     }
3117     return z;
3118 }
3119
3120 /*----------------------------------------------------------------------------
3121 | Returns the result of converting the double-precision floating-point value
3122 | `a' to the 64-bit two's complement integer format.  The conversion is
3123 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3124 | Arithmetic---which means in particular that the conversion is rounded
3125 | according to the current rounding mode.  If `a' is a NaN, the largest
3126 | positive integer is returned.  Otherwise, if the conversion overflows, the
3127 | largest integer with the same sign as `a' is returned.
3128 *----------------------------------------------------------------------------*/
3129
3130 int64 float64_to_int64(float64 a, float_status *status)
3131 {
3132     flag aSign;
3133     int_fast16_t aExp, shiftCount;
3134     uint64_t aSig, aSigExtra;
3135     a = float64_squash_input_denormal(a STATUS_VAR);
3136
3137     aSig = extractFloat64Frac( a );
3138     aExp = extractFloat64Exp( a );
3139     aSign = extractFloat64Sign( a );
3140     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3141     shiftCount = 0x433 - aExp;
3142     if ( shiftCount <= 0 ) {
3143         if ( 0x43E < aExp ) {
3144             float_raise( float_flag_invalid STATUS_VAR);
3145             if (    ! aSign
3146                  || (    ( aExp == 0x7FF )
3147                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3148                ) {
3149                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3150             }
3151             return (int64_t) LIT64( 0x8000000000000000 );
3152         }
3153         aSigExtra = 0;
3154         aSig <<= - shiftCount;
3155     }
3156     else {
3157         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3158     }
3159     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3160
3161 }
3162
3163 /*----------------------------------------------------------------------------
3164 | Returns the result of converting the double-precision floating-point value
3165 | `a' to the 64-bit two's complement integer format.  The conversion is
3166 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3167 | Arithmetic, except that the conversion is always rounded toward zero.
3168 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3169 | the conversion overflows, the largest integer with the same sign as `a' is
3170 | returned.
3171 *----------------------------------------------------------------------------*/
3172
3173 int64 float64_to_int64_round_to_zero(float64 a, float_status *status)
3174 {
3175     flag aSign;
3176     int_fast16_t aExp, shiftCount;
3177     uint64_t aSig;
3178     int64 z;
3179     a = float64_squash_input_denormal(a STATUS_VAR);
3180
3181     aSig = extractFloat64Frac( a );
3182     aExp = extractFloat64Exp( a );
3183     aSign = extractFloat64Sign( a );
3184     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3185     shiftCount = aExp - 0x433;
3186     if ( 0 <= shiftCount ) {
3187         if ( 0x43E <= aExp ) {
3188             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3189                 float_raise( float_flag_invalid STATUS_VAR);
3190                 if (    ! aSign
3191                      || (    ( aExp == 0x7FF )
3192                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3193                    ) {
3194                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3195                 }
3196             }
3197             return (int64_t) LIT64( 0x8000000000000000 );
3198         }
3199         z = aSig<<shiftCount;
3200     }
3201     else {
3202         if ( aExp < 0x3FE ) {
3203             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3204             return 0;
3205         }
3206         z = aSig>>( - shiftCount );
3207         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3208             STATUS(float_exception_flags) |= float_flag_inexact;
3209         }
3210     }
3211     if ( aSign ) z = - z;
3212     return z;
3213
3214 }
3215
3216 /*----------------------------------------------------------------------------
3217 | Returns the result of converting the double-precision floating-point value
3218 | `a' to the single-precision floating-point format.  The conversion is
3219 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3220 | Arithmetic.
3221 *----------------------------------------------------------------------------*/
3222
3223 float32 float64_to_float32(float64 a, float_status *status)
3224 {
3225     flag aSign;
3226     int_fast16_t aExp;
3227     uint64_t aSig;
3228     uint32_t zSig;
3229     a = float64_squash_input_denormal(a STATUS_VAR);
3230
3231     aSig = extractFloat64Frac( a );
3232     aExp = extractFloat64Exp( a );
3233     aSign = extractFloat64Sign( a );
3234     if ( aExp == 0x7FF ) {
3235         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3236         return packFloat32( aSign, 0xFF, 0 );
3237     }
3238     shift64RightJamming( aSig, 22, &aSig );
3239     zSig = aSig;
3240     if ( aExp || zSig ) {
3241         zSig |= 0x40000000;
3242         aExp -= 0x381;
3243     }
3244     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3245
3246 }
3247
3248
3249 /*----------------------------------------------------------------------------
3250 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3251 | half-precision floating-point value, returning the result.  After being
3252 | shifted into the proper positions, the three fields are simply added
3253 | together to form the result.  This means that any integer portion of `zSig'
3254 | will be added into the exponent.  Since a properly normalized significand
3255 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3256 | than the desired result exponent whenever `zSig' is a complete, normalized
3257 | significand.
3258 *----------------------------------------------------------------------------*/
3259 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3260 {
3261     return make_float16(
3262         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3263 }
3264
3265 /*----------------------------------------------------------------------------
3266 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3267 | and significand `zSig', and returns the proper half-precision floating-
3268 | point value corresponding to the abstract input.  Ordinarily, the abstract
3269 | value is simply rounded and packed into the half-precision format, with
3270 | the inexact exception raised if the abstract input cannot be represented
3271 | exactly.  However, if the abstract value is too large, the overflow and
3272 | inexact exceptions are raised and an infinity or maximal finite value is
3273 | returned.  If the abstract value is too small, the input value is rounded to
3274 | a subnormal number, and the underflow and inexact exceptions are raised if
3275 | the abstract input cannot be represented exactly as a subnormal half-
3276 | precision floating-point number.
3277 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3278 | ARM-style "alternative representation", which omits the NaN and Inf
3279 | encodings in order to raise the maximum representable exponent by one.
3280 |     The input significand `zSig' has its binary point between bits 22
3281 | and 23, which is 13 bits to the left of the usual location.  This shifted
3282 | significand must be normalized or smaller.  If `zSig' is not normalized,
3283 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3284 | and it must not require rounding.  In the usual case that `zSig' is
3285 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3286 | Note the slightly odd position of the binary point in zSig compared with the
3287 | other roundAndPackFloat functions. This should probably be fixed if we
3288 | need to implement more float16 routines than just conversion.
3289 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3290 | Binary Floating-Point Arithmetic.
3291 *----------------------------------------------------------------------------*/
3292
3293 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3294                                    uint32_t zSig, flag ieee,
3295                                    float_status *status)
3296 {
3297     int maxexp = ieee ? 29 : 30;
3298     uint32_t mask;
3299     uint32_t increment;
3300     bool rounding_bumps_exp;
3301     bool is_tiny = false;
3302
3303     /* Calculate the mask of bits of the mantissa which are not
3304      * representable in half-precision and will be lost.
3305      */
3306     if (zExp < 1) {
3307         /* Will be denormal in halfprec */
3308         mask = 0x00ffffff;
3309         if (zExp >= -11) {
3310             mask >>= 11 + zExp;
3311         }
3312     } else {
3313         /* Normal number in halfprec */
3314         mask = 0x00001fff;
3315     }
3316
3317     switch (STATUS(float_rounding_mode)) {
3318     case float_round_nearest_even:
3319         increment = (mask + 1) >> 1;
3320         if ((zSig & mask) == increment) {
3321             increment = zSig & (increment << 1);
3322         }
3323         break;
3324     case float_round_ties_away:
3325         increment = (mask + 1) >> 1;
3326         break;
3327     case float_round_up:
3328         increment = zSign ? 0 : mask;
3329         break;
3330     case float_round_down:
3331         increment = zSign ? mask : 0;
3332         break;
3333     default: /* round_to_zero */
3334         increment = 0;
3335         break;
3336     }
3337
3338     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3339
3340     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3341         if (ieee) {
3342             float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3343             return packFloat16(zSign, 0x1f, 0);
3344         } else {
3345             float_raise(float_flag_invalid STATUS_VAR);
3346             return packFloat16(zSign, 0x1f, 0x3ff);
3347         }
3348     }
3349
3350     if (zExp < 0) {
3351         /* Note that flush-to-zero does not affect half-precision results */
3352         is_tiny =
3353             (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3354             || (zExp < -1)
3355             || (!rounding_bumps_exp);
3356     }
3357     if (zSig & mask) {
3358         float_raise(float_flag_inexact STATUS_VAR);
3359         if (is_tiny) {
3360             float_raise(float_flag_underflow STATUS_VAR);
3361         }
3362     }
3363
3364     zSig += increment;
3365     if (rounding_bumps_exp) {
3366         zSig >>= 1;
3367         zExp++;
3368     }
3369
3370     if (zExp < -10) {
3371         return packFloat16(zSign, 0, 0);
3372     }
3373     if (zExp < 0) {
3374         zSig >>= -zExp;
3375         zExp = 0;
3376     }
3377     return packFloat16(zSign, zExp, zSig >> 13);
3378 }
3379
/* Normalizes the subnormal half-precision significand `aSig': the shifted
 * significand is stored through `zSigPtr' and the matching exponent
 * through `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t lshift;

    lshift = countLeadingZeros32(aSig) - 21;
    *zExpPtr = 1 - lshift;
    *zSigPtr = aSig << lshift;
}
3387
3388 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3389    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3390
3391 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3392 {
3393     flag aSign;
3394     int_fast16_t aExp;
3395     uint32_t aSig;
3396
3397     aSign = extractFloat16Sign(a);
3398     aExp = extractFloat16Exp(a);
3399     aSig = extractFloat16Frac(a);
3400
3401     if (aExp == 0x1f && ieee) {
3402         if (aSig) {
3403             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3404         }
3405         return packFloat32(aSign, 0xff, 0);
3406     }
3407     if (aExp == 0) {
3408         if (aSig == 0) {
3409             return packFloat32(aSign, 0, 0);
3410         }
3411
3412         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3413         aExp--;
3414     }
3415     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3416 }
3417
3418 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3419 {
3420     flag aSign;
3421     int_fast16_t aExp;
3422     uint32_t aSig;
3423
3424     a = float32_squash_input_denormal(a STATUS_VAR);
3425
3426     aSig = extractFloat32Frac( a );
3427     aExp = extractFloat32Exp( a );
3428     aSign = extractFloat32Sign( a );
3429     if ( aExp == 0xFF ) {
3430         if (aSig) {
3431             /* Input is a NaN */
3432             if (!ieee) {
3433                 float_raise(float_flag_invalid STATUS_VAR);
3434                 return packFloat16(aSign, 0, 0);
3435             }
3436             return commonNaNToFloat16(
3437                 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3438         }
3439         /* Infinity */
3440         if (!ieee) {
3441             float_raise(float_flag_invalid STATUS_VAR);
3442             return packFloat16(aSign, 0x1f, 0x3ff);
3443         }
3444         return packFloat16(aSign, 0x1f, 0);
3445     }
3446     if (aExp == 0 && aSig == 0) {
3447         return packFloat16(aSign, 0, 0);
3448     }
3449     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3450      * even if the input is denormal; however this is harmless because
3451      * the largest possible single-precision denormal is still smaller
3452      * than the smallest representable half-precision denormal, and so we
3453      * will end up ignoring aSig and returning via the "always return zero"
3454      * codepath.
3455      */
3456     aSig |= 0x00800000;
3457     aExp -= 0x71;
3458
3459     return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
3460 }
3461
3462 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3463 {
3464     flag aSign;
3465     int_fast16_t aExp;
3466     uint32_t aSig;
3467
3468     aSign = extractFloat16Sign(a);
3469     aExp = extractFloat16Exp(a);
3470     aSig = extractFloat16Frac(a);
3471
3472     if (aExp == 0x1f && ieee) {
3473         if (aSig) {
3474             return commonNaNToFloat64(
3475                 float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3476         }
3477         return packFloat64(aSign, 0x7ff, 0);
3478     }
3479     if (aExp == 0) {
3480         if (aSig == 0) {
3481             return packFloat64(aSign, 0, 0);
3482         }
3483
3484         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3485         aExp--;
3486     }
3487     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3488 }
3489
3490 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3491 {
3492     flag aSign;
3493     int_fast16_t aExp;
3494     uint64_t aSig;
3495     uint32_t zSig;
3496
3497     a = float64_squash_input_denormal(a STATUS_VAR);
3498
3499     aSig = extractFloat64Frac(a);
3500     aExp = extractFloat64Exp(a);
3501     aSign = extractFloat64Sign(a);
3502     if (aExp == 0x7FF) {
3503         if (aSig) {
3504             /* Input is a NaN */
3505             if (!ieee) {
3506                 float_raise(float_flag_invalid STATUS_VAR);
3507                 return packFloat16(aSign, 0, 0);
3508             }
3509             return commonNaNToFloat16(
3510                 float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3511         }
3512         /* Infinity */
3513         if (!ieee) {
3514             float_raise(float_flag_invalid STATUS_VAR);
3515             return packFloat16(aSign, 0x1f, 0x3ff);
3516         }
3517         return packFloat16(aSign, 0x1f, 0);
3518     }
3519     shift64RightJamming(aSig, 29, &aSig);
3520     zSig = aSig;
3521     if (aExp == 0 && zSig == 0) {
3522         return packFloat16(aSign, 0, 0);
3523     }
3524     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3525      * even if the input is denormal; however this is harmless because
3526      * the largest possible single-precision denormal is still smaller
3527      * than the smallest representable half-precision denormal, and so we
3528      * will end up ignoring aSig and returning via the "always return zero"
3529      * codepath.
3530      */
3531     zSig |= 0x00800000;
3532     aExp -= 0x3F1;
3533
3534     return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR);
3535 }
3536
3537 /*----------------------------------------------------------------------------
3538 | Returns the result of converting the double-precision floating-point value
3539 | `a' to the extended double-precision floating-point format.  The conversion
3540 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3541 | Arithmetic.
3542 *----------------------------------------------------------------------------*/
3543
3544 floatx80 float64_to_floatx80(float64 a, float_status *status)
3545 {
3546     flag aSign;
3547     int_fast16_t aExp;
3548     uint64_t aSig;
3549
3550     a = float64_squash_input_denormal(a STATUS_VAR);
3551     aSig = extractFloat64Frac( a );
3552     aExp = extractFloat64Exp( a );
3553     aSign = extractFloat64Sign( a );
3554     if ( aExp == 0x7FF ) {
3555         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3556         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3557     }
3558     if ( aExp == 0 ) {
3559         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3560         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3561     }
3562     return
3563         packFloatx80(
3564             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3565
3566 }
3567
3568 /*----------------------------------------------------------------------------
3569 | Returns the result of converting the double-precision floating-point value
3570 | `a' to the quadruple-precision floating-point format.  The conversion is
3571 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3572 | Arithmetic.
3573 *----------------------------------------------------------------------------*/
3574
3575 float128 float64_to_float128(float64 a, float_status *status)
3576 {
3577     flag aSign;
3578     int_fast16_t aExp;
3579     uint64_t aSig, zSig0, zSig1;
3580
3581     a = float64_squash_input_denormal(a STATUS_VAR);
3582     aSig = extractFloat64Frac( a );
3583     aExp = extractFloat64Exp( a );
3584     aSign = extractFloat64Sign( a );
3585     if ( aExp == 0x7FF ) {
3586         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3587         return packFloat128( aSign, 0x7FFF, 0, 0 );
3588     }
3589     if ( aExp == 0 ) {
3590         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3591         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3592         --aExp;
3593     }
3594     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3595     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3596
3597 }
3598
3599 /*----------------------------------------------------------------------------
3600 | Rounds the double-precision floating-point value `a' to an integer, and
3601 | returns the result as a double-precision floating-point value.  The
3602 | operation is performed according to the IEC/IEEE Standard for Binary
3603 | Floating-Point Arithmetic.
3604 *----------------------------------------------------------------------------*/
3605
3606 float64 float64_round_to_int(float64 a, float_status *status)
3607 {
3608     flag aSign;
3609     int_fast16_t aExp;
3610     uint64_t lastBitMask, roundBitsMask;
3611     uint64_t z;
3612     a = float64_squash_input_denormal(a STATUS_VAR);
3613
3614     aExp = extractFloat64Exp( a );
3615     if ( 0x433 <= aExp ) {
3616         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3617             return propagateFloat64NaN( a, a STATUS_VAR );
3618         }
3619         return a;
3620     }
3621     if ( aExp < 0x3FF ) {
3622         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3623         STATUS(float_exception_flags) |= float_flag_inexact;
3624         aSign = extractFloat64Sign( a );
3625         switch ( STATUS(float_rounding_mode) ) {
3626          case float_round_nearest_even:
3627             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3628                 return packFloat64( aSign, 0x3FF, 0 );
3629             }
3630             break;
3631         case float_round_ties_away:
3632             if (aExp == 0x3FE) {
3633                 return packFloat64(aSign, 0x3ff, 0);
3634             }
3635             break;
3636          case float_round_down:
3637             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3638          case float_round_up:
3639             return make_float64(
3640             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3641         }
3642         return packFloat64( aSign, 0, 0 );
3643     }
3644     lastBitMask = 1;
3645     lastBitMask <<= 0x433 - aExp;
3646     roundBitsMask = lastBitMask - 1;
3647     z = float64_val(a);
3648     switch (STATUS(float_rounding_mode)) {
3649     case float_round_nearest_even:
3650         z += lastBitMask >> 1;
3651         if ((z & roundBitsMask) == 0) {
3652             z &= ~lastBitMask;
3653         }
3654         break;
3655     case float_round_ties_away:
3656         z += lastBitMask >> 1;
3657         break;
3658     case float_round_to_zero:
3659         break;
3660     case float_round_up:
3661         if (!extractFloat64Sign(make_float64(z))) {
3662             z += roundBitsMask;
3663         }
3664         break;
3665     case float_round_down:
3666         if (extractFloat64Sign(make_float64(z))) {
3667             z += roundBitsMask;
3668         }
3669         break;
3670     default:
3671         abort();
3672     }
3673     z &= ~ roundBitsMask;
3674     if ( z != float64_val(a) )
3675         STATUS(float_exception_flags) |= float_flag_inexact;
3676     return make_float64(z);
3677
3678 }
3679
3680 float64 float64_trunc_to_int(float64 a, float_status *status)
3681 {
3682     int oldmode;
3683     float64 res;
3684     oldmode = STATUS(float_rounding_mode);
3685     STATUS(float_rounding_mode) = float_round_to_zero;
3686     res = float64_round_to_int(a STATUS_VAR);
3687     STATUS(float_rounding_mode) = oldmode;
3688     return res;
3689 }
3690
3691 /*----------------------------------------------------------------------------
3692 | Returns the result of adding the absolute values of the double-precision
3693 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3694 | before being returned.  `zSign' is ignored if the result is a NaN.
3695 | The addition is performed according to the IEC/IEEE Standard for Binary
3696 | Floating-Point Arithmetic.
3697 *----------------------------------------------------------------------------*/
3698
3699 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3700                               float_status *status)
3701 {
3702     int_fast16_t aExp, bExp, zExp;
3703     uint64_t aSig, bSig, zSig;
3704     int_fast16_t expDiff;
3705
3706     aSig = extractFloat64Frac( a );
3707     aExp = extractFloat64Exp( a );
3708     bSig = extractFloat64Frac( b );
3709     bExp = extractFloat64Exp( b );
3710     expDiff = aExp - bExp;
3711     aSig <<= 9;
3712     bSig <<= 9;
3713     if ( 0 < expDiff ) {
3714         if ( aExp == 0x7FF ) {
3715             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3716             return a;
3717         }
3718         if ( bExp == 0 ) {
3719             --expDiff;
3720         }
3721         else {
3722             bSig |= LIT64( 0x2000000000000000 );
3723         }
3724         shift64RightJamming( bSig, expDiff, &bSig );
3725         zExp = aExp;
3726     }
3727     else if ( expDiff < 0 ) {
3728         if ( bExp == 0x7FF ) {
3729             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3730             return packFloat64( zSign, 0x7FF, 0 );
3731         }
3732         if ( aExp == 0 ) {
3733             ++expDiff;
3734         }
3735         else {
3736             aSig |= LIT64( 0x2000000000000000 );
3737         }
3738         shift64RightJamming( aSig, - expDiff, &aSig );
3739         zExp = bExp;
3740     }
3741     else {
3742         if ( aExp == 0x7FF ) {
3743             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3744             return a;
3745         }
3746         if ( aExp == 0 ) {
3747             if (STATUS(flush_to_zero)) {
3748                 if (aSig | bSig) {
3749                     float_raise(float_flag_output_denormal STATUS_VAR);
3750                 }
3751                 return packFloat64(zSign, 0, 0);
3752             }
3753             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3754         }
3755         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3756         zExp = aExp;
3757         goto roundAndPack;
3758     }
3759     aSig |= LIT64( 0x2000000000000000 );
3760     zSig = ( aSig + bSig )<<1;
3761     --zExp;
3762     if ( (int64_t) zSig < 0 ) {
3763         zSig = aSig + bSig;
3764         ++zExp;
3765     }
3766  roundAndPack:
3767     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3768
3769 }
3770
3771 /*----------------------------------------------------------------------------
3772 | Returns the result of subtracting the absolute values of the double-
3773 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3774 | difference is negated before being returned.  `zSign' is ignored if the
3775 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3776 | Standard for Binary Floating-Point Arithmetic.
3777 *----------------------------------------------------------------------------*/
3778
3779 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3780                               float_status *status)
3781 {
3782     int_fast16_t aExp, bExp, zExp;
3783     uint64_t aSig, bSig, zSig;
3784     int_fast16_t expDiff;
3785
3786     aSig = extractFloat64Frac( a );
3787     aExp = extractFloat64Exp( a );
3788     bSig = extractFloat64Frac( b );
3789     bExp = extractFloat64Exp( b );
3790     expDiff = aExp - bExp;
3791     aSig <<= 10;
3792     bSig <<= 10;
3793     if ( 0 < expDiff ) goto aExpBigger;
3794     if ( expDiff < 0 ) goto bExpBigger;
3795     if ( aExp == 0x7FF ) {
3796         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3797         float_raise( float_flag_invalid STATUS_VAR);
3798         return float64_default_nan;
3799     }
3800     if ( aExp == 0 ) {
3801         aExp = 1;
3802         bExp = 1;
3803     }
3804     if ( bSig < aSig ) goto aBigger;
3805     if ( aSig < bSig ) goto bBigger;
3806     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3807  bExpBigger:
3808     if ( bExp == 0x7FF ) {
3809         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3810         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3811     }
3812     if ( aExp == 0 ) {
3813         ++expDiff;
3814     }
3815     else {
3816         aSig |= LIT64( 0x4000000000000000 );
3817     }
3818     shift64RightJamming( aSig, - expDiff, &aSig );
3819     bSig |= LIT64( 0x4000000000000000 );
3820  bBigger:
3821     zSig = bSig - aSig;
3822     zExp = bExp;
3823     zSign ^= 1;
3824     goto normalizeRoundAndPack;
3825  aExpBigger:
3826     if ( aExp == 0x7FF ) {
3827         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3828         return a;
3829     }
3830     if ( bExp == 0 ) {
3831         --expDiff;
3832     }
3833     else {
3834         bSig |= LIT64( 0x4000000000000000 );
3835     }
3836     shift64RightJamming( bSig, expDiff, &bSig );
3837     aSig |= LIT64( 0x4000000000000000 );
3838  aBigger:
3839     zSig = aSig - bSig;
3840     zExp = aExp;
3841  normalizeRoundAndPack:
3842     --zExp;
3843     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3844
3845 }
3846
3847 /*----------------------------------------------------------------------------
3848 | Returns the result of adding the double-precision floating-point values `a'
3849 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3850 | Binary Floating-Point Arithmetic.
3851 *----------------------------------------------------------------------------*/
3852
3853 float64 float64_add(float64 a, float64 b, float_status *status)
3854 {
3855     flag aSign, bSign;
3856     a = float64_squash_input_denormal(a STATUS_VAR);
3857     b = float64_squash_input_denormal(b STATUS_VAR);
3858
3859     aSign = extractFloat64Sign( a );
3860     bSign = extractFloat64Sign( b );
3861     if ( aSign == bSign ) {
3862         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3863     }
3864     else {
3865         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3866     }
3867
3868 }
3869
3870 /*----------------------------------------------------------------------------
3871 | Returns the result of subtracting the double-precision floating-point values
3872 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3873 | for Binary Floating-Point Arithmetic.
3874 *----------------------------------------------------------------------------*/
3875
3876 float64 float64_sub(float64 a, float64 b, float_status *status)
3877 {
3878     flag aSign, bSign;
3879     a = float64_squash_input_denormal(a STATUS_VAR);
3880     b = float64_squash_input_denormal(b STATUS_VAR);
3881
3882     aSign = extractFloat64Sign( a );
3883     bSign = extractFloat64Sign( b );
3884     if ( aSign == bSign ) {
3885         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3886     }
3887     else {
3888         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3889     }
3890
3891 }
3892
3893 /*----------------------------------------------------------------------------
3894 | Returns the result of multiplying the double-precision floating-point values
3895 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3896 | for Binary Floating-Point Arithmetic.
3897 *----------------------------------------------------------------------------*/
3898
3899 float64 float64_mul(float64 a, float64 b, float_status *status)
3900 {
3901     flag aSign, bSign, zSign;
3902     int_fast16_t aExp, bExp, zExp;
3903     uint64_t aSig, bSig, zSig0, zSig1;
3904
3905     a = float64_squash_input_denormal(a STATUS_VAR);
3906     b = float64_squash_input_denormal(b STATUS_VAR);
3907
3908     aSig = extractFloat64Frac( a );
3909     aExp = extractFloat64Exp( a );
3910     aSign = extractFloat64Sign( a );
3911     bSig = extractFloat64Frac( b );
3912     bExp = extractFloat64Exp( b );
3913     bSign = extractFloat64Sign( b );
3914     zSign = aSign ^ bSign;
3915     if ( aExp == 0x7FF ) {
3916         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3917             return propagateFloat64NaN( a, b STATUS_VAR );
3918         }
3919         if ( ( bExp | bSig ) == 0 ) {
3920             float_raise( float_flag_invalid STATUS_VAR);
3921             return float64_default_nan;
3922         }
3923         return packFloat64( zSign, 0x7FF, 0 );
3924     }
3925     if ( bExp == 0x7FF ) {
3926         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3927         if ( ( aExp | aSig ) == 0 ) {
3928             float_raise( float_flag_invalid STATUS_VAR);
3929             return float64_default_nan;
3930         }
3931         return packFloat64( zSign, 0x7FF, 0 );
3932     }
3933     if ( aExp == 0 ) {
3934         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3935         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3936     }
3937     if ( bExp == 0 ) {
3938         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3939         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3940     }
3941     zExp = aExp + bExp - 0x3FF;
3942     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3943     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3944     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3945     zSig0 |= ( zSig1 != 0 );
3946     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3947         zSig0 <<= 1;
3948         --zExp;
3949     }
3950     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3951
3952 }
3953
3954 /*----------------------------------------------------------------------------
3955 | Returns the result of dividing the double-precision floating-point value `a'
3956 | by the corresponding value `b'.  The operation is performed according to
3957 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3958 *----------------------------------------------------------------------------*/
3959
3960 float64 float64_div(float64 a, float64 b, float_status *status)
3961 {
3962     flag aSign, bSign, zSign;
3963     int_fast16_t aExp, bExp, zExp;
3964     uint64_t aSig, bSig, zSig;
3965     uint64_t rem0, rem1;
3966     uint64_t term0, term1;
3967     a = float64_squash_input_denormal(a STATUS_VAR);
3968     b = float64_squash_input_denormal(b STATUS_VAR);
3969
3970     aSig = extractFloat64Frac( a );
3971     aExp = extractFloat64Exp( a );
3972     aSign = extractFloat64Sign( a );
3973     bSig = extractFloat64Frac( b );
3974     bExp = extractFloat64Exp( b );
3975     bSign = extractFloat64Sign( b );
3976     zSign = aSign ^ bSign;
3977     if ( aExp == 0x7FF ) {
3978         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3979         if ( bExp == 0x7FF ) {
3980             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3981             float_raise( float_flag_invalid STATUS_VAR);
3982             return float64_default_nan;
3983         }
3984         return packFloat64( zSign, 0x7FF, 0 );
3985     }
3986     if ( bExp == 0x7FF ) {
3987         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3988         return packFloat64( zSign, 0, 0 );
3989     }
3990     if ( bExp == 0 ) {
3991         if ( bSig == 0 ) {
3992             if ( ( aExp | aSig ) == 0 ) {
3993                 float_raise( float_flag_invalid STATUS_VAR);
3994                 return float64_default_nan;
3995             }
3996             float_raise( float_flag_divbyzero STATUS_VAR);
3997             return packFloat64( zSign, 0x7FF, 0 );
3998         }
3999         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4000     }
4001     if ( aExp == 0 ) {
4002         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4003         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4004     }
4005     zExp = aExp - bExp + 0x3FD;
4006     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4007     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4008     if ( bSig <= ( aSig + aSig ) ) {
4009         aSig >>= 1;
4010         ++zExp;
4011     }
4012     zSig = estimateDiv128To64( aSig, 0, bSig );
4013     if ( ( zSig & 0x1FF ) <= 2 ) {
4014         mul64To128( bSig, zSig, &term0, &term1 );
4015         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4016         while ( (int64_t) rem0 < 0 ) {
4017             --zSig;
4018             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4019         }
4020         zSig |= ( rem1 != 0 );
4021     }
4022     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
4023
4024 }
4025
/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled below:
|   - a NaN operand yields the result of propagateFloat64NaN();
|   - `a' infinite, or `b' zero, raises the invalid exception and yields the
|     default NaN;
|   - `b' infinite, or `a' zero, returns `a' unchanged.
| The final step picks, of the last non-negative and the first negative
| partial remainder, the one of smaller magnitude; on a tie it picks the one
| whose quotient is even.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    flag aSign, zSign;
    int_fast16_t aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        /* `a' is a NaN or an infinity */
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN( a, b STATUS_VAR );
        }
        /* remainder of an infinity is invalid */
        float_raise( float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }
    if ( bExp == 0x7FF ) {
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
        /* finite `a' with infinite `b': `a' is its own remainder */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* remainder with respect to zero is invalid */
            float_raise( float_flag_invalid STATUS_VAR);
            return float64_default_nan;
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit bit explicit and move it to bit 63 in both operands. */
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( expDiff < 0 ) {
        /* Exponent of `a' is at least two below that of `b': return `a'. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    /* First quotient bit: subtract `b' once if it fits. */
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    expDiff -= 64;
    /* Reduce in steps of 62 exponent positions while more than 64 remain.
     * NOTE(review): the estimate is lowered by 2, presumably so that it never
     * exceeds the true quotient -- confirm against estimateDiv128To64().
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        /* unsigned negation: arithmetic here is modulo 2^64 */
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Last partial step: only the top expDiff quotient bits are kept. */
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Subtract `b' until the remainder turns negative, remembering the last
     * non-negative value in alternateASig.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /* sigMean < 0 means the negative candidate has the larger magnitude;
     * sigMean == 0 is a tie, resolved by the parity of q.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the sign of the result relative to `a'. */
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );

}
4111
/*----------------------------------------------------------------------------
| Returns the result of multiplying the double-precision floating-point values
| `a' and `b' then adding 'c', with no intermediate rounding step after the
| multiplication.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic 754-2008.
| The flags argument allows the caller to select negation of the
| addend, the intermediate product, or the final result. (The difference
| between this and having the caller do a separate negation is that negating
| externally will flip the sign bit on NaNs.)
| The float_muladd_halve_result flag additionally lowers the exponent of the
| result by one before the final rounding step; it has no effect on the paths
| that return an infinity, a NaN or an exact zero.
*----------------------------------------------------------------------------*/

float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
                       float_status *status)
{
    flag aSign, bSign, cSign, zSign;
    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
    uint64_t aSig, bSig, cSig;
    flag pInf, pZero, pSign;
    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
    int shiftcount;
    flag signflip, infzero;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);
    c = float64_squash_input_denormal(c STATUS_VAR);
    aSig = extractFloat64Frac(a);
    aExp = extractFloat64Exp(a);
    aSign = extractFloat64Sign(a);
    bSig = extractFloat64Frac(b);
    bExp = extractFloat64Exp(b);
    bSign = extractFloat64Sign(b);
    cSig = extractFloat64Frac(c);
    cExp = extractFloat64Exp(c);
    cSign = extractFloat64Sign(c);

    /* True when the product is zero times infinity, in either order */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0x7ff) && aSig) ||
        ((bExp == 0x7ff) && bSig) ||
        ((cExp == 0x7ff) && cSig)) {
        return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
    }

    /* No NaN inputs: zero times infinity is an invalid operation */
    if (infzero) {
        float_raise(float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }

    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    /* signflip is XORed into the sign of every non-NaN result */
    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    if (cExp == 0x7ff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid STATUS_VAR);
            return float64_default_nan;
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat64(cSign ^ signflip, 0x7ff, 0);
    }

    /* Infinite product plus finite addend: infinity with the product's sign */
    if (pInf) {
        return packFloat64(pSign ^ signflip, 0x7ff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (STATUS(float_rounding_mode) == float_round_down) {
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat64(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloat64(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        if (flags & float_muladd_halve_result) {
            if (cExp == 0) {
                normalizeFloat64Subnormal(cSig, &cExp, &cSig);
            }
            /* Subtract one to halve, and one again because roundAndPackFloat64
             * wants one less than the true exponent.
             */
            cExp -= 2;
            cSig = (cSig | 0x0010000000000000ULL) << 10;
            return roundAndPackFloat64(cSign ^ signflip, cExp, cSig STATUS_VAR);
        }
        return packFloat64(cSign ^ signflip, cExp, cSig);
    }

    /* Both factors are finite and non-zero here; normalize any subnormals */
    if (aExp == 0) {
        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat64() takes.
     */
    pExp = aExp + bExp - 0x3fe;
    aSig = (aSig | LIT64(0x0010000000000000))<<10;
    bSig = (bSig | LIT64(0x0010000000000000))<<11;
    mul64To128(aSig, bSig, &pSig0, &pSig1);
    /* If bit 62 of the high word is clear, shift the product up one place */
    if ((int64_t)(pSig0 << 1) >= 0) {
        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
        pExp--;
    }

    zSign = pSign ^ signflip;

    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
     * bit in position 126.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
            if (flags & float_muladd_halve_result) {
                pExp--;
            }
            return roundAndPackFloat64(zSign, pExp - 1,
                                       pSig1 STATUS_VAR);
        }
        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
    }

    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
     * significand of the addend, with the explicit bit in position 126.
     */
    cSig0 = cSig << (126 - 64 - 52);
    cSig1 = 0;
    cSig0 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 126 */
        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
        if ((int64_t)zSig0 < 0) {
            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
        } else {
            zExp--;
        }
        /* Reduce to a single 64-bit word; shifted-out bits are jammed */
        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
        if (flags & float_muladd_halve_result) {
            zExp--;
        }
        return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
    } else {
        /* Subtraction */
        if (expDiff > 0) {
            /* p has the larger exponent: scale c and compute p - c */
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* c has the larger exponent: scale p, compute c - p, flip sign */
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
            zExp = cExp;
            zSign ^= 1;
        } else {
            /* Equal exponents: subtract the smaller significand from the
             * larger one.
             */
            zExp = pExp;
            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
                zSign ^= 1;
            } else {
                /* Exact zero */
                zSign = signflip;
                if (STATUS(float_rounding_mode) == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat64(zSign, 0, 0);
            }
        }
        --zExp;
        /* Do the equivalent of normalizeRoundAndPackFloat64() but
         * starting with the significand in a pair of uint64_t.
         */
        if (zSig0) {
            /* Move the leading one of the high word to bit 62 */
            shiftcount = countLeadingZeros64(zSig0) - 1;
            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
            if (zSig1) {
                /* remaining low-word bits become the sticky bit */
                zSig0 |= 1;
            }
            zExp -= shiftcount;
        } else {
            /* High word is zero: the result comes from the low word alone */
            shiftcount = countLeadingZeros64(zSig1);
            if (shiftcount == 0) {
                /* Leading one at bit 63: shift right one, jamming bit 0 */
                zSig0 = (zSig1 >> 1) | (zSig1 & 1);
                zExp -= 63;
            } else {
                shiftcount--;
                zSig0 = zSig1 << shiftcount;
                zExp -= (shiftcount + 64);
            }
        }
        if (flags & float_muladd_halve_result) {
            zExp--;
        }
        return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
    }
}
4358
/*----------------------------------------------------------------------------
| Returns the square root of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Special cases handled below:
|   - a NaN yields the result of propagateFloat64NaN();
|   - positive infinity and negative zero are returned unchanged;
|   - positive zero yields float64_zero;
|   - any other negative input (including negative infinity) raises the
|     invalid exception and yields the default NaN.
*----------------------------------------------------------------------------*/

float64 float64_sqrt(float64 a, float_status *status)
{
    flag aSign;
    int_fast16_t aExp, zExp;
    uint64_t aSig, zSig, doubleZSig;
    uint64_t rem0, rem1, term0, term1;
    a = float64_squash_input_denormal(a STATUS_VAR);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
        /* +infinity is its own square root */
        if ( ! aSign ) return a;
        /* -infinity */
        float_raise( float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }
    if ( aSign ) {
        /* -0 is returned as is; every other negative value is invalid */
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise( float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return float64_zero;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Halve the unbiased exponent; 0x3FE is the bias minus one */
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
    aSig |= LIT64( 0x0010000000000000 );
    /* NOTE(review): estimateSqrt32() presumably returns a 32-bit first
     * approximation of the root -- confirm against its definition.
     */
    zSig = estimateSqrt32( aExp, aSig>>21 );
    /* The shift depends on the parity of the exponent */
    aSig <<= 9 - ( aExp & 1 );
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
    /* NOTE(review): the exact remainder is presumably only computed when the
     * low bits of the estimate sit close to a rounding boundary -- confirm.
     */
    if ( ( zSig & 0x1FF ) <= 5 ) {
        doubleZSig = zSig<<1;
        mul64To128( zSig, zSig, &term0, &term1 );
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
        /* Step zSig down while the remainder aSig - zSig^2 is negative */
        while ( (int64_t) rem0 < 0 ) {
            --zSig;
            doubleZSig -= 2;
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
        }
        /* A nonzero remainder sets the sticky bit */
        zSig |= ( ( rem0 | rem1 ) != 0 );
    }
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );

}
4410
4411 /*----------------------------------------------------------------------------
4412 | Returns the binary log of the double-precision floating-point value `a'.
4413 | The operation is performed according to the IEC/IEEE Standard for Binary
4414 | Floating-Point Arithmetic.
4415 *----------------------------------------------------------------------------*/
4416 float64 float64_log2(float64 a, float_status *status)
4417 {
4418     flag aSign, zSign;
4419     int_fast16_t aExp;
4420     uint64_t aSig, aSig0, aSig1, zSig, i;
4421     a = float64_squash_input_denormal(a STATUS_VAR);
4422
4423     aSig = extractFloat64Frac( a );
4424     aExp = extractFloat64Exp( a );
4425     aSign = extractFloat64Sign( a );
4426
4427     if ( aExp == 0 ) {
4428         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4429         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4430     }
4431     if ( aSign ) {
4432         float_raise( float_flag_invalid STATUS_VAR);
4433         return float64_default_nan;
4434     }
4435     if ( aExp == 0x7FF ) {
4436         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4437         return a;
4438     }
4439
4440     aExp -= 0x3FF;
4441     aSig |= LIT64( 0x0010000000000000 );
4442     zSign = aExp < 0;
4443     zSig = (uint64_t)aExp << 52;
4444     for (i = 1LL << 51; i > 0; i >>= 1) {
4445         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4446         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4447         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4448             aSig >>= 1;
4449             zSig |= i;
4450         }
4451     }
4452
4453     if ( zSign )
4454         zSig = -zSig;
4455     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4456 }
4457
4458 /*----------------------------------------------------------------------------
4459 | Returns 1 if the double-precision floating-point value `a' is equal to the
4460 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4461 | if either operand is a NaN.  Otherwise, the comparison is performed
4462 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4463 *----------------------------------------------------------------------------*/
4464
4465 int float64_eq(float64 a, float64 b, float_status *status)
4466 {
4467     uint64_t av, bv;
4468     a = float64_squash_input_denormal(a STATUS_VAR);
4469     b = float64_squash_input_denormal(b STATUS_VAR);
4470
4471     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4472          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4473        ) {
4474         float_raise( float_flag_invalid STATUS_VAR);
4475         return 0;
4476     }
4477     av = float64_val(a);
4478     bv = float64_val(b);
4479     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4480
4481 }
4482
4483 /*----------------------------------------------------------------------------
4484 | Returns 1 if the double-precision floating-point value `a' is less than or
4485 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4486 | exception is raised if either operand is a NaN.  The comparison is performed
4487 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4488 *----------------------------------------------------------------------------*/
4489
4490 int float64_le(float64 a, float64 b, float_status *status)
4491 {
4492     flag aSign, bSign;
4493     uint64_t av, bv;
4494     a = float64_squash_input_denormal(a STATUS_VAR);
4495     b = float64_squash_input_denormal(b STATUS_VAR);
4496
4497     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4498          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4499        ) {
4500         float_raise( float_flag_invalid STATUS_VAR);
4501         return 0;
4502     }
4503     aSign = extractFloat64Sign( a );
4504     bSign = extractFloat64Sign( b );
4505     av = float64_val(a);
4506     bv = float64_val(b);
4507     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4508     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4509
4510 }
4511
4512 /*----------------------------------------------------------------------------
4513 | Returns 1 if the double-precision floating-point value `a' is less than
4514 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4515 | raised if either operand is a NaN.  The comparison is performed according
4516 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4517 *----------------------------------------------------------------------------*/
4518
4519 int float64_lt(float64 a, float64 b, float_status *status)
4520 {
4521     flag aSign, bSign;
4522     uint64_t av, bv;
4523
4524     a = float64_squash_input_denormal(a STATUS_VAR);
4525     b = float64_squash_input_denormal(b STATUS_VAR);
4526     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4527          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4528        ) {
4529         float_raise( float_flag_invalid STATUS_VAR);
4530         return 0;
4531     }
4532     aSign = extractFloat64Sign( a );
4533     bSign = extractFloat64Sign( b );
4534     av = float64_val(a);
4535     bv = float64_val(b);
4536     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4537     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4538
4539 }
4540
4541 /*----------------------------------------------------------------------------
4542 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4543 | be compared, and 0 otherwise.  The invalid exception is raised if either
4544 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4545 | Standard for Binary Floating-Point Arithmetic.
4546 *----------------------------------------------------------------------------*/
4547
4548 int float64_unordered(float64 a, float64 b, float_status *status)
4549 {
4550     a = float64_squash_input_denormal(a STATUS_VAR);
4551     b = float64_squash_input_denormal(b STATUS_VAR);
4552
4553     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4554          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4555        ) {
4556         float_raise( float_flag_invalid STATUS_VAR);
4557         return 1;
4558     }
4559     return 0;
4560 }
4561
4562 /*----------------------------------------------------------------------------
4563 | Returns 1 if the double-precision floating-point value `a' is equal to the
4564 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4565 | exception.The comparison is performed according to the IEC/IEEE Standard
4566 | for Binary Floating-Point Arithmetic.
4567 *----------------------------------------------------------------------------*/
4568
4569 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4570 {
4571     uint64_t av, bv;
4572     a = float64_squash_input_denormal(a STATUS_VAR);
4573     b = float64_squash_input_denormal(b STATUS_VAR);
4574
4575     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4576          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4577        ) {
4578         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4579             float_raise( float_flag_invalid STATUS_VAR);
4580         }
4581         return 0;
4582     }
4583     av = float64_val(a);
4584     bv = float64_val(b);
4585     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4586
4587 }
4588
4589 /*----------------------------------------------------------------------------
4590 | Returns 1 if the double-precision floating-point value `a' is less than or
4591 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4592 | cause an exception.  Otherwise, the comparison is performed according to the
4593 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4594 *----------------------------------------------------------------------------*/
4595
4596 int float64_le_quiet(float64 a, float64 b, float_status *status)
4597 {
4598     flag aSign, bSign;
4599     uint64_t av, bv;
4600     a = float64_squash_input_denormal(a STATUS_VAR);
4601     b = float64_squash_input_denormal(b STATUS_VAR);
4602
4603     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4604          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4605        ) {
4606         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4607             float_raise( float_flag_invalid STATUS_VAR);
4608         }
4609         return 0;
4610     }
4611     aSign = extractFloat64Sign( a );
4612     bSign = extractFloat64Sign( b );
4613     av = float64_val(a);
4614     bv = float64_val(b);
4615     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4616     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4617
4618 }
4619
4620 /*----------------------------------------------------------------------------
4621 | Returns 1 if the double-precision floating-point value `a' is less than
4622 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4623 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4624 | Standard for Binary Floating-Point Arithmetic.
4625 *----------------------------------------------------------------------------*/
4626
4627 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4628 {
4629     flag aSign, bSign;
4630     uint64_t av, bv;
4631     a = float64_squash_input_denormal(a STATUS_VAR);
4632     b = float64_squash_input_denormal(b STATUS_VAR);
4633
4634     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4635          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4636        ) {
4637         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4638             float_raise( float_flag_invalid STATUS_VAR);
4639         }
4640         return 0;
4641     }
4642     aSign = extractFloat64Sign( a );
4643     bSign = extractFloat64Sign( b );
4644     av = float64_val(a);
4645     bv = float64_val(b);
4646     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4647     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4648
4649 }
4650
4651 /*----------------------------------------------------------------------------
4652 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4653 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4654 | comparison is performed according to the IEC/IEEE Standard for Binary
4655 | Floating-Point Arithmetic.
4656 *----------------------------------------------------------------------------*/
4657
4658 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4659 {
4660     a = float64_squash_input_denormal(a STATUS_VAR);
4661     b = float64_squash_input_denormal(b STATUS_VAR);
4662
4663     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4664          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4665        ) {
4666         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4667             float_raise( float_flag_invalid STATUS_VAR);
4668         }
4669         return 1;
4670     }
4671     return 0;
4672 }
4673
4674 /*----------------------------------------------------------------------------
4675 | Returns the result of converting the extended double-precision floating-
4676 | point value `a' to the 32-bit two's complement integer format.  The
4677 | conversion is performed according to the IEC/IEEE Standard for Binary
4678 | Floating-Point Arithmetic---which means in particular that the conversion
4679 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4680 | largest positive integer is returned.  Otherwise, if the conversion
4681 | overflows, the largest integer with the same sign as `a' is returned.
4682 *----------------------------------------------------------------------------*/
4683
4684 int32 floatx80_to_int32(floatx80 a, float_status *status)
4685 {
4686     flag aSign;
4687     int32 aExp, shiftCount;
4688     uint64_t aSig;
4689
4690     aSig = extractFloatx80Frac( a );
4691     aExp = extractFloatx80Exp( a );
4692     aSign = extractFloatx80Sign( a );
4693     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4694     shiftCount = 0x4037 - aExp;
4695     if ( shiftCount <= 0 ) shiftCount = 1;
4696     shift64RightJamming( aSig, shiftCount, &aSig );
4697     return roundAndPackInt32( aSign, aSig STATUS_VAR );
4698
4699 }
4700
4701 /*----------------------------------------------------------------------------
4702 | Returns the result of converting the extended double-precision floating-
4703 | point value `a' to the 32-bit two's complement integer format.  The
4704 | conversion is performed according to the IEC/IEEE Standard for Binary
4705 | Floating-Point Arithmetic, except that the conversion is always rounded
4706 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4707 | Otherwise, if the conversion overflows, the largest integer with the same
4708 | sign as `a' is returned.
4709 *----------------------------------------------------------------------------*/
4710
4711 int32 floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4712 {
4713     flag aSign;
4714     int32 aExp, shiftCount;
4715     uint64_t aSig, savedASig;
4716     int32_t z;
4717
4718     aSig = extractFloatx80Frac( a );
4719     aExp = extractFloatx80Exp( a );
4720     aSign = extractFloatx80Sign( a );
4721     if ( 0x401E < aExp ) {
4722         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4723         goto invalid;
4724     }
4725     else if ( aExp < 0x3FFF ) {
4726         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4727         return 0;
4728     }
4729     shiftCount = 0x403E - aExp;
4730     savedASig = aSig;
4731     aSig >>= shiftCount;
4732     z = aSig;
4733     if ( aSign ) z = - z;
4734     if ( ( z < 0 ) ^ aSign ) {
4735  invalid:
4736         float_raise( float_flag_invalid STATUS_VAR);
4737         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4738     }
4739     if ( ( aSig<<shiftCount ) != savedASig ) {
4740         STATUS(float_exception_flags) |= float_flag_inexact;
4741     }
4742     return z;
4743
4744 }
4745
4746 /*----------------------------------------------------------------------------
4747 | Returns the result of converting the extended double-precision floating-
4748 | point value `a' to the 64-bit two's complement integer format.  The
4749 | conversion is performed according to the IEC/IEEE Standard for Binary
4750 | Floating-Point Arithmetic---which means in particular that the conversion
4751 | is rounded according to the current rounding mode.  If `a' is a NaN,
4752 | the largest positive integer is returned.  Otherwise, if the conversion
4753 | overflows, the largest integer with the same sign as `a' is returned.
4754 *----------------------------------------------------------------------------*/
4755
4756 int64 floatx80_to_int64(floatx80 a, float_status *status)
4757 {
4758     flag aSign;
4759     int32 aExp, shiftCount;
4760     uint64_t aSig, aSigExtra;
4761
4762     aSig = extractFloatx80Frac( a );
4763     aExp = extractFloatx80Exp( a );
4764     aSign = extractFloatx80Sign( a );
4765     shiftCount = 0x403E - aExp;
4766     if ( shiftCount <= 0 ) {
4767         if ( shiftCount ) {
4768             float_raise( float_flag_invalid STATUS_VAR);
4769             if (    ! aSign
4770                  || (    ( aExp == 0x7FFF )
4771                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4772                ) {
4773                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4774             }
4775             return (int64_t) LIT64( 0x8000000000000000 );
4776         }
4777         aSigExtra = 0;
4778     }
4779     else {
4780         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4781     }
4782     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4783
4784 }
4785
4786 /*----------------------------------------------------------------------------
4787 | Returns the result of converting the extended double-precision floating-
4788 | point value `a' to the 64-bit two's complement integer format.  The
4789 | conversion is performed according to the IEC/IEEE Standard for Binary
4790 | Floating-Point Arithmetic, except that the conversion is always rounded
4791 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4792 | Otherwise, if the conversion overflows, the largest integer with the same
4793 | sign as `a' is returned.
4794 *----------------------------------------------------------------------------*/
4795
4796 int64 floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4797 {
4798     flag aSign;
4799     int32 aExp, shiftCount;
4800     uint64_t aSig;
4801     int64 z;
4802
4803     aSig = extractFloatx80Frac( a );
4804     aExp = extractFloatx80Exp( a );
4805     aSign = extractFloatx80Sign( a );
4806     shiftCount = aExp - 0x403E;
4807     if ( 0 <= shiftCount ) {
4808         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4809         if ( ( a.high != 0xC03E ) || aSig ) {
4810             float_raise( float_flag_invalid STATUS_VAR);
4811             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4812                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4813             }
4814         }
4815         return (int64_t) LIT64( 0x8000000000000000 );
4816     }
4817     else if ( aExp < 0x3FFF ) {
4818         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4819         return 0;
4820     }
4821     z = aSig>>( - shiftCount );
4822     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4823         STATUS(float_exception_flags) |= float_flag_inexact;
4824     }
4825     if ( aSign ) z = - z;
4826     return z;
4827
4828 }
4829
4830 /*----------------------------------------------------------------------------
4831 | Returns the result of converting the extended double-precision floating-
4832 | point value `a' to the single-precision floating-point format.  The
4833 | conversion is performed according to the IEC/IEEE Standard for Binary
4834 | Floating-Point Arithmetic.
4835 *----------------------------------------------------------------------------*/
4836
4837 float32 floatx80_to_float32(floatx80 a, float_status *status)
4838 {
4839     flag aSign;
4840     int32 aExp;
4841     uint64_t aSig;
4842
4843     aSig = extractFloatx80Frac( a );
4844     aExp = extractFloatx80Exp( a );
4845     aSign = extractFloatx80Sign( a );
4846     if ( aExp == 0x7FFF ) {
4847         if ( (uint64_t) ( aSig<<1 ) ) {
4848             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4849         }
4850         return packFloat32( aSign, 0xFF, 0 );
4851     }
4852     shift64RightJamming( aSig, 33, &aSig );
4853     if ( aExp || aSig ) aExp -= 0x3F81;
4854     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4855
4856 }
4857
4858 /*----------------------------------------------------------------------------
4859 | Returns the result of converting the extended double-precision floating-
4860 | point value `a' to the double-precision floating-point format.  The
4861 | conversion is performed according to the IEC/IEEE Standard for Binary
4862 | Floating-Point Arithmetic.
4863 *----------------------------------------------------------------------------*/
4864
4865 float64 floatx80_to_float64(floatx80 a, float_status *status)
4866 {
4867     flag aSign;
4868     int32 aExp;
4869     uint64_t aSig, zSig;
4870
4871     aSig = extractFloatx80Frac( a );
4872     aExp = extractFloatx80Exp( a );
4873     aSign = extractFloatx80Sign( a );
4874     if ( aExp == 0x7FFF ) {
4875         if ( (uint64_t) ( aSig<<1 ) ) {
4876             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4877         }
4878         return packFloat64( aSign, 0x7FF, 0 );
4879     }
4880     shift64RightJamming( aSig, 1, &zSig );
4881     if ( aExp || aSig ) aExp -= 0x3C01;
4882     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4883
4884 }
4885
4886 /*----------------------------------------------------------------------------
4887 | Returns the result of converting the extended double-precision floating-
4888 | point value `a' to the quadruple-precision floating-point format.  The
4889 | conversion is performed according to the IEC/IEEE Standard for Binary
4890 | Floating-Point Arithmetic.
4891 *----------------------------------------------------------------------------*/
4892
4893 float128 floatx80_to_float128(floatx80 a, float_status *status)
4894 {
4895     flag aSign;
4896     int_fast16_t aExp;
4897     uint64_t aSig, zSig0, zSig1;
4898
4899     aSig = extractFloatx80Frac( a );
4900     aExp = extractFloatx80Exp( a );
4901     aSign = extractFloatx80Sign( a );
4902     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4903         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4904     }
4905     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4906     return packFloat128( aSign, aExp, zSig0, zSig1 );
4907
4908 }
4909
4910 /*----------------------------------------------------------------------------
4911 | Rounds the extended double-precision floating-point value `a' to an integer,
4912 | and returns the result as an extended quadruple-precision floating-point
4913 | value.  The operation is performed according to the IEC/IEEE Standard for
4914 | Binary Floating-Point Arithmetic.
4915 *----------------------------------------------------------------------------*/
4916
4917 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4918 {
4919     flag aSign;
4920     int32 aExp;
4921     uint64_t lastBitMask, roundBitsMask;
4922     floatx80 z;
4923
4924     aExp = extractFloatx80Exp( a );
4925     if ( 0x403E <= aExp ) {
4926         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4927             return propagateFloatx80NaN( a, a STATUS_VAR );
4928         }
4929         return a;
4930     }
4931     if ( aExp < 0x3FFF ) {
4932         if (    ( aExp == 0 )
4933              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4934             return a;
4935         }
4936         STATUS(float_exception_flags) |= float_flag_inexact;
4937         aSign = extractFloatx80Sign( a );
4938         switch ( STATUS(float_rounding_mode) ) {
4939          case float_round_nearest_even:
4940             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4941                ) {
4942                 return
4943                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4944             }
4945             break;
4946         case float_round_ties_away:
4947             if (aExp == 0x3FFE) {
4948                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4949             }
4950             break;
4951          case float_round_down:
4952             return
4953                   aSign ?
4954                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4955                 : packFloatx80( 0, 0, 0 );
4956          case float_round_up:
4957             return
4958                   aSign ? packFloatx80( 1, 0, 0 )
4959                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4960         }
4961         return packFloatx80( aSign, 0, 0 );
4962     }
4963     lastBitMask = 1;
4964     lastBitMask <<= 0x403E - aExp;
4965     roundBitsMask = lastBitMask - 1;
4966     z = a;
4967     switch (STATUS(float_rounding_mode)) {
4968     case float_round_nearest_even:
4969         z.low += lastBitMask>>1;
4970         if ((z.low & roundBitsMask) == 0) {
4971             z.low &= ~lastBitMask;
4972         }
4973         break;
4974     case float_round_ties_away:
4975         z.low += lastBitMask >> 1;
4976         break;
4977     case float_round_to_zero:
4978         break;
4979     case float_round_up:
4980         if (!extractFloatx80Sign(z)) {
4981             z.low += roundBitsMask;
4982         }
4983         break;
4984     case float_round_down:
4985         if (extractFloatx80Sign(z)) {
4986             z.low += roundBitsMask;
4987         }
4988         break;
4989     default:
4990         abort();
4991     }
4992     z.low &= ~ roundBitsMask;
4993     if ( z.low == 0 ) {
4994         ++z.high;
4995         z.low = LIT64( 0x8000000000000000 );
4996     }
4997     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4998     return z;
4999
5000 }
5001
5002 /*----------------------------------------------------------------------------
5003 | Returns the result of adding the absolute values of the extended double-
5004 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5005 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5006 | The addition is performed according to the IEC/IEEE Standard for Binary
5007 | Floating-Point Arithmetic.
5008 *----------------------------------------------------------------------------*/
5009
5010 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5011                                 float_status *status)
5012 {
5013     int32 aExp, bExp, zExp;
5014     uint64_t aSig, bSig, zSig0, zSig1;
5015     int32 expDiff;
5016
5017     aSig = extractFloatx80Frac( a );
5018     aExp = extractFloatx80Exp( a );
5019     bSig = extractFloatx80Frac( b );
5020     bExp = extractFloatx80Exp( b );
5021     expDiff = aExp - bExp;
5022     if ( 0 < expDiff ) {
5023         if ( aExp == 0x7FFF ) {
5024             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5025             return a;
5026         }
5027         if ( bExp == 0 ) --expDiff;
5028         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5029         zExp = aExp;
5030     }
5031     else if ( expDiff < 0 ) {
5032         if ( bExp == 0x7FFF ) {
5033             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5034             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5035         }
5036         if ( aExp == 0 ) ++expDiff;
5037         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5038         zExp = bExp;
5039     }
5040     else {
5041         if ( aExp == 0x7FFF ) {
5042             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5043                 return propagateFloatx80NaN( a, b STATUS_VAR );
5044             }
5045             return a;
5046         }
5047         zSig1 = 0;
5048         zSig0 = aSig + bSig;
5049         if ( aExp == 0 ) {
5050             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5051             goto roundAndPack;
5052         }
5053         zExp = aExp;
5054         goto shiftRight1;
5055     }
5056     zSig0 = aSig + bSig;
5057     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5058  shiftRight1:
5059     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5060     zSig0 |= LIT64( 0x8000000000000000 );
5061     ++zExp;
5062  roundAndPack:
5063     return
5064         roundAndPackFloatx80(
5065             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5066
5067 }
5068
5069 /*----------------------------------------------------------------------------
5070 | Returns the result of subtracting the absolute values of the extended
5071 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5072 | difference is negated before being returned.  `zSign' is ignored if the
5073 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5074 | Standard for Binary Floating-Point Arithmetic.
5075 *----------------------------------------------------------------------------*/
5076
5077 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5078                                 float_status *status)
5079 {
5080     int32 aExp, bExp, zExp;
5081     uint64_t aSig, bSig, zSig0, zSig1;
5082     int32 expDiff;
5083     floatx80 z;
5084
5085     aSig = extractFloatx80Frac( a );
5086     aExp = extractFloatx80Exp( a );
5087     bSig = extractFloatx80Frac( b );
5088     bExp = extractFloatx80Exp( b );
5089     expDiff = aExp - bExp;
5090     if ( 0 < expDiff ) goto aExpBigger;
5091     if ( expDiff < 0 ) goto bExpBigger;
5092     if ( aExp == 0x7FFF ) {
5093         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5094             return propagateFloatx80NaN( a, b STATUS_VAR );
5095         }
5096         float_raise( float_flag_invalid STATUS_VAR);
5097         z.low = floatx80_default_nan_low;
5098         z.high = floatx80_default_nan_high;
5099         return z;
5100     }
5101     if ( aExp == 0 ) {
5102         aExp = 1;
5103         bExp = 1;
5104     }
5105     zSig1 = 0;
5106     if ( bSig < aSig ) goto aBigger;
5107     if ( aSig < bSig ) goto bBigger;
5108     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
5109  bExpBigger:
5110     if ( bExp == 0x7FFF ) {
5111         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5112         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5113     }
5114     if ( aExp == 0 ) ++expDiff;
5115     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5116  bBigger:
5117     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5118     zExp = bExp;
5119     zSign ^= 1;
5120     goto normalizeRoundAndPack;
5121  aExpBigger:
5122     if ( aExp == 0x7FFF ) {
5123         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5124         return a;
5125     }
5126     if ( bExp == 0 ) --expDiff;
5127     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5128  aBigger:
5129     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5130     zExp = aExp;
5131  normalizeRoundAndPack:
5132     return
5133         normalizeRoundAndPackFloatx80(
5134             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5135
5136 }
5137
5138 /*----------------------------------------------------------------------------
5139 | Returns the result of adding the extended double-precision floating-point
5140 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5141 | Standard for Binary Floating-Point Arithmetic.
5142 *----------------------------------------------------------------------------*/
5143
5144 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5145 {
5146     flag aSign, bSign;
5147
5148     aSign = extractFloatx80Sign( a );
5149     bSign = extractFloatx80Sign( b );
5150     if ( aSign == bSign ) {
5151         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5152     }
5153     else {
5154         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5155     }
5156
5157 }
5158
5159 /*----------------------------------------------------------------------------
5160 | Returns the result of subtracting the extended double-precision floating-
5161 | point values `a' and `b'.  The operation is performed according to the
5162 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5163 *----------------------------------------------------------------------------*/
5164
5165 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5166 {
5167     flag aSign, bSign;
5168
5169     aSign = extractFloatx80Sign( a );
5170     bSign = extractFloatx80Sign( b );
5171     if ( aSign == bSign ) {
5172         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5173     }
5174     else {
5175         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5176     }
5177
5178 }
5179
5180 /*----------------------------------------------------------------------------
5181 | Returns the result of multiplying the extended double-precision floating-
5182 | point values `a' and `b'.  The operation is performed according to the
5183 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5184 *----------------------------------------------------------------------------*/
5185
5186 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5187 {
5188     flag aSign, bSign, zSign;
5189     int32 aExp, bExp, zExp;
5190     uint64_t aSig, bSig, zSig0, zSig1;
5191     floatx80 z;
5192
5193     aSig = extractFloatx80Frac( a );
5194     aExp = extractFloatx80Exp( a );
5195     aSign = extractFloatx80Sign( a );
5196     bSig = extractFloatx80Frac( b );
5197     bExp = extractFloatx80Exp( b );
5198     bSign = extractFloatx80Sign( b );
5199     zSign = aSign ^ bSign;
5200     if ( aExp == 0x7FFF ) {
5201         if (    (uint64_t) ( aSig<<1 )
5202              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5203             return propagateFloatx80NaN( a, b STATUS_VAR );
5204         }
5205         if ( ( bExp | bSig ) == 0 ) goto invalid;
5206         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5207     }
5208     if ( bExp == 0x7FFF ) {
5209         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5210         if ( ( aExp | aSig ) == 0 ) {
5211  invalid:
5212             float_raise( float_flag_invalid STATUS_VAR);
5213             z.low = floatx80_default_nan_low;
5214             z.high = floatx80_default_nan_high;
5215             return z;
5216         }
5217         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5218     }
5219     if ( aExp == 0 ) {
5220         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5221         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5222     }
5223     if ( bExp == 0 ) {
5224         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5225         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5226     }
5227     zExp = aExp + bExp - 0x3FFE;
5228     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5229     if ( 0 < (int64_t) zSig0 ) {
5230         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5231         --zExp;
5232     }
5233     return
5234         roundAndPackFloatx80(
5235             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5236
5237 }
5238
5239 /*----------------------------------------------------------------------------
5240 | Returns the result of dividing the extended double-precision floating-point
5241 | value `a' by the corresponding value `b'.  The operation is performed
5242 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5243 *----------------------------------------------------------------------------*/
5244
5245 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5246 {
5247     flag aSign, bSign, zSign;
5248     int32 aExp, bExp, zExp;
5249     uint64_t aSig, bSig, zSig0, zSig1;
5250     uint64_t rem0, rem1, rem2, term0, term1, term2;
5251     floatx80 z;
5252
5253     aSig = extractFloatx80Frac( a );
5254     aExp = extractFloatx80Exp( a );
5255     aSign = extractFloatx80Sign( a );
5256     bSig = extractFloatx80Frac( b );
5257     bExp = extractFloatx80Exp( b );
5258     bSign = extractFloatx80Sign( b );
5259     zSign = aSign ^ bSign;
5260     if ( aExp == 0x7FFF ) {
5261         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5262         if ( bExp == 0x7FFF ) {
5263             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5264             goto invalid;
5265         }
5266         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5267     }
5268     if ( bExp == 0x7FFF ) {
5269         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5270         return packFloatx80( zSign, 0, 0 );
5271     }
5272     if ( bExp == 0 ) {
5273         if ( bSig == 0 ) {
5274             if ( ( aExp | aSig ) == 0 ) {
5275  invalid:
5276                 float_raise( float_flag_invalid STATUS_VAR);
5277                 z.low = floatx80_default_nan_low;
5278                 z.high = floatx80_default_nan_high;
5279                 return z;
5280             }
5281             float_raise( float_flag_divbyzero STATUS_VAR);
5282             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5283         }
5284         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5285     }
5286     if ( aExp == 0 ) {
5287         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5288         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5289     }
5290     zExp = aExp - bExp + 0x3FFE;
5291     rem1 = 0;
5292     if ( bSig <= aSig ) {
5293         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5294         ++zExp;
5295     }
5296     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5297     mul64To128( bSig, zSig0, &term0, &term1 );
5298     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5299     while ( (int64_t) rem0 < 0 ) {
5300         --zSig0;
5301         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5302     }
5303     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5304     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5305         mul64To128( bSig, zSig1, &term1, &term2 );
5306         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5307         while ( (int64_t) rem1 < 0 ) {
5308             --zSig1;
5309             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5310         }
5311         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5312     }
5313     return
5314         roundAndPackFloatx80(
5315             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5316
5317 }
5318
5319 /*----------------------------------------------------------------------------
5320 | Returns the remainder of the extended double-precision floating-point value
5321 | `a' with respect to the corresponding value `b'.  The operation is performed
5322 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5323 *----------------------------------------------------------------------------*/
5324
5325 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5326 {
5327     flag aSign, zSign;
5328     int32 aExp, bExp, expDiff;
5329     uint64_t aSig0, aSig1, bSig;
5330     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5331     floatx80 z;
5332
5333     aSig0 = extractFloatx80Frac( a );
5334     aExp = extractFloatx80Exp( a );
5335     aSign = extractFloatx80Sign( a );
5336     bSig = extractFloatx80Frac( b );
5337     bExp = extractFloatx80Exp( b );
5338     if ( aExp == 0x7FFF ) {
5339         if (    (uint64_t) ( aSig0<<1 )
5340              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5341             return propagateFloatx80NaN( a, b STATUS_VAR );
5342         }
5343         goto invalid;
5344     }
5345     if ( bExp == 0x7FFF ) {
5346         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5347         return a;
5348     }
5349     if ( bExp == 0 ) {
5350         if ( bSig == 0 ) {
5351  invalid:
5352             float_raise( float_flag_invalid STATUS_VAR);
5353             z.low = floatx80_default_nan_low;
5354             z.high = floatx80_default_nan_high;
5355             return z;
5356         }
5357         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5358     }
5359     if ( aExp == 0 ) {
5360         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5361         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5362     }
5363     bSig |= LIT64( 0x8000000000000000 );
5364     zSign = aSign;
5365     expDiff = aExp - bExp;
5366     aSig1 = 0;
5367     if ( expDiff < 0 ) {
5368         if ( expDiff < -1 ) return a;
5369         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5370         expDiff = 0;
5371     }
5372     q = ( bSig <= aSig0 );
5373     if ( q ) aSig0 -= bSig;
5374     expDiff -= 64;
5375     while ( 0 < expDiff ) {
5376         q = estimateDiv128To64( aSig0, aSig1, bSig );
5377         q = ( 2 < q ) ? q - 2 : 0;
5378         mul64To128( bSig, q, &term0, &term1 );
5379         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5380         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5381         expDiff -= 62;
5382     }
5383     expDiff += 64;
5384     if ( 0 < expDiff ) {
5385         q = estimateDiv128To64( aSig0, aSig1, bSig );
5386         q = ( 2 < q ) ? q - 2 : 0;
5387         q >>= 64 - expDiff;
5388         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5389         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5390         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5391         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5392             ++q;
5393             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5394         }
5395     }
5396     else {
5397         term1 = 0;
5398         term0 = bSig;
5399     }
5400     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5401     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5402          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5403               && ( q & 1 ) )
5404        ) {
5405         aSig0 = alternateASig0;
5406         aSig1 = alternateASig1;
5407         zSign = ! zSign;
5408     }
5409     return
5410         normalizeRoundAndPackFloatx80(
5411             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5412
5413 }
5414
5415 /*----------------------------------------------------------------------------
5416 | Returns the square root of the extended double-precision floating-point
5417 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5418 | for Binary Floating-Point Arithmetic.
5419 *----------------------------------------------------------------------------*/
5420
5421 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5422 {
5423     flag aSign;
5424     int32 aExp, zExp;
5425     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5426     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5427     floatx80 z;
5428
5429     aSig0 = extractFloatx80Frac( a );
5430     aExp = extractFloatx80Exp( a );
5431     aSign = extractFloatx80Sign( a );
5432     if ( aExp == 0x7FFF ) {
5433         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5434         if ( ! aSign ) return a;
5435         goto invalid;
5436     }
5437     if ( aSign ) {
5438         if ( ( aExp | aSig0 ) == 0 ) return a;
5439  invalid:
5440         float_raise( float_flag_invalid STATUS_VAR);
5441         z.low = floatx80_default_nan_low;
5442         z.high = floatx80_default_nan_high;
5443         return z;
5444     }
5445     if ( aExp == 0 ) {
5446         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5447         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5448     }
5449     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5450     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5451     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5452     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5453     doubleZSig0 = zSig0<<1;
5454     mul64To128( zSig0, zSig0, &term0, &term1 );
5455     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5456     while ( (int64_t) rem0 < 0 ) {
5457         --zSig0;
5458         doubleZSig0 -= 2;
5459         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5460     }
5461     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5462     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5463         if ( zSig1 == 0 ) zSig1 = 1;
5464         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5465         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5466         mul64To128( zSig1, zSig1, &term2, &term3 );
5467         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5468         while ( (int64_t) rem1 < 0 ) {
5469             --zSig1;
5470             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5471             term3 |= 1;
5472             term2 |= doubleZSig0;
5473             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5474         }
5475         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5476     }
5477     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5478     zSig0 |= doubleZSig0;
5479     return
5480         roundAndPackFloatx80(
5481             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5482
5483 }
5484
5485 /*----------------------------------------------------------------------------
5486 | Returns 1 if the extended double-precision floating-point value `a' is equal
5487 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5488 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5489 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5490 *----------------------------------------------------------------------------*/
5491
5492 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5493 {
5494
5495     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5496               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5497          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5498               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5499        ) {
5500         float_raise( float_flag_invalid STATUS_VAR);
5501         return 0;
5502     }
5503     return
5504            ( a.low == b.low )
5505         && (    ( a.high == b.high )
5506              || (    ( a.low == 0 )
5507                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5508            );
5509
5510 }
5511
5512 /*----------------------------------------------------------------------------
5513 | Returns 1 if the extended double-precision floating-point value `a' is
5514 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5515 | invalid exception is raised if either operand is a NaN.  The comparison is
5516 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5517 | Arithmetic.
5518 *----------------------------------------------------------------------------*/
5519
5520 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5521 {
5522     flag aSign, bSign;
5523
5524     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5525               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5526          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5527               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5528        ) {
5529         float_raise( float_flag_invalid STATUS_VAR);
5530         return 0;
5531     }
5532     aSign = extractFloatx80Sign( a );
5533     bSign = extractFloatx80Sign( b );
5534     if ( aSign != bSign ) {
5535         return
5536                aSign
5537             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5538                  == 0 );
5539     }
5540     return
5541           aSign ? le128( b.high, b.low, a.high, a.low )
5542         : le128( a.high, a.low, b.high, b.low );
5543
5544 }
5545
5546 /*----------------------------------------------------------------------------
5547 | Returns 1 if the extended double-precision floating-point value `a' is
5548 | less than the corresponding value `b', and 0 otherwise.  The invalid
5549 | exception is raised if either operand is a NaN.  The comparison is performed
5550 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5551 *----------------------------------------------------------------------------*/
5552
5553 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5554 {
5555     flag aSign, bSign;
5556
5557     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5558               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5559          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5560               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5561        ) {
5562         float_raise( float_flag_invalid STATUS_VAR);
5563         return 0;
5564     }
5565     aSign = extractFloatx80Sign( a );
5566     bSign = extractFloatx80Sign( b );
5567     if ( aSign != bSign ) {
5568         return
5569                aSign
5570             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5571                  != 0 );
5572     }
5573     return
5574           aSign ? lt128( b.high, b.low, a.high, a.low )
5575         : lt128( a.high, a.low, b.high, b.low );
5576
5577 }
5578
5579 /*----------------------------------------------------------------------------
5580 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5581 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5582 | either operand is a NaN.   The comparison is performed according to the
5583 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5584 *----------------------------------------------------------------------------*/
5585 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5586 {
5587     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5588               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5589          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5590               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5591        ) {
5592         float_raise( float_flag_invalid STATUS_VAR);
5593         return 1;
5594     }
5595     return 0;
5596 }
5597
5598 /*----------------------------------------------------------------------------
5599 | Returns 1 if the extended double-precision floating-point value `a' is
5600 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5601 | cause an exception.  The comparison is performed according to the IEC/IEEE
5602 | Standard for Binary Floating-Point Arithmetic.
5603 *----------------------------------------------------------------------------*/
5604
5605 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5606 {
5607
5608     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5609               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5610          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5611               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5612        ) {
5613         if (    floatx80_is_signaling_nan( a )
5614              || floatx80_is_signaling_nan( b ) ) {
5615             float_raise( float_flag_invalid STATUS_VAR);
5616         }
5617         return 0;
5618     }
5619     return
5620            ( a.low == b.low )
5621         && (    ( a.high == b.high )
5622              || (    ( a.low == 0 )
5623                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5624            );
5625
5626 }
5627
5628 /*----------------------------------------------------------------------------
5629 | Returns 1 if the extended double-precision floating-point value `a' is less
5630 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5631 | do not cause an exception.  Otherwise, the comparison is performed according
5632 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5633 *----------------------------------------------------------------------------*/
5634
5635 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5636 {
5637     flag aSign, bSign;
5638
5639     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5640               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5641          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5642               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5643        ) {
5644         if (    floatx80_is_signaling_nan( a )
5645              || floatx80_is_signaling_nan( b ) ) {
5646             float_raise( float_flag_invalid STATUS_VAR);
5647         }
5648         return 0;
5649     }
5650     aSign = extractFloatx80Sign( a );
5651     bSign = extractFloatx80Sign( b );
5652     if ( aSign != bSign ) {
5653         return
5654                aSign
5655             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5656                  == 0 );
5657     }
5658     return
5659           aSign ? le128( b.high, b.low, a.high, a.low )
5660         : le128( a.high, a.low, b.high, b.low );
5661
5662 }
5663
5664 /*----------------------------------------------------------------------------
5665 | Returns 1 if the extended double-precision floating-point value `a' is less
5666 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5667 | an exception.  Otherwise, the comparison is performed according to the
5668 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5669 *----------------------------------------------------------------------------*/
5670
5671 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5672 {
5673     flag aSign, bSign;
5674
5675     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5676               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5677          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5678               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5679        ) {
5680         if (    floatx80_is_signaling_nan( a )
5681              || floatx80_is_signaling_nan( b ) ) {
5682             float_raise( float_flag_invalid STATUS_VAR);
5683         }
5684         return 0;
5685     }
5686     aSign = extractFloatx80Sign( a );
5687     bSign = extractFloatx80Sign( b );
5688     if ( aSign != bSign ) {
5689         return
5690                aSign
5691             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5692                  != 0 );
5693     }
5694     return
5695           aSign ? lt128( b.high, b.low, a.high, a.low )
5696         : lt128( a.high, a.low, b.high, b.low );
5697
5698 }
5699
5700 /*----------------------------------------------------------------------------
5701 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5702 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5703 | The comparison is performed according to the IEC/IEEE Standard for Binary
5704 | Floating-Point Arithmetic.
5705 *----------------------------------------------------------------------------*/
5706 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5707 {
5708     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5709               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5710          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5711               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5712        ) {
5713         if (    floatx80_is_signaling_nan( a )
5714              || floatx80_is_signaling_nan( b ) ) {
5715             float_raise( float_flag_invalid STATUS_VAR);
5716         }
5717         return 1;
5718     }
5719     return 0;
5720 }
5721
5722 /*----------------------------------------------------------------------------
5723 | Returns the result of converting the quadruple-precision floating-point
5724 | value `a' to the 32-bit two's complement integer format.  The conversion
5725 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5726 | Arithmetic---which means in particular that the conversion is rounded
5727 | according to the current rounding mode.  If `a' is a NaN, the largest
5728 | positive integer is returned.  Otherwise, if the conversion overflows, the
5729 | largest integer with the same sign as `a' is returned.
5730 *----------------------------------------------------------------------------*/
5731
5732 int32 float128_to_int32(float128 a, float_status *status)
5733 {
5734     flag aSign;
5735     int32 aExp, shiftCount;
5736     uint64_t aSig0, aSig1;
5737
5738     aSig1 = extractFloat128Frac1( a );
5739     aSig0 = extractFloat128Frac0( a );
5740     aExp = extractFloat128Exp( a );
5741     aSign = extractFloat128Sign( a );
5742     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5743     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5744     aSig0 |= ( aSig1 != 0 );
5745     shiftCount = 0x4028 - aExp;
5746     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5747     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5748
5749 }
5750
5751 /*----------------------------------------------------------------------------
5752 | Returns the result of converting the quadruple-precision floating-point
5753 | value `a' to the 32-bit two's complement integer format.  The conversion
5754 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5755 | Arithmetic, except that the conversion is always rounded toward zero.  If
5756 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5757 | conversion overflows, the largest integer with the same sign as `a' is
5758 | returned.
5759 *----------------------------------------------------------------------------*/
5760
5761 int32 float128_to_int32_round_to_zero(float128 a, float_status *status)
5762 {
5763     flag aSign;
5764     int32 aExp, shiftCount;
5765     uint64_t aSig0, aSig1, savedASig;
5766     int32_t z;
5767
5768     aSig1 = extractFloat128Frac1( a );
5769     aSig0 = extractFloat128Frac0( a );
5770     aExp = extractFloat128Exp( a );
5771     aSign = extractFloat128Sign( a );
5772     aSig0 |= ( aSig1 != 0 );
5773     if ( 0x401E < aExp ) {
5774         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5775         goto invalid;
5776     }
5777     else if ( aExp < 0x3FFF ) {
5778         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5779         return 0;
5780     }
5781     aSig0 |= LIT64( 0x0001000000000000 );
5782     shiftCount = 0x402F - aExp;
5783     savedASig = aSig0;
5784     aSig0 >>= shiftCount;
5785     z = aSig0;
5786     if ( aSign ) z = - z;
5787     if ( ( z < 0 ) ^ aSign ) {
5788  invalid:
5789         float_raise( float_flag_invalid STATUS_VAR);
5790         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5791     }
5792     if ( ( aSig0<<shiftCount ) != savedASig ) {
5793         STATUS(float_exception_flags) |= float_flag_inexact;
5794     }
5795     return z;
5796
5797 }
5798
5799 /*----------------------------------------------------------------------------
5800 | Returns the result of converting the quadruple-precision floating-point
5801 | value `a' to the 64-bit two's complement integer format.  The conversion
5802 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5803 | Arithmetic---which means in particular that the conversion is rounded
5804 | according to the current rounding mode.  If `a' is a NaN, the largest
5805 | positive integer is returned.  Otherwise, if the conversion overflows, the
5806 | largest integer with the same sign as `a' is returned.
5807 *----------------------------------------------------------------------------*/
5808
5809 int64 float128_to_int64(float128 a, float_status *status)
5810 {
5811     flag aSign;
5812     int32 aExp, shiftCount;
5813     uint64_t aSig0, aSig1;
5814
5815     aSig1 = extractFloat128Frac1( a );
5816     aSig0 = extractFloat128Frac0( a );
5817     aExp = extractFloat128Exp( a );
5818     aSign = extractFloat128Sign( a );
5819     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5820     shiftCount = 0x402F - aExp;
5821     if ( shiftCount <= 0 ) {
5822         if ( 0x403E < aExp ) {
5823             float_raise( float_flag_invalid STATUS_VAR);
5824             if (    ! aSign
5825                  || (    ( aExp == 0x7FFF )
5826                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5827                     )
5828                ) {
5829                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5830             }
5831             return (int64_t) LIT64( 0x8000000000000000 );
5832         }
5833         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5834     }
5835     else {
5836         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5837     }
5838     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5839
5840 }
5841
5842 /*----------------------------------------------------------------------------
5843 | Returns the result of converting the quadruple-precision floating-point
5844 | value `a' to the 64-bit two's complement integer format.  The conversion
5845 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5846 | Arithmetic, except that the conversion is always rounded toward zero.
5847 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5848 | the conversion overflows, the largest integer with the same sign as `a' is
5849 | returned.
5850 *----------------------------------------------------------------------------*/
5851
5852 int64 float128_to_int64_round_to_zero(float128 a, float_status *status)
5853 {
5854     flag aSign;
5855     int32 aExp, shiftCount;
5856     uint64_t aSig0, aSig1;
5857     int64 z;
5858
5859     aSig1 = extractFloat128Frac1( a );
5860     aSig0 = extractFloat128Frac0( a );
5861     aExp = extractFloat128Exp( a );
5862     aSign = extractFloat128Sign( a );
5863     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5864     shiftCount = aExp - 0x402F;
5865     if ( 0 < shiftCount ) {
5866         if ( 0x403E <= aExp ) {
5867             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5868             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5869                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5870                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5871             }
5872             else {
5873                 float_raise( float_flag_invalid STATUS_VAR);
5874                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5875                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5876                 }
5877             }
5878             return (int64_t) LIT64( 0x8000000000000000 );
5879         }
5880         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5881         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5882             STATUS(float_exception_flags) |= float_flag_inexact;
5883         }
5884     }
5885     else {
5886         if ( aExp < 0x3FFF ) {
5887             if ( aExp | aSig0 | aSig1 ) {
5888                 STATUS(float_exception_flags) |= float_flag_inexact;
5889             }
5890             return 0;
5891         }
5892         z = aSig0>>( - shiftCount );
5893         if (    aSig1
5894              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5895             STATUS(float_exception_flags) |= float_flag_inexact;
5896         }
5897     }
5898     if ( aSign ) z = - z;
5899     return z;
5900
5901 }
5902
5903 /*----------------------------------------------------------------------------
5904 | Returns the result of converting the quadruple-precision floating-point
5905 | value `a' to the single-precision floating-point format.  The conversion
5906 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5907 | Arithmetic.
5908 *----------------------------------------------------------------------------*/
5909
5910 float32 float128_to_float32(float128 a, float_status *status)
5911 {
5912     flag aSign;
5913     int32 aExp;
5914     uint64_t aSig0, aSig1;
5915     uint32_t zSig;
5916
5917     aSig1 = extractFloat128Frac1( a );
5918     aSig0 = extractFloat128Frac0( a );
5919     aExp = extractFloat128Exp( a );
5920     aSign = extractFloat128Sign( a );
5921     if ( aExp == 0x7FFF ) {
5922         if ( aSig0 | aSig1 ) {
5923             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5924         }
5925         return packFloat32( aSign, 0xFF, 0 );
5926     }
5927     aSig0 |= ( aSig1 != 0 );
5928     shift64RightJamming( aSig0, 18, &aSig0 );
5929     zSig = aSig0;
5930     if ( aExp || zSig ) {
5931         zSig |= 0x40000000;
5932         aExp -= 0x3F81;
5933     }
5934     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5935
5936 }
5937
5938 /*----------------------------------------------------------------------------
5939 | Returns the result of converting the quadruple-precision floating-point
5940 | value `a' to the double-precision floating-point format.  The conversion
5941 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5942 | Arithmetic.
5943 *----------------------------------------------------------------------------*/
5944
5945 float64 float128_to_float64(float128 a, float_status *status)
5946 {
5947     flag aSign;
5948     int32 aExp;
5949     uint64_t aSig0, aSig1;
5950
5951     aSig1 = extractFloat128Frac1( a );
5952     aSig0 = extractFloat128Frac0( a );
5953     aExp = extractFloat128Exp( a );
5954     aSign = extractFloat128Sign( a );
5955     if ( aExp == 0x7FFF ) {
5956         if ( aSig0 | aSig1 ) {
5957             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5958         }
5959         return packFloat64( aSign, 0x7FF, 0 );
5960     }
5961     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5962     aSig0 |= ( aSig1 != 0 );
5963     if ( aExp || aSig0 ) {
5964         aSig0 |= LIT64( 0x4000000000000000 );
5965         aExp -= 0x3C01;
5966     }
5967     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5968
5969 }
5970
5971 /*----------------------------------------------------------------------------
5972 | Returns the result of converting the quadruple-precision floating-point
5973 | value `a' to the extended double-precision floating-point format.  The
5974 | conversion is performed according to the IEC/IEEE Standard for Binary
5975 | Floating-Point Arithmetic.
5976 *----------------------------------------------------------------------------*/
5977
5978 floatx80 float128_to_floatx80(float128 a, float_status *status)
5979 {
5980     flag aSign;
5981     int32 aExp;
5982     uint64_t aSig0, aSig1;
5983
5984     aSig1 = extractFloat128Frac1( a );
5985     aSig0 = extractFloat128Frac0( a );
5986     aExp = extractFloat128Exp( a );
5987     aSign = extractFloat128Sign( a );
5988     if ( aExp == 0x7FFF ) {
5989         if ( aSig0 | aSig1 ) {
5990             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5991         }
5992         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5993     }
5994     if ( aExp == 0 ) {
5995         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5996         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5997     }
5998     else {
5999         aSig0 |= LIT64( 0x0001000000000000 );
6000     }
6001     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6002     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
6003
6004 }
6005
6006 /*----------------------------------------------------------------------------
6007 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6008 | returns the result as a quadruple-precision floating-point value.  The
6009 | operation is performed according to the IEC/IEEE Standard for Binary
6010 | Floating-Point Arithmetic.
6011 *----------------------------------------------------------------------------*/
6012
6013 float128 float128_round_to_int(float128 a, float_status *status)
6014 {
6015     flag aSign;
6016     int32 aExp;
6017     uint64_t lastBitMask, roundBitsMask;
6018     float128 z;
6019
6020     aExp = extractFloat128Exp( a );
6021     if ( 0x402F <= aExp ) {
6022         if ( 0x406F <= aExp ) {
6023             if (    ( aExp == 0x7FFF )
6024                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6025                ) {
6026                 return propagateFloat128NaN( a, a STATUS_VAR );
6027             }
6028             return a;
6029         }
6030         lastBitMask = 1;
6031         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6032         roundBitsMask = lastBitMask - 1;
6033         z = a;
6034         switch (STATUS(float_rounding_mode)) {
6035         case float_round_nearest_even:
6036             if ( lastBitMask ) {
6037                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6038                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6039             }
6040             else {
6041                 if ( (int64_t) z.low < 0 ) {
6042                     ++z.high;
6043                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6044                 }
6045             }
6046             break;
6047         case float_round_ties_away:
6048             if (lastBitMask) {
6049                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6050             } else {
6051                 if ((int64_t) z.low < 0) {
6052                     ++z.high;
6053                 }
6054             }
6055             break;
6056         case float_round_to_zero:
6057             break;
6058         case float_round_up:
6059             if (!extractFloat128Sign(z)) {
6060                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6061             }
6062             break;
6063         case float_round_down:
6064             if (extractFloat128Sign(z)) {
6065                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6066             }
6067             break;
6068         default:
6069             abort();
6070         }
6071         z.low &= ~ roundBitsMask;
6072     }
6073     else {
6074         if ( aExp < 0x3FFF ) {
6075             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6076             STATUS(float_exception_flags) |= float_flag_inexact;
6077             aSign = extractFloat128Sign( a );
6078             switch ( STATUS(float_rounding_mode) ) {
6079              case float_round_nearest_even:
6080                 if (    ( aExp == 0x3FFE )
6081                      && (   extractFloat128Frac0( a )
6082                           | extractFloat128Frac1( a ) )
6083                    ) {
6084                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6085                 }
6086                 break;
6087             case float_round_ties_away:
6088                 if (aExp == 0x3FFE) {
6089                     return packFloat128(aSign, 0x3FFF, 0, 0);
6090                 }
6091                 break;
6092              case float_round_down:
6093                 return
6094                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6095                     : packFloat128( 0, 0, 0, 0 );
6096              case float_round_up:
6097                 return
6098                       aSign ? packFloat128( 1, 0, 0, 0 )
6099                     : packFloat128( 0, 0x3FFF, 0, 0 );
6100             }
6101             return packFloat128( aSign, 0, 0, 0 );
6102         }
6103         lastBitMask = 1;
6104         lastBitMask <<= 0x402F - aExp;
6105         roundBitsMask = lastBitMask - 1;
6106         z.low = 0;
6107         z.high = a.high;
6108         switch (STATUS(float_rounding_mode)) {
6109         case float_round_nearest_even:
6110             z.high += lastBitMask>>1;
6111             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6112                 z.high &= ~ lastBitMask;
6113             }
6114             break;
6115         case float_round_ties_away:
6116             z.high += lastBitMask>>1;
6117             break;
6118         case float_round_to_zero:
6119             break;
6120         case float_round_up:
6121             if (!extractFloat128Sign(z)) {
6122                 z.high |= ( a.low != 0 );
6123                 z.high += roundBitsMask;
6124             }
6125             break;
6126         case float_round_down:
6127             if (extractFloat128Sign(z)) {
6128                 z.high |= (a.low != 0);
6129                 z.high += roundBitsMask;
6130             }
6131             break;
6132         default:
6133             abort();
6134         }
6135         z.high &= ~ roundBitsMask;
6136     }
6137     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6138         STATUS(float_exception_flags) |= float_flag_inexact;
6139     }
6140     return z;
6141
6142 }
6143
6144 /*----------------------------------------------------------------------------
6145 | Returns the result of adding the absolute values of the quadruple-precision
6146 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6147 | before being returned.  `zSign' is ignored if the result is a NaN.
6148 | The addition is performed according to the IEC/IEEE Standard for Binary
6149 | Floating-Point Arithmetic.
6150 *----------------------------------------------------------------------------*/
6151
6152 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6153                                 float_status *status)
6154 {
6155     int32 aExp, bExp, zExp;
6156     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6157     int32 expDiff;
6158
6159     aSig1 = extractFloat128Frac1( a );
6160     aSig0 = extractFloat128Frac0( a );
6161     aExp = extractFloat128Exp( a );
6162     bSig1 = extractFloat128Frac1( b );
6163     bSig0 = extractFloat128Frac0( b );
6164     bExp = extractFloat128Exp( b );
6165     expDiff = aExp - bExp;
6166     if ( 0 < expDiff ) {
6167         if ( aExp == 0x7FFF ) {
6168             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6169             return a;
6170         }
6171         if ( bExp == 0 ) {
6172             --expDiff;
6173         }
6174         else {
6175             bSig0 |= LIT64( 0x0001000000000000 );
6176         }
6177         shift128ExtraRightJamming(
6178             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6179         zExp = aExp;
6180     }
6181     else if ( expDiff < 0 ) {
6182         if ( bExp == 0x7FFF ) {
6183             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6184             return packFloat128( zSign, 0x7FFF, 0, 0 );
6185         }
6186         if ( aExp == 0 ) {
6187             ++expDiff;
6188         }
6189         else {
6190             aSig0 |= LIT64( 0x0001000000000000 );
6191         }
6192         shift128ExtraRightJamming(
6193             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6194         zExp = bExp;
6195     }
6196     else {
6197         if ( aExp == 0x7FFF ) {
6198             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6199                 return propagateFloat128NaN( a, b STATUS_VAR );
6200             }
6201             return a;
6202         }
6203         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6204         if ( aExp == 0 ) {
6205             if (STATUS(flush_to_zero)) {
6206                 if (zSig0 | zSig1) {
6207                     float_raise(float_flag_output_denormal STATUS_VAR);
6208                 }
6209                 return packFloat128(zSign, 0, 0, 0);
6210             }
6211             return packFloat128( zSign, 0, zSig0, zSig1 );
6212         }
6213         zSig2 = 0;
6214         zSig0 |= LIT64( 0x0002000000000000 );
6215         zExp = aExp;
6216         goto shiftRight1;
6217     }
6218     aSig0 |= LIT64( 0x0001000000000000 );
6219     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6220     --zExp;
6221     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6222     ++zExp;
6223  shiftRight1:
6224     shift128ExtraRightJamming(
6225         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6226  roundAndPack:
6227     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6228
6229 }
6230
6231 /*----------------------------------------------------------------------------
6232 | Returns the result of subtracting the absolute values of the quadruple-
6233 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6234 | difference is negated before being returned.  `zSign' is ignored if the
6235 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6236 | Standard for Binary Floating-Point Arithmetic.
6237 *----------------------------------------------------------------------------*/
6238
6239 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6240                                 float_status *status)
6241 {
6242     int32 aExp, bExp, zExp;
6243     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6244     int32 expDiff;
6245     float128 z;
6246
6247     aSig1 = extractFloat128Frac1( a );
6248     aSig0 = extractFloat128Frac0( a );
6249     aExp = extractFloat128Exp( a );
6250     bSig1 = extractFloat128Frac1( b );
6251     bSig0 = extractFloat128Frac0( b );
6252     bExp = extractFloat128Exp( b );
6253     expDiff = aExp - bExp;
6254     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6255     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6256     if ( 0 < expDiff ) goto aExpBigger;
6257     if ( expDiff < 0 ) goto bExpBigger;
6258     if ( aExp == 0x7FFF ) {
6259         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6260             return propagateFloat128NaN( a, b STATUS_VAR );
6261         }
6262         float_raise( float_flag_invalid STATUS_VAR);
6263         z.low = float128_default_nan_low;
6264         z.high = float128_default_nan_high;
6265         return z;
6266     }
6267     if ( aExp == 0 ) {
6268         aExp = 1;
6269         bExp = 1;
6270     }
6271     if ( bSig0 < aSig0 ) goto aBigger;
6272     if ( aSig0 < bSig0 ) goto bBigger;
6273     if ( bSig1 < aSig1 ) goto aBigger;
6274     if ( aSig1 < bSig1 ) goto bBigger;
6275     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
6276  bExpBigger:
6277     if ( bExp == 0x7FFF ) {
6278         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6279         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6280     }
6281     if ( aExp == 0 ) {
6282         ++expDiff;
6283     }
6284     else {
6285         aSig0 |= LIT64( 0x4000000000000000 );
6286     }
6287     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6288     bSig0 |= LIT64( 0x4000000000000000 );
6289  bBigger:
6290     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6291     zExp = bExp;
6292     zSign ^= 1;
6293     goto normalizeRoundAndPack;
6294  aExpBigger:
6295     if ( aExp == 0x7FFF ) {
6296         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6297         return a;
6298     }
6299     if ( bExp == 0 ) {
6300         --expDiff;
6301     }
6302     else {
6303         bSig0 |= LIT64( 0x4000000000000000 );
6304     }
6305     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6306     aSig0 |= LIT64( 0x4000000000000000 );
6307  aBigger:
6308     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6309     zExp = aExp;
6310  normalizeRoundAndPack:
6311     --zExp;
6312     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
6313
6314 }
6315
6316 /*----------------------------------------------------------------------------
6317 | Returns the result of adding the quadruple-precision floating-point values
6318 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6319 | for Binary Floating-Point Arithmetic.
6320 *----------------------------------------------------------------------------*/
6321
6322 float128 float128_add(float128 a, float128 b, float_status *status)
6323 {
6324     flag aSign, bSign;
6325
6326     aSign = extractFloat128Sign( a );
6327     bSign = extractFloat128Sign( b );
6328     if ( aSign == bSign ) {
6329         return addFloat128Sigs( a, b, aSign STATUS_VAR );
6330     }
6331     else {
6332         return subFloat128Sigs( a, b, aSign STATUS_VAR );
6333     }
6334
6335 }
6336
6337 /*----------------------------------------------------------------------------
6338 | Returns the result of subtracting the quadruple-precision floating-point
6339 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6340 | Standard for Binary Floating-Point Arithmetic.
6341 *----------------------------------------------------------------------------*/
6342
6343 float128 float128_sub(float128 a, float128 b, float_status *status)
6344 {
6345     flag aSign, bSign;
6346
6347     aSign = extractFloat128Sign( a );
6348     bSign = extractFloat128Sign( b );
6349     if ( aSign == bSign ) {
6350         return subFloat128Sigs( a, b, aSign STATUS_VAR );
6351     }
6352     else {
6353         return addFloat128Sigs( a, b, aSign STATUS_VAR );
6354     }
6355
6356 }
6357
6358 /*----------------------------------------------------------------------------
6359 | Returns the result of multiplying the quadruple-precision floating-point
6360 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6361 | Standard for Binary Floating-Point Arithmetic.
6362 *----------------------------------------------------------------------------*/
6363
6364 float128 float128_mul(float128 a, float128 b, float_status *status)
6365 {
6366     flag aSign, bSign, zSign;
6367     int32 aExp, bExp, zExp;
6368     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6369     float128 z;
6370
6371     aSig1 = extractFloat128Frac1( a );
6372     aSig0 = extractFloat128Frac0( a );
6373     aExp = extractFloat128Exp( a );
6374     aSign = extractFloat128Sign( a );
6375     bSig1 = extractFloat128Frac1( b );
6376     bSig0 = extractFloat128Frac0( b );
6377     bExp = extractFloat128Exp( b );
6378     bSign = extractFloat128Sign( b );
6379     zSign = aSign ^ bSign;
6380     if ( aExp == 0x7FFF ) {
6381         if (    ( aSig0 | aSig1 )
6382              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6383             return propagateFloat128NaN( a, b STATUS_VAR );
6384         }
6385         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6386         return packFloat128( zSign, 0x7FFF, 0, 0 );
6387     }
6388     if ( bExp == 0x7FFF ) {
6389         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6390         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6391  invalid:
6392             float_raise( float_flag_invalid STATUS_VAR);
6393             z.low = float128_default_nan_low;
6394             z.high = float128_default_nan_high;
6395             return z;
6396         }
6397         return packFloat128( zSign, 0x7FFF, 0, 0 );
6398     }
6399     if ( aExp == 0 ) {
6400         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6401         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6402     }
6403     if ( bExp == 0 ) {
6404         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6405         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6406     }
6407     zExp = aExp + bExp - 0x4000;
6408     aSig0 |= LIT64( 0x0001000000000000 );
6409     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6410     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6411     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6412     zSig2 |= ( zSig3 != 0 );
6413     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6414         shift128ExtraRightJamming(
6415             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6416         ++zExp;
6417     }
6418     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6419
6420 }
6421
6422 /*----------------------------------------------------------------------------
6423 | Returns the result of dividing the quadruple-precision floating-point value
6424 | `a' by the corresponding value `b'.  The operation is performed according to
6425 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6426 *----------------------------------------------------------------------------*/
6427
6428 float128 float128_div(float128 a, float128 b, float_status *status)
6429 {
6430     flag aSign, bSign, zSign;
6431     int32 aExp, bExp, zExp;
6432     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6433     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6434     float128 z;
6435
6436     aSig1 = extractFloat128Frac1( a );
6437     aSig0 = extractFloat128Frac0( a );
6438     aExp = extractFloat128Exp( a );
6439     aSign = extractFloat128Sign( a );
6440     bSig1 = extractFloat128Frac1( b );
6441     bSig0 = extractFloat128Frac0( b );
6442     bExp = extractFloat128Exp( b );
6443     bSign = extractFloat128Sign( b );
6444     zSign = aSign ^ bSign;
6445     if ( aExp == 0x7FFF ) {
6446         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6447         if ( bExp == 0x7FFF ) {
6448             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6449             goto invalid;
6450         }
6451         return packFloat128( zSign, 0x7FFF, 0, 0 );
6452     }
6453     if ( bExp == 0x7FFF ) {
6454         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6455         return packFloat128( zSign, 0, 0, 0 );
6456     }
6457     if ( bExp == 0 ) {
6458         if ( ( bSig0 | bSig1 ) == 0 ) {
6459             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6460  invalid:
6461                 float_raise( float_flag_invalid STATUS_VAR);
6462                 z.low = float128_default_nan_low;
6463                 z.high = float128_default_nan_high;
6464                 return z;
6465             }
6466             float_raise( float_flag_divbyzero STATUS_VAR);
6467             return packFloat128( zSign, 0x7FFF, 0, 0 );
6468         }
6469         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6470     }
6471     if ( aExp == 0 ) {
6472         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6473         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6474     }
6475     zExp = aExp - bExp + 0x3FFD;
6476     shortShift128Left(
6477         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6478     shortShift128Left(
6479         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6480     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6481         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6482         ++zExp;
6483     }
6484     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6485     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6486     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6487     while ( (int64_t) rem0 < 0 ) {
6488         --zSig0;
6489         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6490     }
6491     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6492     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6493         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6494         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6495         while ( (int64_t) rem1 < 0 ) {
6496             --zSig1;
6497             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6498         }
6499         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6500     }
6501     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6502     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6503
6504 }
6505
6506 /*----------------------------------------------------------------------------
6507 | Returns the remainder of the quadruple-precision floating-point value `a'
6508 | with respect to the corresponding value `b'.  The operation is performed
6509 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6510 *----------------------------------------------------------------------------*/
6511
6512 float128 float128_rem(float128 a, float128 b, float_status *status)
6513 {
6514     flag aSign, zSign;
6515     int32 aExp, bExp, expDiff;
6516     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6517     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6518     int64_t sigMean0;
6519     float128 z;
6520
6521     aSig1 = extractFloat128Frac1( a );
6522     aSig0 = extractFloat128Frac0( a );
6523     aExp = extractFloat128Exp( a );
6524     aSign = extractFloat128Sign( a );
6525     bSig1 = extractFloat128Frac1( b );
6526     bSig0 = extractFloat128Frac0( b );
6527     bExp = extractFloat128Exp( b );
6528     if ( aExp == 0x7FFF ) {
6529         if (    ( aSig0 | aSig1 )
6530              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6531             return propagateFloat128NaN( a, b STATUS_VAR );
6532         }
6533         goto invalid;
6534     }
6535     if ( bExp == 0x7FFF ) {
6536         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6537         return a;
6538     }
6539     if ( bExp == 0 ) {
6540         if ( ( bSig0 | bSig1 ) == 0 ) {
6541  invalid:
6542             float_raise( float_flag_invalid STATUS_VAR);
6543             z.low = float128_default_nan_low;
6544             z.high = float128_default_nan_high;
6545             return z;
6546         }
6547         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6548     }
6549     if ( aExp == 0 ) {
6550         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6551         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6552     }
6553     expDiff = aExp - bExp;
6554     if ( expDiff < -1 ) return a;
6555     shortShift128Left(
6556         aSig0 | LIT64( 0x0001000000000000 ),
6557         aSig1,
6558         15 - ( expDiff < 0 ),
6559         &aSig0,
6560         &aSig1
6561     );
6562     shortShift128Left(
6563         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6564     q = le128( bSig0, bSig1, aSig0, aSig1 );
6565     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6566     expDiff -= 64;
6567     while ( 0 < expDiff ) {
6568         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6569         q = ( 4 < q ) ? q - 4 : 0;
6570         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6571         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6572         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6573         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6574         expDiff -= 61;
6575     }
6576     if ( -64 < expDiff ) {
6577         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6578         q = ( 4 < q ) ? q - 4 : 0;
6579         q >>= - expDiff;
6580         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6581         expDiff += 52;
6582         if ( expDiff < 0 ) {
6583             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6584         }
6585         else {
6586             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6587         }
6588         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6589         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6590     }
6591     else {
6592         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6593         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6594     }
6595     do {
6596         alternateASig0 = aSig0;
6597         alternateASig1 = aSig1;
6598         ++q;
6599         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6600     } while ( 0 <= (int64_t) aSig0 );
6601     add128(
6602         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6603     if (    ( sigMean0 < 0 )
6604          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6605         aSig0 = alternateASig0;
6606         aSig1 = alternateASig1;
6607     }
6608     zSign = ( (int64_t) aSig0 < 0 );
6609     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6610     return
6611         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6612
6613 }
6614
6615 /*----------------------------------------------------------------------------
6616 | Returns the square root of the quadruple-precision floating-point value `a'.
6617 | The operation is performed according to the IEC/IEEE Standard for Binary
6618 | Floating-Point Arithmetic.
6619 *----------------------------------------------------------------------------*/
6620
6621 float128 float128_sqrt(float128 a, float_status *status)
6622 {
6623     flag aSign;
6624     int32 aExp, zExp;
6625     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6626     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6627     float128 z;
6628
6629     aSig1 = extractFloat128Frac1( a );
6630     aSig0 = extractFloat128Frac0( a );
6631     aExp = extractFloat128Exp( a );
6632     aSign = extractFloat128Sign( a );
6633     if ( aExp == 0x7FFF ) {
6634         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6635         if ( ! aSign ) return a;
6636         goto invalid;
6637     }
6638     if ( aSign ) {
6639         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6640  invalid:
6641         float_raise( float_flag_invalid STATUS_VAR);
6642         z.low = float128_default_nan_low;
6643         z.high = float128_default_nan_high;
6644         return z;
6645     }
6646     if ( aExp == 0 ) {
6647         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6648         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6649     }
6650     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6651     aSig0 |= LIT64( 0x0001000000000000 );
6652     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6653     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6654     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6655     doubleZSig0 = zSig0<<1;
6656     mul64To128( zSig0, zSig0, &term0, &term1 );
6657     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6658     while ( (int64_t) rem0 < 0 ) {
6659         --zSig0;
6660         doubleZSig0 -= 2;
6661         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6662     }
6663     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6664     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6665         if ( zSig1 == 0 ) zSig1 = 1;
6666         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6667         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6668         mul64To128( zSig1, zSig1, &term2, &term3 );
6669         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6670         while ( (int64_t) rem1 < 0 ) {
6671             --zSig1;
6672             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6673             term3 |= 1;
6674             term2 |= doubleZSig0;
6675             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6676         }
6677         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6678     }
6679     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6680     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6681
6682 }
6683
6684 /*----------------------------------------------------------------------------
6685 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6686 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6687 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6688 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6689 *----------------------------------------------------------------------------*/
6690
6691 int float128_eq(float128 a, float128 b, float_status *status)
6692 {
6693
6694     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6695               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6696          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6697               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6698        ) {
6699         float_raise( float_flag_invalid STATUS_VAR);
6700         return 0;
6701     }
6702     return
6703            ( a.low == b.low )
6704         && (    ( a.high == b.high )
6705              || (    ( a.low == 0 )
6706                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6707            );
6708
6709 }
6710
6711 /*----------------------------------------------------------------------------
6712 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6713 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6714 | exception is raised if either operand is a NaN.  The comparison is performed
6715 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6716 *----------------------------------------------------------------------------*/
6717
6718 int float128_le(float128 a, float128 b, float_status *status)
6719 {
6720     flag aSign, bSign;
6721
6722     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6723               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6724          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6725               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6726        ) {
6727         float_raise( float_flag_invalid STATUS_VAR);
6728         return 0;
6729     }
6730     aSign = extractFloat128Sign( a );
6731     bSign = extractFloat128Sign( b );
6732     if ( aSign != bSign ) {
6733         return
6734                aSign
6735             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6736                  == 0 );
6737     }
6738     return
6739           aSign ? le128( b.high, b.low, a.high, a.low )
6740         : le128( a.high, a.low, b.high, b.low );
6741
6742 }
6743
6744 /*----------------------------------------------------------------------------
6745 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6746 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6747 | raised if either operand is a NaN.  The comparison is performed according
6748 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6749 *----------------------------------------------------------------------------*/
6750
6751 int float128_lt(float128 a, float128 b, float_status *status)
6752 {
6753     flag aSign, bSign;
6754
6755     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6756               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6757          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6758               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6759        ) {
6760         float_raise( float_flag_invalid STATUS_VAR);
6761         return 0;
6762     }
6763     aSign = extractFloat128Sign( a );
6764     bSign = extractFloat128Sign( b );
6765     if ( aSign != bSign ) {
6766         return
6767                aSign
6768             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6769                  != 0 );
6770     }
6771     return
6772           aSign ? lt128( b.high, b.low, a.high, a.low )
6773         : lt128( a.high, a.low, b.high, b.low );
6774
6775 }
6776
6777 /*----------------------------------------------------------------------------
6778 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6779 | be compared, and 0 otherwise.  The invalid exception is raised if either
6780 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6781 | Standard for Binary Floating-Point Arithmetic.
6782 *----------------------------------------------------------------------------*/
6783
6784 int float128_unordered(float128 a, float128 b, float_status *status)
6785 {
6786     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6787               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6788          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6789               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6790        ) {
6791         float_raise( float_flag_invalid STATUS_VAR);
6792         return 1;
6793     }
6794     return 0;
6795 }
6796
6797 /*----------------------------------------------------------------------------
6798 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6799 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6800 | exception.  The comparison is performed according to the IEC/IEEE Standard
6801 | for Binary Floating-Point Arithmetic.
6802 *----------------------------------------------------------------------------*/
6803
6804 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6805 {
6806
6807     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6808               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6809          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6810               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6811        ) {
6812         if (    float128_is_signaling_nan( a )
6813              || float128_is_signaling_nan( b ) ) {
6814             float_raise( float_flag_invalid STATUS_VAR);
6815         }
6816         return 0;
6817     }
6818     return
6819            ( a.low == b.low )
6820         && (    ( a.high == b.high )
6821              || (    ( a.low == 0 )
6822                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6823            );
6824
6825 }
6826
6827 /*----------------------------------------------------------------------------
6828 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6829 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6830 | cause an exception.  Otherwise, the comparison is performed according to the
6831 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6832 *----------------------------------------------------------------------------*/
6833
6834 int float128_le_quiet(float128 a, float128 b, float_status *status)
6835 {
6836     flag aSign, bSign;
6837
6838     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6839               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6840          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6841               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6842        ) {
6843         if (    float128_is_signaling_nan( a )
6844              || float128_is_signaling_nan( b ) ) {
6845             float_raise( float_flag_invalid STATUS_VAR);
6846         }
6847         return 0;
6848     }
6849     aSign = extractFloat128Sign( a );
6850     bSign = extractFloat128Sign( b );
6851     if ( aSign != bSign ) {
6852         return
6853                aSign
6854             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6855                  == 0 );
6856     }
6857     return
6858           aSign ? le128( b.high, b.low, a.high, a.low )
6859         : le128( a.high, a.low, b.high, b.low );
6860
6861 }
6862
6863 /*----------------------------------------------------------------------------
6864 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6865 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6866 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6867 | Standard for Binary Floating-Point Arithmetic.
6868 *----------------------------------------------------------------------------*/
6869
6870 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6871 {
6872     flag aSign, bSign;
6873
6874     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6875               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6876          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6877               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6878        ) {
6879         if (    float128_is_signaling_nan( a )
6880              || float128_is_signaling_nan( b ) ) {
6881             float_raise( float_flag_invalid STATUS_VAR);
6882         }
6883         return 0;
6884     }
6885     aSign = extractFloat128Sign( a );
6886     bSign = extractFloat128Sign( b );
6887     if ( aSign != bSign ) {
6888         return
6889                aSign
6890             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6891                  != 0 );
6892     }
6893     return
6894           aSign ? lt128( b.high, b.low, a.high, a.low )
6895         : lt128( a.high, a.low, b.high, b.low );
6896
6897 }
6898
6899 /*----------------------------------------------------------------------------
6900 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6901 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6902 | comparison is performed according to the IEC/IEEE Standard for Binary
6903 | Floating-Point Arithmetic.
6904 *----------------------------------------------------------------------------*/
6905
6906 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6907 {
6908     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6909               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6910          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6911               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6912        ) {
6913         if (    float128_is_signaling_nan( a )
6914              || float128_is_signaling_nan( b ) ) {
6915             float_raise( float_flag_invalid STATUS_VAR);
6916         }
6917         return 1;
6918     }
6919     return 0;
6920 }
6921
6922 /* misc functions */
6923 float32 uint32_to_float32(uint32_t a, float_status *status)
6924 {
6925     return int64_to_float32(a STATUS_VAR);
6926 }
6927
6928 float64 uint32_to_float64(uint32_t a, float_status *status)
6929 {
6930     return int64_to_float64(a STATUS_VAR);
6931 }
6932
6933 uint32 float32_to_uint32(float32 a, float_status *status)
6934 {
6935     int64_t v;
6936     uint32 res;
6937     int old_exc_flags = get_float_exception_flags(status);
6938
6939     v = float32_to_int64(a STATUS_VAR);
6940     if (v < 0) {
6941         res = 0;
6942     } else if (v > 0xffffffff) {
6943         res = 0xffffffff;
6944     } else {
6945         return v;
6946     }
6947     set_float_exception_flags(old_exc_flags, status);
6948     float_raise(float_flag_invalid STATUS_VAR);
6949     return res;
6950 }
6951
6952 uint32 float32_to_uint32_round_to_zero(float32 a, float_status *status)
6953 {
6954     int64_t v;
6955     uint32 res;
6956     int old_exc_flags = get_float_exception_flags(status);
6957
6958     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6959     if (v < 0) {
6960         res = 0;
6961     } else if (v > 0xffffffff) {
6962         res = 0xffffffff;
6963     } else {
6964         return v;
6965     }
6966     set_float_exception_flags(old_exc_flags, status);
6967     float_raise(float_flag_invalid STATUS_VAR);
6968     return res;
6969 }
6970
6971 int_fast16_t float32_to_int16(float32 a, float_status *status)
6972 {
6973     int32_t v;
6974     int_fast16_t res;
6975     int old_exc_flags = get_float_exception_flags(status);
6976
6977     v = float32_to_int32(a STATUS_VAR);
6978     if (v < -0x8000) {
6979         res = -0x8000;
6980     } else if (v > 0x7fff) {
6981         res = 0x7fff;
6982     } else {
6983         return v;
6984     }
6985
6986     set_float_exception_flags(old_exc_flags, status);
6987     float_raise(float_flag_invalid STATUS_VAR);
6988     return res;
6989 }
6990
6991 uint_fast16_t float32_to_uint16(float32 a, float_status *status)
6992 {
6993     int32_t v;
6994     uint_fast16_t res;
6995     int old_exc_flags = get_float_exception_flags(status);
6996
6997     v = float32_to_int32(a STATUS_VAR);
6998     if (v < 0) {
6999         res = 0;
7000     } else if (v > 0xffff) {
7001         res = 0xffff;
7002     } else {
7003         return v;
7004     }
7005
7006     set_float_exception_flags(old_exc_flags, status);
7007     float_raise(float_flag_invalid STATUS_VAR);
7008     return res;
7009 }
7010
7011 uint_fast16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7012 {
7013     int64_t v;
7014     uint_fast16_t res;
7015     int old_exc_flags = get_float_exception_flags(status);
7016
7017     v = float32_to_int64_round_to_zero(a STATUS_VAR);
7018     if (v < 0) {
7019         res = 0;
7020     } else if (v > 0xffff) {
7021         res = 0xffff;
7022     } else {
7023         return v;
7024     }
7025     set_float_exception_flags(old_exc_flags, status);
7026     float_raise(float_flag_invalid STATUS_VAR);
7027     return res;
7028 }
7029
7030 uint32 float64_to_uint32(float64 a, float_status *status)
7031 {
7032     uint64_t v;
7033     uint32 res;
7034     int old_exc_flags = get_float_exception_flags(status);
7035
7036     v = float64_to_uint64(a STATUS_VAR);
7037     if (v > 0xffffffff) {
7038         res = 0xffffffff;
7039     } else {
7040         return v;
7041     }
7042     set_float_exception_flags(old_exc_flags, status);
7043     float_raise(float_flag_invalid STATUS_VAR);
7044     return res;
7045 }
7046
7047 uint32 float64_to_uint32_round_to_zero(float64 a, float_status *status)
7048 {
7049     uint64_t v;
7050     uint32 res;
7051     int old_exc_flags = get_float_exception_flags(status);
7052
7053     v = float64_to_uint64_round_to_zero(a STATUS_VAR);
7054     if (v > 0xffffffff) {
7055         res = 0xffffffff;
7056     } else {
7057         return v;
7058     }
7059     set_float_exception_flags(old_exc_flags, status);
7060     float_raise(float_flag_invalid STATUS_VAR);
7061     return res;
7062 }
7063
7064 int_fast16_t float64_to_int16(float64 a, float_status *status)
7065 {
7066     int64_t v;
7067     int_fast16_t res;
7068     int old_exc_flags = get_float_exception_flags(status);
7069
7070     v = float64_to_int32(a STATUS_VAR);
7071     if (v < -0x8000) {
7072         res = -0x8000;
7073     } else if (v > 0x7fff) {
7074         res = 0x7fff;
7075     } else {
7076         return v;
7077     }
7078
7079     set_float_exception_flags(old_exc_flags, status);
7080     float_raise(float_flag_invalid STATUS_VAR);
7081     return res;
7082 }
7083
7084 uint_fast16_t float64_to_uint16(float64 a, float_status *status)
7085 {
7086     int64_t v;
7087     uint_fast16_t res;
7088     int old_exc_flags = get_float_exception_flags(status);
7089
7090     v = float64_to_int32(a STATUS_VAR);
7091     if (v < 0) {
7092         res = 0;
7093     } else if (v > 0xffff) {
7094         res = 0xffff;
7095     } else {
7096         return v;
7097     }
7098
7099     set_float_exception_flags(old_exc_flags, status);
7100     float_raise(float_flag_invalid STATUS_VAR);
7101     return res;
7102 }
7103
7104 uint_fast16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7105 {
7106     int64_t v;
7107     uint_fast16_t res;
7108     int old_exc_flags = get_float_exception_flags(status);
7109
7110     v = float64_to_int64_round_to_zero(a STATUS_VAR);
7111     if (v < 0) {
7112         res = 0;
7113     } else if (v > 0xffff) {
7114         res = 0xffff;
7115     } else {
7116         return v;
7117     }
7118     set_float_exception_flags(old_exc_flags, status);
7119     float_raise(float_flag_invalid STATUS_VAR);
7120     return res;
7121 }
7122
7123 /*----------------------------------------------------------------------------
7124 | Returns the result of converting the double-precision floating-point value
7125 | `a' to the 64-bit unsigned integer format.  The conversion is
7126 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7127 | Arithmetic---which means in particular that the conversion is rounded
7128 | according to the current rounding mode.  If `a' is a NaN, the largest
7129 | positive integer is returned.  If the conversion overflows, the
7130 | largest unsigned integer is returned.  If 'a' is negative, the value is
7131 | rounded and zero is returned; negative values that do not round to zero
7132 | will raise the inexact exception.
7133 *----------------------------------------------------------------------------*/
7134
7135 uint64_t float64_to_uint64(float64 a, float_status *status)
7136 {
7137     flag aSign;
7138     int_fast16_t aExp, shiftCount;
7139     uint64_t aSig, aSigExtra;
7140     a = float64_squash_input_denormal(a STATUS_VAR);
7141
7142     aSig = extractFloat64Frac(a);
7143     aExp = extractFloat64Exp(a);
7144     aSign = extractFloat64Sign(a);
7145     if (aSign && (aExp > 1022)) {
7146         float_raise(float_flag_invalid STATUS_VAR);
7147         if (float64_is_any_nan(a)) {
7148             return LIT64(0xFFFFFFFFFFFFFFFF);
7149         } else {
7150             return 0;
7151         }
7152     }
7153     if (aExp) {
7154         aSig |= LIT64(0x0010000000000000);
7155     }
7156     shiftCount = 0x433 - aExp;
7157     if (shiftCount <= 0) {
7158         if (0x43E < aExp) {
7159             float_raise(float_flag_invalid STATUS_VAR);
7160             return LIT64(0xFFFFFFFFFFFFFFFF);
7161         }
7162         aSigExtra = 0;
7163         aSig <<= -shiftCount;
7164     } else {
7165         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7166     }
7167     return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
7168 }
7169
7170 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7171 {
7172     signed char current_rounding_mode = STATUS(float_rounding_mode);
7173     set_float_rounding_mode(float_round_to_zero STATUS_VAR);
7174     int64_t v = float64_to_uint64(a STATUS_VAR);
7175     set_float_rounding_mode(current_rounding_mode STATUS_VAR);
7176     return v;
7177 }
7178
7179 #define COMPARE(s, nan_exp)                                                  \
7180 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7181                                       int is_quiet, float_status *status)    \
7182 {                                                                            \
7183     flag aSign, bSign;                                                       \
7184     uint ## s ## _t av, bv;                                                  \
7185     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
7186     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
7187                                                                              \
7188     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7189          extractFloat ## s ## Frac( a ) ) ||                                 \
7190         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7191           extractFloat ## s ## Frac( b ) )) {                                \
7192         if (!is_quiet ||                                                     \
7193             float ## s ## _is_signaling_nan( a ) ||                          \
7194             float ## s ## _is_signaling_nan( b ) ) {                         \
7195             float_raise( float_flag_invalid STATUS_VAR);                     \
7196         }                                                                    \
7197         return float_relation_unordered;                                     \
7198     }                                                                        \
7199     aSign = extractFloat ## s ## Sign( a );                                  \
7200     bSign = extractFloat ## s ## Sign( b );                                  \
7201     av = float ## s ## _val(a);                                              \
7202     bv = float ## s ## _val(b);                                              \
7203     if ( aSign != bSign ) {                                                  \
7204         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7205             /* zero case */                                                  \
7206             return float_relation_equal;                                     \
7207         } else {                                                             \
7208             return 1 - (2 * aSign);                                          \
7209         }                                                                    \
7210     } else {                                                                 \
7211         if (av == bv) {                                                      \
7212             return float_relation_equal;                                     \
7213         } else {                                                             \
7214             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7215         }                                                                    \
7216     }                                                                        \
7217 }                                                                            \
7218                                                                              \
7219 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7220 {                                                                            \
7221     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
7222 }                                                                            \
7223                                                                              \
7224 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7225                                  float_status *status)                       \
7226 {                                                                            \
7227     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
7228 }
7229
7230 COMPARE(32, 0xff)
7231 COMPARE(64, 0x7ff)
7232
7233 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7234                                             int is_quiet, float_status *status)
7235 {
7236     flag aSign, bSign;
7237
7238     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7239           ( extractFloatx80Frac( a )<<1 ) ) ||
7240         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7241           ( extractFloatx80Frac( b )<<1 ) )) {
7242         if (!is_quiet ||
7243             floatx80_is_signaling_nan( a ) ||
7244             floatx80_is_signaling_nan( b ) ) {
7245             float_raise( float_flag_invalid STATUS_VAR);
7246         }
7247         return float_relation_unordered;
7248     }
7249     aSign = extractFloatx80Sign( a );
7250     bSign = extractFloatx80Sign( b );
7251     if ( aSign != bSign ) {
7252
7253         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7254              ( ( a.low | b.low ) == 0 ) ) {
7255             /* zero case */
7256             return float_relation_equal;
7257         } else {
7258             return 1 - (2 * aSign);
7259         }
7260     } else {
7261         if (a.low == b.low && a.high == b.high) {
7262             return float_relation_equal;
7263         } else {
7264             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7265         }
7266     }
7267 }
7268
7269 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7270 {
7271     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
7272 }
7273
7274 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7275 {
7276     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
7277 }
7278
7279 static inline int float128_compare_internal(float128 a, float128 b,
7280                                             int is_quiet, float_status *status)
7281 {
7282     flag aSign, bSign;
7283
7284     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7285           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7286         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7287           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7288         if (!is_quiet ||
7289             float128_is_signaling_nan( a ) ||
7290             float128_is_signaling_nan( b ) ) {
7291             float_raise( float_flag_invalid STATUS_VAR);
7292         }
7293         return float_relation_unordered;
7294     }
7295     aSign = extractFloat128Sign( a );
7296     bSign = extractFloat128Sign( b );
7297     if ( aSign != bSign ) {
7298         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7299             /* zero case */
7300             return float_relation_equal;
7301         } else {
7302             return 1 - (2 * aSign);
7303         }
7304     } else {
7305         if (a.low == b.low && a.high == b.high) {
7306             return float_relation_equal;
7307         } else {
7308             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7309         }
7310     }
7311 }
7312
7313 int float128_compare(float128 a, float128 b, float_status *status)
7314 {
7315     return float128_compare_internal(a, b, 0 STATUS_VAR);
7316 }
7317
7318 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7319 {
7320     return float128_compare_internal(a, b, 1 STATUS_VAR);
7321 }
7322
7323 /* min() and max() functions. These can't be implemented as
7324  * 'compare and pick one input' because that would mishandle
7325  * NaNs and +0 vs -0.
7326  *
7327  * minnum() and maxnum() functions. These are similar to the min()
7328  * and max() functions but if one of the arguments is a QNaN and
7329  * the other is numerical then the numerical argument is returned.
7330  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7331  * and maxNum() operations. min() and max() are the typical min/max
7332  * semantics provided by many CPUs which predate that specification.
7333  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
7336  */
7337 #define MINMAX(s)                                                       \
7338 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7339                                                int ismin, int isieee,   \
7340                                                int ismag,               \
7341                                                float_status *status)    \
7342 {                                                                       \
7343     flag aSign, bSign;                                                  \
7344     uint ## s ## _t av, bv, aav, abv;                                   \
7345     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
7346     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
7347     if (float ## s ## _is_any_nan(a) ||                                 \
7348         float ## s ## _is_any_nan(b)) {                                 \
7349         if (isieee) {                                                   \
7350             if (float ## s ## _is_quiet_nan(a) &&                       \
7351                 !float ## s ##_is_any_nan(b)) {                         \
7352                 return b;                                               \
7353             } else if (float ## s ## _is_quiet_nan(b) &&                \
7354                        !float ## s ## _is_any_nan(a)) {                 \
7355                 return a;                                               \
7356             }                                                           \
7357         }                                                               \
7358         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
7359     }                                                                   \
7360     aSign = extractFloat ## s ## Sign(a);                               \
7361     bSign = extractFloat ## s ## Sign(b);                               \
7362     av = float ## s ## _val(a);                                         \
7363     bv = float ## s ## _val(b);                                         \
7364     if (ismag) {                                                        \
7365         aav = float ## s ## _abs(av);                                   \
7366         abv = float ## s ## _abs(bv);                                   \
7367         if (aav != abv) {                                               \
7368             if (ismin) {                                                \
7369                 return (aav < abv) ? a : b;                             \
7370             } else {                                                    \
7371                 return (aav < abv) ? b : a;                             \
7372             }                                                           \
7373         }                                                               \
7374     }                                                                   \
7375     if (aSign != bSign) {                                               \
7376         if (ismin) {                                                    \
7377             return aSign ? a : b;                                       \
7378         } else {                                                        \
7379             return aSign ? b : a;                                       \
7380         }                                                               \
7381     } else {                                                            \
7382         if (ismin) {                                                    \
7383             return (aSign ^ (av < bv)) ? a : b;                         \
7384         } else {                                                        \
7385             return (aSign ^ (av < bv)) ? b : a;                         \
7386         }                                                               \
7387     }                                                                   \
7388 }                                                                       \
7389                                                                         \
7390 float ## s float ## s ## _min(float ## s a, float ## s b,               \
7391                               float_status *status)                     \
7392 {                                                                       \
7393     return float ## s ## _minmax(a, b, 1, 0, 0 STATUS_VAR);             \
7394 }                                                                       \
7395                                                                         \
7396 float ## s float ## s ## _max(float ## s a, float ## s b,               \
7397                               float_status *status)                     \
7398 {                                                                       \
7399     return float ## s ## _minmax(a, b, 0, 0, 0 STATUS_VAR);             \
7400 }                                                                       \
7401                                                                         \
7402 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7403                                  float_status *status)                  \
7404 {                                                                       \
7405     return float ## s ## _minmax(a, b, 1, 1, 0 STATUS_VAR);             \
7406 }                                                                       \
7407                                                                         \
7408 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7409                                  float_status *status)                  \
7410 {                                                                       \
7411     return float ## s ## _minmax(a, b, 0, 1, 0 STATUS_VAR);             \
7412 }                                                                       \
7413                                                                         \
7414 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7415                                     float_status *status)               \
7416 {                                                                       \
7417     return float ## s ## _minmax(a, b, 1, 1, 1 STATUS_VAR);             \
7418 }                                                                       \
7419                                                                         \
7420 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7421                                     float_status *status)               \
7422 {                                                                       \
7423     return float ## s ## _minmax(a, b, 0, 1, 1 STATUS_VAR);             \
7424 }
7425
7426 MINMAX(32)
7427 MINMAX(64)
7428
7429
7430 /* Multiply A by 2 raised to the power N.  */
7431 float32 float32_scalbn(float32 a, int n, float_status *status)
7432 {
7433     flag aSign;
7434     int16_t aExp;
7435     uint32_t aSig;
7436
7437     a = float32_squash_input_denormal(a STATUS_VAR);
7438     aSig = extractFloat32Frac( a );
7439     aExp = extractFloat32Exp( a );
7440     aSign = extractFloat32Sign( a );
7441
7442     if ( aExp == 0xFF ) {
7443         if ( aSig ) {
7444             return propagateFloat32NaN( a, a STATUS_VAR );
7445         }
7446         return a;
7447     }
7448     if (aExp != 0) {
7449         aSig |= 0x00800000;
7450     } else if (aSig == 0) {
7451         return a;
7452     } else {
7453         aExp++;
7454     }
7455
7456     if (n > 0x200) {
7457         n = 0x200;
7458     } else if (n < -0x200) {
7459         n = -0x200;
7460     }
7461
7462     aExp += n - 1;
7463     aSig <<= 7;
7464     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7465 }
7466
7467 float64 float64_scalbn(float64 a, int n, float_status *status)
7468 {
7469     flag aSign;
7470     int16_t aExp;
7471     uint64_t aSig;
7472
7473     a = float64_squash_input_denormal(a STATUS_VAR);
7474     aSig = extractFloat64Frac( a );
7475     aExp = extractFloat64Exp( a );
7476     aSign = extractFloat64Sign( a );
7477
7478     if ( aExp == 0x7FF ) {
7479         if ( aSig ) {
7480             return propagateFloat64NaN( a, a STATUS_VAR );
7481         }
7482         return a;
7483     }
7484     if (aExp != 0) {
7485         aSig |= LIT64( 0x0010000000000000 );
7486     } else if (aSig == 0) {
7487         return a;
7488     } else {
7489         aExp++;
7490     }
7491
7492     if (n > 0x1000) {
7493         n = 0x1000;
7494     } else if (n < -0x1000) {
7495         n = -0x1000;
7496     }
7497
7498     aExp += n - 1;
7499     aSig <<= 10;
7500     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7501 }
7502
7503 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7504 {
7505     flag aSign;
7506     int32_t aExp;
7507     uint64_t aSig;
7508
7509     aSig = extractFloatx80Frac( a );
7510     aExp = extractFloatx80Exp( a );
7511     aSign = extractFloatx80Sign( a );
7512
7513     if ( aExp == 0x7FFF ) {
7514         if ( aSig<<1 ) {
7515             return propagateFloatx80NaN( a, a STATUS_VAR );
7516         }
7517         return a;
7518     }
7519
7520     if (aExp == 0) {
7521         if (aSig == 0) {
7522             return a;
7523         }
7524         aExp++;
7525     }
7526
7527     if (n > 0x10000) {
7528         n = 0x10000;
7529     } else if (n < -0x10000) {
7530         n = -0x10000;
7531     }
7532
7533     aExp += n;
7534     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7535                                           aSign, aExp, aSig, 0 STATUS_VAR );
7536 }
7537
7538 float128 float128_scalbn(float128 a, int n, float_status *status)
7539 {
7540     flag aSign;
7541     int32_t aExp;
7542     uint64_t aSig0, aSig1;
7543
7544     aSig1 = extractFloat128Frac1( a );
7545     aSig0 = extractFloat128Frac0( a );
7546     aExp = extractFloat128Exp( a );
7547     aSign = extractFloat128Sign( a );
7548     if ( aExp == 0x7FFF ) {
7549         if ( aSig0 | aSig1 ) {
7550             return propagateFloat128NaN( a, a STATUS_VAR );
7551         }
7552         return a;
7553     }
7554     if (aExp != 0) {
7555         aSig0 |= LIT64( 0x0001000000000000 );
7556     } else if (aSig0 == 0 && aSig1 == 0) {
7557         return a;
7558     } else {
7559         aExp++;
7560     }
7561
7562     if (n > 0x10000) {
7563         n = 0x10000;
7564     } else if (n < -0x10000) {
7565         n = -0x10000;
7566     }
7567
7568     aExp += n - 1;
7569     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7570                                           STATUS_VAR );
7571
7572 }