fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "config.h"
  86
  87 #include "fpu/softfloat.h"
  88
  89 /* We only need stdlib for abort() */
  90 #include <stdlib.h>
  91
  92 /*----------------------------------------------------------------------------
  93 | Primitive arithmetic functions, including multi-word arithmetic, and
  94 | division and square root approximations.  (Can be specialized to target if
  95 | desired.)
  96 *----------------------------------------------------------------------------*/
  97 #include "softfloat-macros.h"
  98
  99 /*----------------------------------------------------------------------------
 100 | Functions and definitions to determine:  (1) whether tininess for underflow
 101 | is detected before or after rounding by default, (2) what (if anything)
 102 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 103 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 104 | are propagated from function inputs to output.  These details are target-
 105 | specific.
 106 *----------------------------------------------------------------------------*/
 107 #include "softfloat-specialize.h"
 108
 109 /*----------------------------------------------------------------------------
 110 | Returns the fraction bits of the half-precision floating-point value `a'.
 111 *----------------------------------------------------------------------------*/
 112
 113 static inline uint32_t extractFloat16Frac(float16 a)
 114 {
 115     return float16_val(a) & 0x3ff;
 116 }
 117
 118 /*----------------------------------------------------------------------------
 119 | Returns the exponent bits of the half-precision floating-point value `a'.
 120 *----------------------------------------------------------------------------*/
 121
 122 static inline int_fast16_t extractFloat16Exp(float16 a)
 123 {
 124     return (float16_val(a) >> 10) & 0x1f;
 125 }
 126
 127 /*----------------------------------------------------------------------------
 128 | Returns the sign bit of the single-precision floating-point value `a'.
 129 *----------------------------------------------------------------------------*/
 130
 131 static inline flag extractFloat16Sign(float16 a)
 132 {
 133     return float16_val(a)>>15;
 134 }
 135
 136 /*----------------------------------------------------------------------------
 137 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 138 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 139 | input.  If `zSign' is 1, the input is negated before being converted to an
 140 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 141 | is simply rounded to an integer, with the inexact exception raised if the
 142 | input cannot be represented exactly as an integer.  However, if the fixed-
 143 | point input is too large, the invalid exception is raised and the largest
 144 | positive or negative integer is returned.
 145 *----------------------------------------------------------------------------*/
 146
 147 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
 148 {
 149     int8 roundingMode;
 150     flag roundNearestEven;
 151     int8 roundIncrement, roundBits;
 152     int32_t z;
 153
 154     roundingMode = STATUS(float_rounding_mode);
 155     roundNearestEven = ( roundingMode == float_round_nearest_even );
 156     switch (roundingMode) {
 157     case float_round_nearest_even:
 158     case float_round_ties_away:
 159         roundIncrement = 0x40;
 160         break;
 161     case float_round_to_zero:
 162         roundIncrement = 0;
 163         break;
 164     case float_round_up:
 165         roundIncrement = zSign ? 0 : 0x7f;
 166         break;
 167     case float_round_down:
 168         roundIncrement = zSign ? 0x7f : 0;
 169         break;
 170     default:
 171         abort();
 172     }
 173     roundBits = absZ & 0x7F;
 174     absZ = ( absZ + roundIncrement )>>7;
 175     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 176     z = absZ;
 177     if ( zSign ) z = - z;
 178     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 179         float_raise( float_flag_invalid STATUS_VAR);
 180         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 181     }
 182     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 183     return z;
 184
 185 }
 186
 187 /*----------------------------------------------------------------------------
 188 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 189 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 190 | and returns the properly rounded 64-bit integer corresponding to the input.
 191 | If `zSign' is 1, the input is negated before being converted to an integer.
 192 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 193 | the inexact exception raised if the input cannot be represented exactly as
 194 | an integer.  However, if the fixed-point input is too large, the invalid
 195 | exception is raised and the largest positive or negative integer is
 196 | returned.
 197 *----------------------------------------------------------------------------*/
 198
 199 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
 200 {
 201     int8 roundingMode;
 202     flag roundNearestEven, increment;
 203     int64_t z;
 204
 205     roundingMode = STATUS(float_rounding_mode);
 206     roundNearestEven = ( roundingMode == float_round_nearest_even );
 207     switch (roundingMode) {
 208     case float_round_nearest_even:
 209     case float_round_ties_away:
 210         increment = ((int64_t) absZ1 < 0);
 211         break;
 212     case float_round_to_zero:
 213         increment = 0;
 214         break;
 215     case float_round_up:
 216         increment = !zSign && absZ1;
 217         break;
 218     case float_round_down:
 219         increment = zSign && absZ1;
 220         break;
 221     default:
 222         abort();
 223     }
 224     if ( increment ) {
 225         ++absZ0;
 226         if ( absZ0 == 0 ) goto overflow;
 227         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 228     }
 229     z = absZ0;
 230     if ( zSign ) z = - z;
 231     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 232  overflow:
 233         float_raise( float_flag_invalid STATUS_VAR);
 234         return
 235               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 236             : LIT64( 0x7FFFFFFFFFFFFFFF );
 237     }
 238     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 239     return z;
 240
 241 }
 242
 243 /*----------------------------------------------------------------------------
 244 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 245 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 246 | and returns the properly rounded 64-bit unsigned integer corresponding to the
 247 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
 248 | with the inexact exception raised if the input cannot be represented exactly
 249 | as an integer.  However, if the fixed-point input is too large, the invalid
 250 | exception is raised and the largest unsigned integer is returned.
 251 *----------------------------------------------------------------------------*/
 252
 253 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
 254                                 uint64_t absZ1 STATUS_PARAM)
 255 {
 256     int8 roundingMode;
 257     flag roundNearestEven, increment;
 258
 259     roundingMode = STATUS(float_rounding_mode);
 260     roundNearestEven = (roundingMode == float_round_nearest_even);
 261     switch (roundingMode) {
 262     case float_round_nearest_even:
 263     case float_round_ties_away:
 264         increment = ((int64_t)absZ1 < 0);
 265         break;
 266     case float_round_to_zero:
 267         increment = 0;
 268         break;
 269     case float_round_up:
 270         increment = !zSign && absZ1;
 271         break;
 272     case float_round_down:
 273         increment = zSign && absZ1;
 274         break;
 275     default:
 276         abort();
 277     }
 278     if (increment) {
 279         ++absZ0;
 280         if (absZ0 == 0) {
 281             float_raise(float_flag_invalid STATUS_VAR);
 282             return LIT64(0xFFFFFFFFFFFFFFFF);
 283         }
 284         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
 285     }
 286
 287     if (zSign && absZ0) {
 288         float_raise(float_flag_invalid STATUS_VAR);
 289         return 0;
 290     }
 291
 292     if (absZ1) {
 293         STATUS(float_exception_flags) |= float_flag_inexact;
 294     }
 295     return absZ0;
 296 }
 297
 298 /*----------------------------------------------------------------------------
 299 | Returns the fraction bits of the single-precision floating-point value `a'.
 300 *----------------------------------------------------------------------------*/
 301
 302 static inline uint32_t extractFloat32Frac( float32 a )
 303 {
 304
 305     return float32_val(a) & 0x007FFFFF;
 306
 307 }
 308
 309 /*----------------------------------------------------------------------------
 310 | Returns the exponent bits of the single-precision floating-point value `a'.
 311 *----------------------------------------------------------------------------*/
 312
 313 static inline int_fast16_t extractFloat32Exp(float32 a)
 314 {
 315
 316     return ( float32_val(a)>>23 ) & 0xFF;
 317
 318 }
 319
 320 /*----------------------------------------------------------------------------
 321 | Returns the sign bit of the single-precision floating-point value `a'.
 322 *----------------------------------------------------------------------------*/
 323
 324 static inline flag extractFloat32Sign( float32 a )
 325 {
 326
 327     return float32_val(a)>>31;
 328
 329 }
 330
 331 /*----------------------------------------------------------------------------
 332 | If `a' is denormal and we are in flush-to-zero mode then set the
 333 | input-denormal exception and return zero. Otherwise just return the value.
 334 *----------------------------------------------------------------------------*/
 335 float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 336 {
 337     if (STATUS(flush_inputs_to_zero)) {
 338         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 339             float_raise(float_flag_input_denormal STATUS_VAR);
 340             return make_float32(float32_val(a) & 0x80000000);
 341         }
 342     }
 343     return a;
 344 }
 345
 346 /*----------------------------------------------------------------------------
 347 | Normalizes the subnormal single-precision floating-point value represented
 348 | by the denormalized significand `aSig'.  The normalized exponent and
 349 | significand are stored at the locations pointed to by `zExpPtr' and
 350 | `zSigPtr', respectively.
 351 *----------------------------------------------------------------------------*/
 352
 353 static void
 354  normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
 355 {
 356     int8 shiftCount;
 357
 358     shiftCount = countLeadingZeros32( aSig ) - 8;
 359     *zSigPtr = aSig<<shiftCount;
 360     *zExpPtr = 1 - shiftCount;
 361
 362 }
 363
 364 /*----------------------------------------------------------------------------
 365 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 366 | single-precision floating-point value, returning the result.  After being
 367 | shifted into the proper positions, the three fields are simply added
 368 | together to form the result.  This means that any integer portion of `zSig'
 369 | will be added into the exponent.  Since a properly normalized significand
 370 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 371 | than the desired result exponent whenever `zSig' is a complete, normalized
 372 | significand.
 373 *----------------------------------------------------------------------------*/
 374
 375 static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
 376 {
 377
 378     return make_float32(
 379           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 380
 381 }
 382
 383 /*----------------------------------------------------------------------------
 384 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 385 | and significand `zSig', and returns the proper single-precision floating-
 386 | point value corresponding to the abstract input.  Ordinarily, the abstract
 387 | value is simply rounded and packed into the single-precision format, with
 388 | the inexact exception raised if the abstract input cannot be represented
 389 | exactly.  However, if the abstract value is too large, the overflow and
 390 | inexact exceptions are raised and an infinity or maximal finite value is
 391 | returned.  If the abstract value is too small, the input value is rounded to
 392 | a subnormal number, and the underflow and inexact exceptions are raised if
 393 | the abstract input cannot be represented exactly as a subnormal single-
 394 | precision floating-point number.
 395 |     The input significand `zSig' has its binary point between bits 30
 396 | and 29, which is 7 bits to the left of the usual location.  This shifted
 397 | significand must be normalized or smaller.  If `zSig' is not normalized,
 398 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 399 | and it must not require rounding.  In the usual case that `zSig' is
 400 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 401 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 402 | Binary Floating-Point Arithmetic.
 403 *----------------------------------------------------------------------------*/
 404
 405 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
 406 {
 407     int8 roundingMode;
 408     flag roundNearestEven;
 409     int8 roundIncrement, roundBits;
 410     flag isTiny;
 411
 412     roundingMode = STATUS(float_rounding_mode);
 413     roundNearestEven = ( roundingMode == float_round_nearest_even );
 414     switch (roundingMode) {
 415     case float_round_nearest_even:
 416     case float_round_ties_away:
 417         roundIncrement = 0x40;
 418         break;
 419     case float_round_to_zero:
 420         roundIncrement = 0;
 421         break;
 422     case float_round_up:
 423         roundIncrement = zSign ? 0 : 0x7f;
 424         break;
 425     case float_round_down:
 426         roundIncrement = zSign ? 0x7f : 0;
 427         break;
 428     default:
 429         abort();
 430         break;
 431     }
 432     roundBits = zSig & 0x7F;
 433     if ( 0xFD <= (uint16_t) zExp ) {
 434         if (    ( 0xFD < zExp )
 435              || (    ( zExp == 0xFD )
 436                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 437            ) {
 438             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 439             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 440         }
 441         if ( zExp < 0 ) {
 442             if (STATUS(flush_to_zero)) {
 443                 float_raise(float_flag_output_denormal STATUS_VAR);
 444                 return packFloat32(zSign, 0, 0);
 445             }
 446             isTiny =
 447                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 448                 || ( zExp < -1 )
 449                 || ( zSig + roundIncrement < 0x80000000 );
 450             shift32RightJamming( zSig, - zExp, &zSig );
 451             zExp = 0;
 452             roundBits = zSig & 0x7F;
 453             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 454         }
 455     }
 456     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 457     zSig = ( zSig + roundIncrement )>>7;
 458     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 459     if ( zSig == 0 ) zExp = 0;
 460     return packFloat32( zSign, zExp, zSig );
 461
 462 }
 463
 464 /*----------------------------------------------------------------------------
 465 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 466 | and significand `zSig', and returns the proper single-precision floating-
 467 | point value corresponding to the abstract input.  This routine is just like
 468 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 469 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 470 | floating-point exponent.
 471 *----------------------------------------------------------------------------*/
 472
 473 static float32
 474  normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
 475 {
 476     int8 shiftCount;
 477
 478     shiftCount = countLeadingZeros32( zSig ) - 1;
 479     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 480
 481 }
 482
 483 /*----------------------------------------------------------------------------
 484 | Returns the fraction bits of the double-precision floating-point value `a'.
 485 *----------------------------------------------------------------------------*/
 486
 487 static inline uint64_t extractFloat64Frac( float64 a )
 488 {
 489
 490     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 491
 492 }
 493
 494 /*----------------------------------------------------------------------------
 495 | Returns the exponent bits of the double-precision floating-point value `a'.
 496 *----------------------------------------------------------------------------*/
 497
 498 static inline int_fast16_t extractFloat64Exp(float64 a)
 499 {
 500
 501     return ( float64_val(a)>>52 ) & 0x7FF;
 502
 503 }
 504
 505 /*----------------------------------------------------------------------------
 506 | Returns the sign bit of the double-precision floating-point value `a'.
 507 *----------------------------------------------------------------------------*/
 508
 509 static inline flag extractFloat64Sign( float64 a )
 510 {
 511
 512     return float64_val(a)>>63;
 513
 514 }
 515
 516 /*----------------------------------------------------------------------------
 517 | If `a' is denormal and we are in flush-to-zero mode then set the
 518 | input-denormal exception and return zero. Otherwise just return the value.
 519 *----------------------------------------------------------------------------*/
 520 float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 521 {
 522     if (STATUS(flush_inputs_to_zero)) {
 523         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 524             float_raise(float_flag_input_denormal STATUS_VAR);
 525             return make_float64(float64_val(a) & (1ULL << 63));
 526         }
 527     }
 528     return a;
 529 }
 530
 531 /*----------------------------------------------------------------------------
 532 | Normalizes the subnormal double-precision floating-point value represented
 533 | by the denormalized significand `aSig'.  The normalized exponent and
 534 | significand are stored at the locations pointed to by `zExpPtr' and
 535 | `zSigPtr', respectively.
 536 *----------------------------------------------------------------------------*/
 537
 538 static void
 539  normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
 540 {
 541     int8 shiftCount;
 542
 543     shiftCount = countLeadingZeros64( aSig ) - 11;
 544     *zSigPtr = aSig<<shiftCount;
 545     *zExpPtr = 1 - shiftCount;
 546
 547 }
 548
 549 /*----------------------------------------------------------------------------
 550 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 551 | double-precision floating-point value, returning the result.  After being
 552 | shifted into the proper positions, the three fields are simply added
 553 | together to form the result.  This means that any integer portion of `zSig'
 554 | will be added into the exponent.  Since a properly normalized significand
 555 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 556 | than the desired result exponent whenever `zSig' is a complete, normalized
 557 | significand.
 558 *----------------------------------------------------------------------------*/
 559
 560 static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
 561 {
 562
 563     return make_float64(
 564         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 565
 566 }
 567
 568 /*----------------------------------------------------------------------------
 569 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 570 | and significand `zSig', and returns the proper double-precision floating-
 571 | point value corresponding to the abstract input.  Ordinarily, the abstract
 572 | value is simply rounded and packed into the double-precision format, with
 573 | the inexact exception raised if the abstract input cannot be represented
 574 | exactly.  However, if the abstract value is too large, the overflow and
 575 | inexact exceptions are raised and an infinity or maximal finite value is
 576 | returned.  If the abstract value is too small, the input value is rounded to
 577 | a subnormal number, and the underflow and inexact exceptions are raised if
 578 | the abstract input cannot be represented exactly as a subnormal double-
 579 | precision floating-point number.
 580 |     The input significand `zSig' has its binary point between bits 62
 581 | and 61, which is 10 bits to the left of the usual location.  This shifted
 582 | significand must be normalized or smaller.  If `zSig' is not normalized,
 583 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 584 | and it must not require rounding.  In the usual case that `zSig' is
 585 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 586 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 587 | Binary Floating-Point Arithmetic.
 588 *----------------------------------------------------------------------------*/
 589
 590 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
 591 {
 592     int8 roundingMode;
 593     flag roundNearestEven;
 594     int_fast16_t roundIncrement, roundBits;
 595     flag isTiny;
 596
 597     roundingMode = STATUS(float_rounding_mode);
 598     roundNearestEven = ( roundingMode == float_round_nearest_even );
 599     switch (roundingMode) {
 600     case float_round_nearest_even:
 601     case float_round_ties_away:
 602         roundIncrement = 0x200;
 603         break;
 604     case float_round_to_zero:
 605         roundIncrement = 0;
 606         break;
 607     case float_round_up:
 608         roundIncrement = zSign ? 0 : 0x3ff;
 609         break;
 610     case float_round_down:
 611         roundIncrement = zSign ? 0x3ff : 0;
 612         break;
 613     default:
 614         abort();
 615     }
 616     roundBits = zSig & 0x3FF;
 617     if ( 0x7FD <= (uint16_t) zExp ) {
 618         if (    ( 0x7FD < zExp )
 619              || (    ( zExp == 0x7FD )
 620                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 621            ) {
 622             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 623             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 624         }
 625         if ( zExp < 0 ) {
 626             if (STATUS(flush_to_zero)) {
 627                 float_raise(float_flag_output_denormal STATUS_VAR);
 628                 return packFloat64(zSign, 0, 0);
 629             }
 630             isTiny =
 631                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 632                 || ( zExp < -1 )
 633                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 634             shift64RightJamming( zSig, - zExp, &zSig );
 635             zExp = 0;
 636             roundBits = zSig & 0x3FF;
 637             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 638         }
 639     }
 640     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 641     zSig = ( zSig + roundIncrement )>>10;
 642     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 643     if ( zSig == 0 ) zExp = 0;
 644     return packFloat64( zSign, zExp, zSig );
 645
 646 }
 647
 648 /*----------------------------------------------------------------------------
 649 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 650 | and significand `zSig', and returns the proper double-precision floating-
 651 | point value corresponding to the abstract input.  This routine is just like
 652 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 653 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 654 | floating-point exponent.
 655 *----------------------------------------------------------------------------*/
 656
 657 static float64
 658  normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
 659 {
 660     int8 shiftCount;
 661
 662     shiftCount = countLeadingZeros64( zSig ) - 1;
 663     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 664
 665 }
 666
 667 /*----------------------------------------------------------------------------
 668 | Returns the fraction bits of the extended double-precision floating-point
 669 | value `a'.
 670 *----------------------------------------------------------------------------*/
 671
 672 static inline uint64_t extractFloatx80Frac( floatx80 a )
 673 {
 674
 675     return a.low;
 676
 677 }
 678
 679 /*----------------------------------------------------------------------------
 680 | Returns the exponent bits of the extended double-precision floating-point
 681 | value `a'.
 682 *----------------------------------------------------------------------------*/
 683
 684 static inline int32 extractFloatx80Exp( floatx80 a )
 685 {
 686
 687     return a.high & 0x7FFF;
 688
 689 }
 690
 691 /*----------------------------------------------------------------------------
 692 | Returns the sign bit of the extended double-precision floating-point value
 693 | `a'.
 694 *----------------------------------------------------------------------------*/
 695
 696 static inline flag extractFloatx80Sign( floatx80 a )
 697 {
 698
 699     return a.high>>15;
 700
 701 }
 702
 703 /*----------------------------------------------------------------------------
 704 | Normalizes the subnormal extended double-precision floating-point value
 705 | represented by the denormalized significand `aSig'.  The normalized exponent
 706 | and significand are stored at the locations pointed to by `zExpPtr' and
 707 | `zSigPtr', respectively.
 708 *----------------------------------------------------------------------------*/
 709
 710 static void
 711  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 712 {
 713     int8 shiftCount;
 714
 715     shiftCount = countLeadingZeros64( aSig );
 716     *zSigPtr = aSig<<shiftCount;
 717     *zExpPtr = 1 - shiftCount;
 718
 719 }
 720
 721 /*----------------------------------------------------------------------------
 722 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 723 | extended double-precision floating-point value, returning the result.
 724 *----------------------------------------------------------------------------*/
 725
 726 static inline floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 727 {
 728     floatx80 z;
 729
 730     z.low = zSig;
 731     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 732     return z;
 733
 734 }
 735
 736 /*----------------------------------------------------------------------------
 737 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 738 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 739 | and returns the proper extended double-precision floating-point value
 740 | corresponding to the abstract input.  Ordinarily, the abstract value is
 741 | rounded and packed into the extended double-precision format, with the
 742 | inexact exception raised if the abstract input cannot be represented
 743 | exactly.  However, if the abstract value is too large, the overflow and
 744 | inexact exceptions are raised and an infinity or maximal finite value is
 745 | returned.  If the abstract value is too small, the input value is rounded to
 746 | a subnormal number, and the underflow and inexact exceptions are raised if
 747 | the abstract input cannot be represented exactly as a subnormal extended
 748 | double-precision floating-point number.
 749 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 750 | number of bits as single or double precision, respectively.  Otherwise, the
 751 | result is rounded to the full precision of the extended double-precision
 752 | format.
 753 |     The input significand must be normalized or smaller.  If the input
 754 | significand is not normalized, `zExp' must be 0; in that case, the result
 755 | returned is a subnormal number, and it must not require rounding.  The
 756 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 757 | Floating-Point Arithmetic.
 758 *----------------------------------------------------------------------------*/
 759
 760 static floatx80
 761  roundAndPackFloatx80(
 762      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 763  STATUS_PARAM)
 764 {
 765     int8 roundingMode;
 766     flag roundNearestEven, increment, isTiny;
 767     int64 roundIncrement, roundMask, roundBits;
 768
 769     roundingMode = STATUS(float_rounding_mode);
 770     roundNearestEven = ( roundingMode == float_round_nearest_even );
 771     if ( roundingPrecision == 80 ) goto precision80;
 772     if ( roundingPrecision == 64 ) {
 773         roundIncrement = LIT64( 0x0000000000000400 );
 774         roundMask = LIT64( 0x00000000000007FF );
 775     }
 776     else if ( roundingPrecision == 32 ) {
 777         roundIncrement = LIT64( 0x0000008000000000 );
 778         roundMask = LIT64( 0x000000FFFFFFFFFF );
 779     }
 780     else {
 781         goto precision80;
 782     }
 783     zSig0 |= ( zSig1 != 0 );
 784     switch (roundingMode) {
 785     case float_round_nearest_even:
 786     case float_round_ties_away:
 787         break;
 788     case float_round_to_zero:
 789         roundIncrement = 0;
 790         break;
 791     case float_round_up:
 792         roundIncrement = zSign ? 0 : roundMask;
 793         break;
 794     case float_round_down:
 795         roundIncrement = zSign ? roundMask : 0;
 796         break;
 797     default:
 798         abort();
 799     }
 800     roundBits = zSig0 & roundMask;
 801     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 802         if (    ( 0x7FFE < zExp )
 803              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 804            ) {
 805             goto overflow;
 806         }
 807         if ( zExp <= 0 ) {
 808             if (STATUS(flush_to_zero)) {
 809                 float_raise(float_flag_output_denormal STATUS_VAR);
 810                 return packFloatx80(zSign, 0, 0);
 811             }
 812             isTiny =
 813                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 814                 || ( zExp < 0 )
 815                 || ( zSig0 <= zSig0 + roundIncrement );
 816             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 817             zExp = 0;
 818             roundBits = zSig0 & roundMask;
 819             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 820             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 821             zSig0 += roundIncrement;
 822             if ( (int64_t) zSig0 < 0 ) zExp = 1;
 823             roundIncrement = roundMask + 1;
 824             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 825                 roundMask |= roundIncrement;
 826             }
 827             zSig0 &= ~ roundMask;
 828             return packFloatx80( zSign, zExp, zSig0 );
 829         }
 830     }
 831     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 832     zSig0 += roundIncrement;
 833     if ( zSig0 < roundIncrement ) {
 834         ++zExp;
 835         zSig0 = LIT64( 0x8000000000000000 );
 836     }
 837     roundIncrement = roundMask + 1;
 838     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 839         roundMask |= roundIncrement;
 840     }
 841     zSig0 &= ~ roundMask;
 842     if ( zSig0 == 0 ) zExp = 0;
 843     return packFloatx80( zSign, zExp, zSig0 );
 844  precision80:
 845     switch (roundingMode) {
 846     case float_round_nearest_even:
 847     case float_round_ties_away:
 848         increment = ((int64_t)zSig1 < 0);
 849         break;
 850     case float_round_to_zero:
 851         increment = 0;
 852         break;
 853     case float_round_up:
 854         increment = !zSign && zSig1;
 855         break;
 856     case float_round_down:
 857         increment = zSign && zSig1;
 858         break;
 859     default:
 860         abort();
 861     }
 862     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 863         if (    ( 0x7FFE < zExp )
 864              || (    ( zExp == 0x7FFE )
 865                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 866                   && increment
 867                 )
 868            ) {
 869             roundMask = 0;
 870  overflow:
 871             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 872             if (    ( roundingMode == float_round_to_zero )
 873                  || ( zSign && ( roundingMode == float_round_up ) )
 874                  || ( ! zSign && ( roundingMode == float_round_down ) )
 875                ) {
 876                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 877             }
 878             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 879         }
 880         if ( zExp <= 0 ) {
 881             isTiny =
 882                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 883                 || ( zExp < 0 )
 884                 || ! increment
 885                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 886             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 887             zExp = 0;
 888             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 889             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 890             switch (roundingMode) {
 891             case float_round_nearest_even:
 892             case float_round_ties_away:
 893                 increment = ((int64_t)zSig1 < 0);
 894                 break;
 895             case float_round_to_zero:
 896                 increment = 0;
 897                 break;
 898             case float_round_up:
 899                 increment = !zSign && zSig1;
 900                 break;
 901             case float_round_down:
 902                 increment = zSign && zSig1;
 903                 break;
 904             default:
 905                 abort();
 906             }
 907             if ( increment ) {
 908                 ++zSig0;
 909                 zSig0 &=
 910                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 911                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
 912             }
 913             return packFloatx80( zSign, zExp, zSig0 );
 914         }
 915     }
 916     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 917     if ( increment ) {
 918         ++zSig0;
 919         if ( zSig0 == 0 ) {
 920             ++zExp;
 921             zSig0 = LIT64( 0x8000000000000000 );
 922         }
 923         else {
 924             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 925         }
 926     }
 927     else {
 928         if ( zSig0 == 0 ) zExp = 0;
 929     }
 930     return packFloatx80( zSign, zExp, zSig0 );
 931
 932 }
 933
 934 /*----------------------------------------------------------------------------
 935 | Takes an abstract floating-point value having sign `zSign', exponent
 936 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 937 | and returns the proper extended double-precision floating-point value
 938 | corresponding to the abstract input.  This routine is just like
 939 | `roundAndPackFloatx80' except that the input significand does not have to be
 940 | normalized.
 941 *----------------------------------------------------------------------------*/
 942
 943 static floatx80
 944  normalizeRoundAndPackFloatx80(
 945      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 946  STATUS_PARAM)
 947 {
 948     int8 shiftCount;
 949
 950     if ( zSig0 == 0 ) {
 951         zSig0 = zSig1;
 952         zSig1 = 0;
 953         zExp -= 64;
 954     }
 955     shiftCount = countLeadingZeros64( zSig0 );
 956     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 957     zExp -= shiftCount;
 958     return
 959         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 960
 961 }
 962
 963 /*----------------------------------------------------------------------------
 964 | Returns the least-significant 64 fraction bits of the quadruple-precision
 965 | floating-point value `a'.
 966 *----------------------------------------------------------------------------*/
 967
 968 static inline uint64_t extractFloat128Frac1( float128 a )
 969 {
 970
 971     return a.low;
 972
 973 }
 974
 975 /*----------------------------------------------------------------------------
 976 | Returns the most-significant 48 fraction bits of the quadruple-precision
 977 | floating-point value `a'.
 978 *----------------------------------------------------------------------------*/
 979
 980 static inline uint64_t extractFloat128Frac0( float128 a )
 981 {
 982
 983     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 984
 985 }
 986
 987 /*----------------------------------------------------------------------------
 988 | Returns the exponent bits of the quadruple-precision floating-point value
 989 | `a'.
 990 *----------------------------------------------------------------------------*/
 991
 992 static inline int32 extractFloat128Exp( float128 a )
 993 {
 994
 995     return ( a.high>>48 ) & 0x7FFF;
 996
 997 }
 998
 999 /*----------------------------------------------------------------------------
1000 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1001 *----------------------------------------------------------------------------*/
1002
1003 static inline flag extractFloat128Sign( float128 a )
1004 {
1005
1006     return a.high>>63;
1007
1008 }
1009
1010 /*----------------------------------------------------------------------------
1011 | Normalizes the subnormal quadruple-precision floating-point value
1012 | represented by the denormalized significand formed by the concatenation of
1013 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
1014 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
1015 | significand are stored at the location pointed to by `zSig0Ptr', and the
1016 | least significant 64 bits of the normalized significand are stored at the
1017 | location pointed to by `zSig1Ptr'.
1018 *----------------------------------------------------------------------------*/
1019
1020 static void
1021  normalizeFloat128Subnormal(
1022      uint64_t aSig0,
1023      uint64_t aSig1,
1024      int32 *zExpPtr,
1025      uint64_t *zSig0Ptr,
1026      uint64_t *zSig1Ptr
1027  )
1028 {
1029     int8 shiftCount;
1030
1031     if ( aSig0 == 0 ) {
1032         shiftCount = countLeadingZeros64( aSig1 ) - 15;
1033         if ( shiftCount < 0 ) {
1034             *zSig0Ptr = aSig1>>( - shiftCount );
1035             *zSig1Ptr = aSig1<<( shiftCount & 63 );
1036         }
1037         else {
1038             *zSig0Ptr = aSig1<<shiftCount;
1039             *zSig1Ptr = 0;
1040         }
1041         *zExpPtr = - shiftCount - 63;
1042     }
1043     else {
1044         shiftCount = countLeadingZeros64( aSig0 ) - 15;
1045         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1046         *zExpPtr = 1 - shiftCount;
1047     }
1048
1049 }
1050
1051 /*----------------------------------------------------------------------------
1052 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1053 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1054 | floating-point value, returning the result.  After being shifted into the
1055 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1056 | added together to form the most significant 32 bits of the result.  This
1057 | means that any integer portion of `zSig0' will be added into the exponent.
1058 | Since a properly normalized significand will have an integer portion equal
1059 | to 1, the `zExp' input should be 1 less than the desired result exponent
1060 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1061 | significand.
1062 *----------------------------------------------------------------------------*/
1063
1064 static inline float128
1065  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1066 {
1067     float128 z;
1068
1069     z.low = zSig1;
1070     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1071     return z;
1072
1073 }
1074
1075 /*----------------------------------------------------------------------------
1076 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1077 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1078 | and `zSig2', and returns the proper quadruple-precision floating-point value
1079 | corresponding to the abstract input.  Ordinarily, the abstract value is
1080 | simply rounded and packed into the quadruple-precision format, with the
1081 | inexact exception raised if the abstract input cannot be represented
1082 | exactly.  However, if the abstract value is too large, the overflow and
1083 | inexact exceptions are raised and an infinity or maximal finite value is
1084 | returned.  If the abstract value is too small, the input value is rounded to
1085 | a subnormal number, and the underflow and inexact exceptions are raised if
1086 | the abstract input cannot be represented exactly as a subnormal quadruple-
1087 | precision floating-point number.
1088 |     The input significand must be normalized or smaller.  If the input
1089 | significand is not normalized, `zExp' must be 0; in that case, the result
1090 | returned is a subnormal number, and it must not require rounding.  In the
1091 | usual case that the input significand is normalized, `zExp' must be 1 less
1092 | than the ``true'' floating-point exponent.  The handling of underflow and
1093 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1094 *----------------------------------------------------------------------------*/
1095
1096 static float128
1097  roundAndPackFloat128(
1098      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
1099 {
1100     int8 roundingMode;
1101     flag roundNearestEven, increment, isTiny;
1102
1103     roundingMode = STATUS(float_rounding_mode);
1104     roundNearestEven = ( roundingMode == float_round_nearest_even );
1105     switch (roundingMode) {
1106     case float_round_nearest_even:
1107     case float_round_ties_away:
1108         increment = ((int64_t)zSig2 < 0);
1109         break;
1110     case float_round_to_zero:
1111         increment = 0;
1112         break;
1113     case float_round_up:
1114         increment = !zSign && zSig2;
1115         break;
1116     case float_round_down:
1117         increment = zSign && zSig2;
1118         break;
1119     default:
1120         abort();
1121     }
1122     if ( 0x7FFD <= (uint32_t) zExp ) {
1123         if (    ( 0x7FFD < zExp )
1124              || (    ( zExp == 0x7FFD )
1125                   && eq128(
1126                          LIT64( 0x0001FFFFFFFFFFFF ),
1127                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1128                          zSig0,
1129                          zSig1
1130                      )
1131                   && increment
1132                 )
1133            ) {
1134             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1135             if (    ( roundingMode == float_round_to_zero )
1136                  || ( zSign && ( roundingMode == float_round_up ) )
1137                  || ( ! zSign && ( roundingMode == float_round_down ) )
1138                ) {
1139                 return
1140                     packFloat128(
1141                         zSign,
1142                         0x7FFE,
1143                         LIT64( 0x0000FFFFFFFFFFFF ),
1144                         LIT64( 0xFFFFFFFFFFFFFFFF )
1145                     );
1146             }
1147             return packFloat128( zSign, 0x7FFF, 0, 0 );
1148         }
1149         if ( zExp < 0 ) {
1150             if (STATUS(flush_to_zero)) {
1151                 float_raise(float_flag_output_denormal STATUS_VAR);
1152                 return packFloat128(zSign, 0, 0, 0);
1153             }
1154             isTiny =
1155                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1156                 || ( zExp < -1 )
1157                 || ! increment
1158                 || lt128(
1159                        zSig0,
1160                        zSig1,
1161                        LIT64( 0x0001FFFFFFFFFFFF ),
1162                        LIT64( 0xFFFFFFFFFFFFFFFF )
1163                    );
1164             shift128ExtraRightJamming(
1165                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1166             zExp = 0;
1167             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1168             switch (roundingMode) {
1169             case float_round_nearest_even:
1170             case float_round_ties_away:
1171                 increment = ((int64_t)zSig2 < 0);
1172                 break;
1173             case float_round_to_zero:
1174                 increment = 0;
1175                 break;
1176             case float_round_up:
1177                 increment = !zSign && zSig2;
1178                 break;
1179             case float_round_down:
1180                 increment = zSign && zSig2;
1181                 break;
1182             default:
1183                 abort();
1184             }
1185         }
1186     }
1187     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1188     if ( increment ) {
1189         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1190         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1191     }
1192     else {
1193         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1194     }
1195     return packFloat128( zSign, zExp, zSig0, zSig1 );
1196
1197 }
1198
1199 /*----------------------------------------------------------------------------
1200 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1201 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1202 | returns the proper quadruple-precision floating-point value corresponding
1203 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1204 | except that the input significand has fewer bits and does not have to be
1205 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1206 | point exponent.
1207 *----------------------------------------------------------------------------*/
1208
1209 static float128
1210  normalizeRoundAndPackFloat128(
1211      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1212 {
1213     int8 shiftCount;
1214     uint64_t zSig2;
1215
1216     if ( zSig0 == 0 ) {
1217         zSig0 = zSig1;
1218         zSig1 = 0;
1219         zExp -= 64;
1220     }
1221     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1222     if ( 0 <= shiftCount ) {
1223         zSig2 = 0;
1224         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1225     }
1226     else {
1227         shift128ExtraRightJamming(
1228             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1229     }
1230     zExp -= shiftCount;
1231     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1232
1233 }
1234
1235 /*----------------------------------------------------------------------------
1236 | Returns the result of converting the 32-bit two's complement integer `a'
1237 | to the single-precision floating-point format.  The conversion is performed
1238 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1239 *----------------------------------------------------------------------------*/
1240
1241 float32 int32_to_float32(int32_t a STATUS_PARAM)
1242 {
1243     flag zSign;
1244
1245     if ( a == 0 ) return float32_zero;
1246     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1247     zSign = ( a < 0 );
1248     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1249
1250 }
1251
1252 /*----------------------------------------------------------------------------
1253 | Returns the result of converting the 32-bit two's complement integer `a'
1254 | to the double-precision floating-point format.  The conversion is performed
1255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256 *----------------------------------------------------------------------------*/
1257
1258 float64 int32_to_float64(int32_t a STATUS_PARAM)
1259 {
1260     flag zSign;
1261     uint32 absA;
1262     int8 shiftCount;
1263     uint64_t zSig;
1264
1265     if ( a == 0 ) return float64_zero;
1266     zSign = ( a < 0 );
1267     absA = zSign ? - a : a;
1268     shiftCount = countLeadingZeros32( absA ) + 21;
1269     zSig = absA;
1270     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1271
1272 }
1273
1274 /*----------------------------------------------------------------------------
1275 | Returns the result of converting the 32-bit two's complement integer `a'
1276 | to the extended double-precision floating-point format.  The conversion
1277 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1278 | Arithmetic.
1279 *----------------------------------------------------------------------------*/
1280
1281 floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1282 {
1283     flag zSign;
1284     uint32 absA;
1285     int8 shiftCount;
1286     uint64_t zSig;
1287
1288     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1289     zSign = ( a < 0 );
1290     absA = zSign ? - a : a;
1291     shiftCount = countLeadingZeros32( absA ) + 32;
1292     zSig = absA;
1293     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1294
1295 }
1296
1297 /*----------------------------------------------------------------------------
1298 | Returns the result of converting the 32-bit two's complement integer `a' to
1299 | the quadruple-precision floating-point format.  The conversion is performed
1300 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1301 *----------------------------------------------------------------------------*/
1302
1303 float128 int32_to_float128(int32_t a STATUS_PARAM)
1304 {
1305     flag zSign;
1306     uint32 absA;
1307     int8 shiftCount;
1308     uint64_t zSig0;
1309
1310     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1311     zSign = ( a < 0 );
1312     absA = zSign ? - a : a;
1313     shiftCount = countLeadingZeros32( absA ) + 17;
1314     zSig0 = absA;
1315     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1316
1317 }
1318
1319 /*----------------------------------------------------------------------------
1320 | Returns the result of converting the 64-bit two's complement integer `a'
1321 | to the single-precision floating-point format.  The conversion is performed
1322 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1323 *----------------------------------------------------------------------------*/
1324
1325 float32 int64_to_float32(int64_t a STATUS_PARAM)
1326 {
1327     flag zSign;
1328     uint64 absA;
1329     int8 shiftCount;
1330
1331     if ( a == 0 ) return float32_zero;
1332     zSign = ( a < 0 );
1333     absA = zSign ? - a : a;
1334     shiftCount = countLeadingZeros64( absA ) - 40;
1335     if ( 0 <= shiftCount ) {
1336         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1337     }
1338     else {
1339         shiftCount += 7;
1340         if ( shiftCount < 0 ) {
1341             shift64RightJamming( absA, - shiftCount, &absA );
1342         }
1343         else {
1344             absA <<= shiftCount;
1345         }
1346         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1347     }
1348
1349 }
1350
1351 /*----------------------------------------------------------------------------
1352 | Returns the result of converting the 64-bit two's complement integer `a'
1353 | to the double-precision floating-point format.  The conversion is performed
1354 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1355 *----------------------------------------------------------------------------*/
1356
1357 float64 int64_to_float64(int64_t a STATUS_PARAM)
1358 {
1359     flag zSign;
1360
1361     if ( a == 0 ) return float64_zero;
1362     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1363         return packFloat64( 1, 0x43E, 0 );
1364     }
1365     zSign = ( a < 0 );
1366     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1367
1368 }
1369
1370 /*----------------------------------------------------------------------------
1371 | Returns the result of converting the 64-bit two's complement integer `a'
1372 | to the extended double-precision floating-point format.  The conversion
1373 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1374 | Arithmetic.
1375 *----------------------------------------------------------------------------*/
1376
1377 floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1378 {
1379     flag zSign;
1380     uint64 absA;
1381     int8 shiftCount;
1382
1383     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1384     zSign = ( a < 0 );
1385     absA = zSign ? - a : a;
1386     shiftCount = countLeadingZeros64( absA );
1387     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1388
1389 }
1390
1391 /*----------------------------------------------------------------------------
1392 | Returns the result of converting the 64-bit two's complement integer `a' to
1393 | the quadruple-precision floating-point format.  The conversion is performed
1394 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1395 *----------------------------------------------------------------------------*/
1396
1397 float128 int64_to_float128(int64_t a STATUS_PARAM)
1398 {
1399     flag zSign;
1400     uint64 absA;
1401     int8 shiftCount;
1402     int32 zExp;
1403     uint64_t zSig0, zSig1;
1404
1405     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1406     zSign = ( a < 0 );
1407     absA = zSign ? - a : a;
1408     shiftCount = countLeadingZeros64( absA ) + 49;
1409     zExp = 0x406E - shiftCount;
1410     if ( 64 <= shiftCount ) {
1411         zSig1 = 0;
1412         zSig0 = absA;
1413         shiftCount -= 64;
1414     }
1415     else {
1416         zSig1 = absA;
1417         zSig0 = 0;
1418     }
1419     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1420     return packFloat128( zSign, zExp, zSig0, zSig1 );
1421
1422 }
1423
1424 /*----------------------------------------------------------------------------
1425 | Returns the result of converting the 64-bit unsigned integer `a'
1426 | to the single-precision floating-point format.  The conversion is performed
1427 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1428 *----------------------------------------------------------------------------*/
1429
1430 float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1431 {
1432     int shiftcount;
1433
1434     if (a == 0) {
1435         return float32_zero;
1436     }
1437
1438     /* Determine (left) shift needed to put first set bit into bit posn 23
1439      * (since packFloat32() expects the binary point between bits 23 and 22);
1440      * this is the fast case for smallish numbers.
1441      */
1442     shiftcount = countLeadingZeros64(a) - 40;
1443     if (shiftcount >= 0) {
1444         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1445     }
1446     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1447      * expects the binary point between bits 30 and 29, hence the + 7.
1448      */
1449     shiftcount += 7;
1450     if (shiftcount < 0) {
1451         shift64RightJamming(a, -shiftcount, &a);
1452     } else {
1453         a <<= shiftcount;
1454     }
1455
1456     return roundAndPackFloat32(0, 0x9c - shiftcount, a STATUS_VAR);
1457 }
1458
1459 /*----------------------------------------------------------------------------
1460 | Returns the result of converting the 64-bit unsigned integer `a'
1461 | to the double-precision floating-point format.  The conversion is performed
1462 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1463 *----------------------------------------------------------------------------*/
1464
1465 float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1466 {
1467     int exp = 0x43C;
1468     int shiftcount;
1469
1470     if (a == 0) {
1471         return float64_zero;
1472     }
1473
1474     shiftcount = countLeadingZeros64(a) - 1;
1475     if (shiftcount < 0) {
1476         shift64RightJamming(a, -shiftcount, &a);
1477     } else {
1478         a <<= shiftcount;
1479     }
1480     return roundAndPackFloat64(0, exp - shiftcount, a STATUS_VAR);
1481 }
1482
1483 /*----------------------------------------------------------------------------
1484 | Returns the result of converting the 64-bit unsigned integer `a'
1485 | to the quadruple-precision floating-point format.  The conversion is performed
1486 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1487 *----------------------------------------------------------------------------*/
1488
1489 float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1490 {
1491     if (a == 0) {
1492         return float128_zero;
1493     }
1494     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1495 }
1496
1497 /*----------------------------------------------------------------------------
1498 | Returns the result of converting the single-precision floating-point value
1499 | `a' to the 32-bit two's complement integer format.  The conversion is
1500 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1501 | Arithmetic---which means in particular that the conversion is rounded
1502 | according to the current rounding mode.  If `a' is a NaN, the largest
1503 | positive integer is returned.  Otherwise, if the conversion overflows, the
1504 | largest integer with the same sign as `a' is returned.
1505 *----------------------------------------------------------------------------*/
1506
1507 int32 float32_to_int32( float32 a STATUS_PARAM )
1508 {
1509     flag aSign;
1510     int_fast16_t aExp, shiftCount;
1511     uint32_t aSig;
1512     uint64_t aSig64;
1513
1514     a = float32_squash_input_denormal(a STATUS_VAR);
1515     aSig = extractFloat32Frac( a );
1516     aExp = extractFloat32Exp( a );
1517     aSign = extractFloat32Sign( a );
1518     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1519     if ( aExp ) aSig |= 0x00800000;
1520     shiftCount = 0xAF - aExp;
1521     aSig64 = aSig;
1522     aSig64 <<= 32;
1523     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1524     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1525
1526 }
1527
1528 /*----------------------------------------------------------------------------
1529 | Returns the result of converting the single-precision floating-point value
1530 | `a' to the 32-bit two's complement integer format.  The conversion is
1531 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1532 | Arithmetic, except that the conversion is always rounded toward zero.
1533 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1534 | the conversion overflows, the largest integer with the same sign as `a' is
1535 | returned.
1536 *----------------------------------------------------------------------------*/
1537
1538 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1539 {
1540     flag aSign;
1541     int_fast16_t aExp, shiftCount;
1542     uint32_t aSig;
1543     int32_t z;
1544     a = float32_squash_input_denormal(a STATUS_VAR);
1545
1546     aSig = extractFloat32Frac( a );
1547     aExp = extractFloat32Exp( a );
1548     aSign = extractFloat32Sign( a );
1549     shiftCount = aExp - 0x9E;
1550     if ( 0 <= shiftCount ) {
1551         if ( float32_val(a) != 0xCF000000 ) {
1552             float_raise( float_flag_invalid STATUS_VAR);
1553             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1554         }
1555         return (int32_t) 0x80000000;
1556     }
1557     else if ( aExp <= 0x7E ) {
1558         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1559         return 0;
1560     }
1561     aSig = ( aSig | 0x00800000 )<<8;
1562     z = aSig>>( - shiftCount );
1563     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1564         STATUS(float_exception_flags) |= float_flag_inexact;
1565     }
1566     if ( aSign ) z = - z;
1567     return z;
1568
1569 }
1570
1571 /*----------------------------------------------------------------------------
1572 | Returns the result of converting the single-precision floating-point value
1573 | `a' to the 16-bit two's complement integer format.  The conversion is
1574 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1575 | Arithmetic, except that the conversion is always rounded toward zero.
1576 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1577 | the conversion overflows, the largest integer with the same sign as `a' is
1578 | returned.
1579 *----------------------------------------------------------------------------*/
1580
1581 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1582 {
1583     flag aSign;
1584     int_fast16_t aExp, shiftCount;
1585     uint32_t aSig;
1586     int32 z;
1587
1588     aSig = extractFloat32Frac( a );
1589     aExp = extractFloat32Exp( a );
1590     aSign = extractFloat32Sign( a );
1591     shiftCount = aExp - 0x8E;
1592     if ( 0 <= shiftCount ) {
1593         if ( float32_val(a) != 0xC7000000 ) {
1594             float_raise( float_flag_invalid STATUS_VAR);
1595             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1596                 return 0x7FFF;
1597             }
1598         }
1599         return (int32_t) 0xffff8000;
1600     }
1601     else if ( aExp <= 0x7E ) {
1602         if ( aExp | aSig ) {
1603             STATUS(float_exception_flags) |= float_flag_inexact;
1604         }
1605         return 0;
1606     }
1607     shiftCount -= 0x10;
1608     aSig = ( aSig | 0x00800000 )<<8;
1609     z = aSig>>( - shiftCount );
1610     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1611         STATUS(float_exception_flags) |= float_flag_inexact;
1612     }
1613     if ( aSign ) {
1614         z = - z;
1615     }
1616     return z;
1617
1618 }
1619
1620 /*----------------------------------------------------------------------------
1621 | Returns the result of converting the single-precision floating-point value
1622 | `a' to the 64-bit two's complement integer format.  The conversion is
1623 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1624 | Arithmetic---which means in particular that the conversion is rounded
1625 | according to the current rounding mode.  If `a' is a NaN, the largest
1626 | positive integer is returned.  Otherwise, if the conversion overflows, the
1627 | largest integer with the same sign as `a' is returned.
1628 *----------------------------------------------------------------------------*/
1629
1630 int64 float32_to_int64( float32 a STATUS_PARAM )
1631 {
1632     flag aSign;
1633     int_fast16_t aExp, shiftCount;
1634     uint32_t aSig;
1635     uint64_t aSig64, aSigExtra;
1636     a = float32_squash_input_denormal(a STATUS_VAR);
1637
1638     aSig = extractFloat32Frac( a );
1639     aExp = extractFloat32Exp( a );
1640     aSign = extractFloat32Sign( a );
1641     shiftCount = 0xBE - aExp;
1642     if ( shiftCount < 0 ) {
1643         float_raise( float_flag_invalid STATUS_VAR);
1644         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1645             return LIT64( 0x7FFFFFFFFFFFFFFF );
1646         }
1647         return (int64_t) LIT64( 0x8000000000000000 );
1648     }
1649     if ( aExp ) aSig |= 0x00800000;
1650     aSig64 = aSig;
1651     aSig64 <<= 40;
1652     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1653     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1654
1655 }
1656
1657 /*----------------------------------------------------------------------------
1658 | Returns the result of converting the single-precision floating-point value
1659 | `a' to the 64-bit unsigned integer format.  The conversion is
1660 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1661 | Arithmetic---which means in particular that the conversion is rounded
1662 | according to the current rounding mode.  If `a' is a NaN, the largest
1663 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1664 | largest unsigned integer is returned.  If the 'a' is negative, the result
1665 | is rounded and zero is returned; values that do not round to zero will
1666 | raise the inexact exception flag.
1667 *----------------------------------------------------------------------------*/
1668
1669 uint64 float32_to_uint64(float32 a STATUS_PARAM)
1670 {
1671     flag aSign;
1672     int_fast16_t aExp, shiftCount;
1673     uint32_t aSig;
1674     uint64_t aSig64, aSigExtra;
1675     a = float32_squash_input_denormal(a STATUS_VAR);
1676
1677     aSig = extractFloat32Frac(a);
1678     aExp = extractFloat32Exp(a);
1679     aSign = extractFloat32Sign(a);
1680     if ((aSign) && (aExp > 126)) {
1681         float_raise(float_flag_invalid STATUS_VAR);
1682         if (float32_is_any_nan(a)) {
1683             return LIT64(0xFFFFFFFFFFFFFFFF);
1684         } else {
1685             return 0;
1686         }
1687     }
1688     shiftCount = 0xBE - aExp;
1689     if (aExp) {
1690         aSig |= 0x00800000;
1691     }
1692     if (shiftCount < 0) {
1693         float_raise(float_flag_invalid STATUS_VAR);
1694         return LIT64(0xFFFFFFFFFFFFFFFF);
1695     }
1696
1697     aSig64 = aSig;
1698     aSig64 <<= 40;
1699     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1700     return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1701 }
1702
1703 /*----------------------------------------------------------------------------
1704 | Returns the result of converting the single-precision floating-point value
1705 | `a' to the 64-bit unsigned integer format.  The conversion is
1706 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1707 | Arithmetic, except that the conversion is always rounded toward zero.  If
1708 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1709 | conversion overflows, the largest unsigned integer is returned.  If the
1710 | 'a' is negative, the result is rounded and zero is returned; values that do
1711 | not round to zero will raise the inexact flag.
1712 *----------------------------------------------------------------------------*/
1713
1714 uint64 float32_to_uint64_round_to_zero(float32 a STATUS_PARAM)
1715 {
1716     signed char current_rounding_mode = STATUS(float_rounding_mode);
1717     set_float_rounding_mode(float_round_to_zero STATUS_VAR);
1718     int64_t v = float32_to_uint64(a STATUS_VAR);
1719     set_float_rounding_mode(current_rounding_mode STATUS_VAR);
1720     return v;
1721 }
1722
1723 /*----------------------------------------------------------------------------
1724 | Returns the result of converting the single-precision floating-point value
1725 | `a' to the 64-bit two's complement integer format.  The conversion is
1726 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1727 | Arithmetic, except that the conversion is always rounded toward zero.  If
1728 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1729 | conversion overflows, the largest integer with the same sign as `a' is
1730 | returned.
1731 *----------------------------------------------------------------------------*/
1732
1733 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1734 {
1735     flag aSign;
1736     int_fast16_t aExp, shiftCount;
1737     uint32_t aSig;
1738     uint64_t aSig64;
1739     int64 z;
1740     a = float32_squash_input_denormal(a STATUS_VAR);
1741
1742     aSig = extractFloat32Frac( a );
1743     aExp = extractFloat32Exp( a );
1744     aSign = extractFloat32Sign( a );
1745     shiftCount = aExp - 0xBE;
1746     if ( 0 <= shiftCount ) {
1747         if ( float32_val(a) != 0xDF000000 ) {
1748             float_raise( float_flag_invalid STATUS_VAR);
1749             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1750                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1751             }
1752         }
1753         return (int64_t) LIT64( 0x8000000000000000 );
1754     }
1755     else if ( aExp <= 0x7E ) {
1756         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1757         return 0;
1758     }
1759     aSig64 = aSig | 0x00800000;
1760     aSig64 <<= 40;
1761     z = aSig64>>( - shiftCount );
1762     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1763         STATUS(float_exception_flags) |= float_flag_inexact;
1764     }
1765     if ( aSign ) z = - z;
1766     return z;
1767
1768 }
1769
1770 /*----------------------------------------------------------------------------
1771 | Returns the result of converting the single-precision floating-point value
1772 | `a' to the double-precision floating-point format.  The conversion is
1773 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1774 | Arithmetic.
1775 *----------------------------------------------------------------------------*/
1776
1777 float64 float32_to_float64( float32 a STATUS_PARAM )
1778 {
1779     flag aSign;
1780     int_fast16_t aExp;
1781     uint32_t aSig;
1782     a = float32_squash_input_denormal(a STATUS_VAR);
1783
1784     aSig = extractFloat32Frac( a );
1785     aExp = extractFloat32Exp( a );
1786     aSign = extractFloat32Sign( a );
1787     if ( aExp == 0xFF ) {
1788         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1789         return packFloat64( aSign, 0x7FF, 0 );
1790     }
1791     if ( aExp == 0 ) {
1792         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1793         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1794         --aExp;
1795     }
1796     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1797
1798 }
1799
1800 /*----------------------------------------------------------------------------
1801 | Returns the result of converting the single-precision floating-point value
1802 | `a' to the extended double-precision floating-point format.  The conversion
1803 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1804 | Arithmetic.
1805 *----------------------------------------------------------------------------*/
1806
1807 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1808 {
1809     flag aSign;
1810     int_fast16_t aExp;
1811     uint32_t aSig;
1812
1813     a = float32_squash_input_denormal(a STATUS_VAR);
1814     aSig = extractFloat32Frac( a );
1815     aExp = extractFloat32Exp( a );
1816     aSign = extractFloat32Sign( a );
1817     if ( aExp == 0xFF ) {
1818         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1819         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1820     }
1821     if ( aExp == 0 ) {
1822         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1823         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1824     }
1825     aSig |= 0x00800000;
1826     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1827
1828 }
1829
1830 /*----------------------------------------------------------------------------
1831 | Returns the result of converting the single-precision floating-point value
1832 | `a' to the double-precision floating-point format.  The conversion is
1833 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1834 | Arithmetic.
1835 *----------------------------------------------------------------------------*/
1836
1837 float128 float32_to_float128( float32 a STATUS_PARAM )
1838 {
1839     flag aSign;
1840     int_fast16_t aExp;
1841     uint32_t aSig;
1842
1843     a = float32_squash_input_denormal(a STATUS_VAR);
1844     aSig = extractFloat32Frac( a );
1845     aExp = extractFloat32Exp( a );
1846     aSign = extractFloat32Sign( a );
1847     if ( aExp == 0xFF ) {
1848         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1849         return packFloat128( aSign, 0x7FFF, 0, 0 );
1850     }
1851     if ( aExp == 0 ) {
1852         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1853         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1854         --aExp;
1855     }
1856     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1857
1858 }
1859
1860 /*----------------------------------------------------------------------------
1861 | Rounds the single-precision floating-point value `a' to an integer, and
1862 | returns the result as a single-precision floating-point value.  The
1863 | operation is performed according to the IEC/IEEE Standard for Binary
1864 | Floating-Point Arithmetic.
1865 *----------------------------------------------------------------------------*/
1866
1867 float32 float32_round_to_int( float32 a STATUS_PARAM)
1868 {
1869     flag aSign;
1870     int_fast16_t aExp;
1871     uint32_t lastBitMask, roundBitsMask;
1872     uint32_t z;
1873     a = float32_squash_input_denormal(a STATUS_VAR);
1874
1875     aExp = extractFloat32Exp( a );
1876     if ( 0x96 <= aExp ) {
1877         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1878             return propagateFloat32NaN( a, a STATUS_VAR );
1879         }
1880         return a;
1881     }
1882     if ( aExp <= 0x7E ) {
1883         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1884         STATUS(float_exception_flags) |= float_flag_inexact;
1885         aSign = extractFloat32Sign( a );
1886         switch ( STATUS(float_rounding_mode) ) {
1887          case float_round_nearest_even:
1888             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1889                 return packFloat32( aSign, 0x7F, 0 );
1890             }
1891             break;
1892         case float_round_ties_away:
1893             if (aExp == 0x7E) {
1894                 return packFloat32(aSign, 0x7F, 0);
1895             }
1896             break;
1897          case float_round_down:
1898             return make_float32(aSign ? 0xBF800000 : 0);
1899          case float_round_up:
1900             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1901         }
1902         return packFloat32( aSign, 0, 0 );
1903     }
1904     lastBitMask = 1;
1905     lastBitMask <<= 0x96 - aExp;
1906     roundBitsMask = lastBitMask - 1;
1907     z = float32_val(a);
1908     switch (STATUS(float_rounding_mode)) {
1909     case float_round_nearest_even:
1910         z += lastBitMask>>1;
1911         if ((z & roundBitsMask) == 0) {
1912             z &= ~lastBitMask;
1913         }
1914         break;
1915     case float_round_ties_away:
1916         z += lastBitMask >> 1;
1917         break;
1918     case float_round_to_zero:
1919         break;
1920     case float_round_up:
1921         if (!extractFloat32Sign(make_float32(z))) {
1922             z += roundBitsMask;
1923         }
1924         break;
1925     case float_round_down:
1926         if (extractFloat32Sign(make_float32(z))) {
1927             z += roundBitsMask;
1928         }
1929         break;
1930     default:
1931         abort();
1932     }
1933     z &= ~ roundBitsMask;
1934     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1935     return make_float32(z);
1936
1937 }
1938
1939 /*----------------------------------------------------------------------------
1940 | Returns the result of adding the absolute values of the single-precision
1941 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1942 | before being returned.  `zSign' is ignored if the result is a NaN.
1943 | The addition is performed according to the IEC/IEEE Standard for Binary
1944 | Floating-Point Arithmetic.
1945 *----------------------------------------------------------------------------*/
1946
1947 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1948 {
1949     int_fast16_t aExp, bExp, zExp;
1950     uint32_t aSig, bSig, zSig;
1951     int_fast16_t expDiff;
1952
1953     aSig = extractFloat32Frac( a );
1954     aExp = extractFloat32Exp( a );
1955     bSig = extractFloat32Frac( b );
1956     bExp = extractFloat32Exp( b );
1957     expDiff = aExp - bExp;
1958     aSig <<= 6;
1959     bSig <<= 6;
1960     if ( 0 < expDiff ) {
1961         if ( aExp == 0xFF ) {
1962             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1963             return a;
1964         }
1965         if ( bExp == 0 ) {
1966             --expDiff;
1967         }
1968         else {
1969             bSig |= 0x20000000;
1970         }
1971         shift32RightJamming( bSig, expDiff, &bSig );
1972         zExp = aExp;
1973     }
1974     else if ( expDiff < 0 ) {
1975         if ( bExp == 0xFF ) {
1976             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1977             return packFloat32( zSign, 0xFF, 0 );
1978         }
1979         if ( aExp == 0 ) {
1980             ++expDiff;
1981         }
1982         else {
1983             aSig |= 0x20000000;
1984         }
1985         shift32RightJamming( aSig, - expDiff, &aSig );
1986         zExp = bExp;
1987     }
1988     else {
1989         if ( aExp == 0xFF ) {
1990             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1991             return a;
1992         }
1993         if ( aExp == 0 ) {
1994             if (STATUS(flush_to_zero)) {
1995                 if (aSig | bSig) {
1996                     float_raise(float_flag_output_denormal STATUS_VAR);
1997                 }
1998                 return packFloat32(zSign, 0, 0);
1999             }
2000             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2001         }
2002         zSig = 0x40000000 + aSig + bSig;
2003         zExp = aExp;
2004         goto roundAndPack;
2005     }
2006     aSig |= 0x20000000;
2007     zSig = ( aSig + bSig )<<1;
2008     --zExp;
2009     if ( (int32_t) zSig < 0 ) {
2010         zSig = aSig + bSig;
2011         ++zExp;
2012     }
2013  roundAndPack:
2014     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2015
2016 }
2017
2018 /*----------------------------------------------------------------------------
2019 | Returns the result of subtracting the absolute values of the single-
2020 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2021 | difference is negated before being returned.  `zSign' is ignored if the
2022 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2023 | Standard for Binary Floating-Point Arithmetic.
2024 *----------------------------------------------------------------------------*/
2025
2026 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
2027 {
2028     int_fast16_t aExp, bExp, zExp;
2029     uint32_t aSig, bSig, zSig;
2030     int_fast16_t expDiff;
2031
2032     aSig = extractFloat32Frac( a );
2033     aExp = extractFloat32Exp( a );
2034     bSig = extractFloat32Frac( b );
2035     bExp = extractFloat32Exp( b );
2036     expDiff = aExp - bExp;
2037     aSig <<= 7;
2038     bSig <<= 7;
2039     if ( 0 < expDiff ) goto aExpBigger;
2040     if ( expDiff < 0 ) goto bExpBigger;
2041     if ( aExp == 0xFF ) {
2042         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2043         float_raise( float_flag_invalid STATUS_VAR);
2044         return float32_default_nan;
2045     }
2046     if ( aExp == 0 ) {
2047         aExp = 1;
2048         bExp = 1;
2049     }
2050     if ( bSig < aSig ) goto aBigger;
2051     if ( aSig < bSig ) goto bBigger;
2052     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2053  bExpBigger:
2054     if ( bExp == 0xFF ) {
2055         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2056         return packFloat32( zSign ^ 1, 0xFF, 0 );
2057     }
2058     if ( aExp == 0 ) {
2059         ++expDiff;
2060     }
2061     else {
2062         aSig |= 0x40000000;
2063     }
2064     shift32RightJamming( aSig, - expDiff, &aSig );
2065     bSig |= 0x40000000;
2066  bBigger:
2067     zSig = bSig - aSig;
2068     zExp = bExp;
2069     zSign ^= 1;
2070     goto normalizeRoundAndPack;
2071  aExpBigger:
2072     if ( aExp == 0xFF ) {
2073         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2074         return a;
2075     }
2076     if ( bExp == 0 ) {
2077         --expDiff;
2078     }
2079     else {
2080         bSig |= 0x40000000;
2081     }
2082     shift32RightJamming( bSig, expDiff, &bSig );
2083     aSig |= 0x40000000;
2084  aBigger:
2085     zSig = aSig - bSig;
2086     zExp = aExp;
2087  normalizeRoundAndPack:
2088     --zExp;
2089     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2090
2091 }
2092
2093 /*----------------------------------------------------------------------------
2094 | Returns the result of adding the single-precision floating-point values `a'
2095 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2096 | Binary Floating-Point Arithmetic.
2097 *----------------------------------------------------------------------------*/
2098
2099 float32 float32_add( float32 a, float32 b STATUS_PARAM )
2100 {
2101     flag aSign, bSign;
2102     a = float32_squash_input_denormal(a STATUS_VAR);
2103     b = float32_squash_input_denormal(b STATUS_VAR);
2104
2105     aSign = extractFloat32Sign( a );
2106     bSign = extractFloat32Sign( b );
2107     if ( aSign == bSign ) {
2108         return addFloat32Sigs( a, b, aSign STATUS_VAR);
2109     }
2110     else {
2111         return subFloat32Sigs( a, b, aSign STATUS_VAR );
2112     }
2113
2114 }
2115
2116 /*----------------------------------------------------------------------------
2117 | Returns the result of subtracting the single-precision floating-point values
2118 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2119 | for Binary Floating-Point Arithmetic.
2120 *----------------------------------------------------------------------------*/
2121
2122 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
2123 {
2124     flag aSign, bSign;
2125     a = float32_squash_input_denormal(a STATUS_VAR);
2126     b = float32_squash_input_denormal(b STATUS_VAR);
2127
2128     aSign = extractFloat32Sign( a );
2129     bSign = extractFloat32Sign( b );
2130     if ( aSign == bSign ) {
2131         return subFloat32Sigs( a, b, aSign STATUS_VAR );
2132     }
2133     else {
2134         return addFloat32Sigs( a, b, aSign STATUS_VAR );
2135     }
2136
2137 }
2138
2139 /*----------------------------------------------------------------------------
2140 | Returns the result of multiplying the single-precision floating-point values
2141 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2142 | for Binary Floating-Point Arithmetic.
2143 *----------------------------------------------------------------------------*/
2144
2145 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
2146 {
2147     flag aSign, bSign, zSign;
2148     int_fast16_t aExp, bExp, zExp;
2149     uint32_t aSig, bSig;
2150     uint64_t zSig64;
2151     uint32_t zSig;
2152
2153     a = float32_squash_input_denormal(a STATUS_VAR);
2154     b = float32_squash_input_denormal(b STATUS_VAR);
2155
2156     aSig = extractFloat32Frac( a );
2157     aExp = extractFloat32Exp( a );
2158     aSign = extractFloat32Sign( a );
2159     bSig = extractFloat32Frac( b );
2160     bExp = extractFloat32Exp( b );
2161     bSign = extractFloat32Sign( b );
2162     zSign = aSign ^ bSign;
2163     if ( aExp == 0xFF ) {
2164         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2165             return propagateFloat32NaN( a, b STATUS_VAR );
2166         }
2167         if ( ( bExp | bSig ) == 0 ) {
2168             float_raise( float_flag_invalid STATUS_VAR);
2169             return float32_default_nan;
2170         }
2171         return packFloat32( zSign, 0xFF, 0 );
2172     }
2173     if ( bExp == 0xFF ) {
2174         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2175         if ( ( aExp | aSig ) == 0 ) {
2176             float_raise( float_flag_invalid STATUS_VAR);
2177             return float32_default_nan;
2178         }
2179         return packFloat32( zSign, 0xFF, 0 );
2180     }
2181     if ( aExp == 0 ) {
2182         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2183         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2184     }
2185     if ( bExp == 0 ) {
2186         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2187         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2188     }
2189     zExp = aExp + bExp - 0x7F;
2190     aSig = ( aSig | 0x00800000 )<<7;
2191     bSig = ( bSig | 0x00800000 )<<8;
2192     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2193     zSig = zSig64;
2194     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2195         zSig <<= 1;
2196         --zExp;
2197     }
2198     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2199
2200 }
2201
2202 /*----------------------------------------------------------------------------
2203 | Returns the result of dividing the single-precision floating-point value `a'
2204 | by the corresponding value `b'.  The operation is performed according to the
2205 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2206 *----------------------------------------------------------------------------*/
2207
2208 float32 float32_div( float32 a, float32 b STATUS_PARAM )
2209 {
2210     flag aSign, bSign, zSign;
2211     int_fast16_t aExp, bExp, zExp;
2212     uint32_t aSig, bSig, zSig;
2213     a = float32_squash_input_denormal(a STATUS_VAR);
2214     b = float32_squash_input_denormal(b STATUS_VAR);
2215
2216     aSig = extractFloat32Frac( a );
2217     aExp = extractFloat32Exp( a );
2218     aSign = extractFloat32Sign( a );
2219     bSig = extractFloat32Frac( b );
2220     bExp = extractFloat32Exp( b );
2221     bSign = extractFloat32Sign( b );
2222     zSign = aSign ^ bSign;
2223     if ( aExp == 0xFF ) {
2224         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2225         if ( bExp == 0xFF ) {
2226             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2227             float_raise( float_flag_invalid STATUS_VAR);
2228             return float32_default_nan;
2229         }
2230         return packFloat32( zSign, 0xFF, 0 );
2231     }
2232     if ( bExp == 0xFF ) {
2233         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2234         return packFloat32( zSign, 0, 0 );
2235     }
2236     if ( bExp == 0 ) {
2237         if ( bSig == 0 ) {
2238             if ( ( aExp | aSig ) == 0 ) {
2239                 float_raise( float_flag_invalid STATUS_VAR);
2240                 return float32_default_nan;
2241             }
2242             float_raise( float_flag_divbyzero STATUS_VAR);
2243             return packFloat32( zSign, 0xFF, 0 );
2244         }
2245         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2246     }
2247     if ( aExp == 0 ) {
2248         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2249         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2250     }
2251     zExp = aExp - bExp + 0x7D;
2252     aSig = ( aSig | 0x00800000 )<<7;
2253     bSig = ( bSig | 0x00800000 )<<8;
2254     if ( bSig <= ( aSig + aSig ) ) {
2255         aSig >>= 1;
2256         ++zExp;
2257     }
2258     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2259     if ( ( zSig & 0x3F ) == 0 ) {
2260         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2261     }
2262     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2263
2264 }
2265
2266 /*----------------------------------------------------------------------------
2267 | Returns the remainder of the single-precision floating-point value `a'
2268 | with respect to the corresponding value `b'.  The operation is performed
2269 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2270 *----------------------------------------------------------------------------*/
2271
2272 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2273 {
2274     flag aSign, zSign;
2275     int_fast16_t aExp, bExp, expDiff;
2276     uint32_t aSig, bSig;
2277     uint32_t q;
2278     uint64_t aSig64, bSig64, q64;
2279     uint32_t alternateASig;
2280     int32_t sigMean;
2281     a = float32_squash_input_denormal(a STATUS_VAR);
2282     b = float32_squash_input_denormal(b STATUS_VAR);
2283
2284     aSig = extractFloat32Frac( a );
2285     aExp = extractFloat32Exp( a );
2286     aSign = extractFloat32Sign( a );
2287     bSig = extractFloat32Frac( b );
2288     bExp = extractFloat32Exp( b );
2289     if ( aExp == 0xFF ) {
2290         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2291             return propagateFloat32NaN( a, b STATUS_VAR );
2292         }
2293         float_raise( float_flag_invalid STATUS_VAR);
2294         return float32_default_nan;
2295     }
2296     if ( bExp == 0xFF ) {
2297         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2298         return a;
2299     }
2300     if ( bExp == 0 ) {
2301         if ( bSig == 0 ) {
2302             float_raise( float_flag_invalid STATUS_VAR);
2303             return float32_default_nan;
2304         }
2305         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2306     }
2307     if ( aExp == 0 ) {
2308         if ( aSig == 0 ) return a;
2309         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2310     }
2311     expDiff = aExp - bExp;
2312     aSig |= 0x00800000;
2313     bSig |= 0x00800000;
2314     if ( expDiff < 32 ) {
2315         aSig <<= 8;
2316         bSig <<= 8;
2317         if ( expDiff < 0 ) {
2318             if ( expDiff < -1 ) return a;
2319             aSig >>= 1;
2320         }
2321         q = ( bSig <= aSig );
2322         if ( q ) aSig -= bSig;
2323         if ( 0 < expDiff ) {
2324             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2325             q >>= 32 - expDiff;
2326             bSig >>= 2;
2327             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2328         }
2329         else {
2330             aSig >>= 2;
2331             bSig >>= 2;
2332         }
2333     }
2334     else {
2335         if ( bSig <= aSig ) aSig -= bSig;
2336         aSig64 = ( (uint64_t) aSig )<<40;
2337         bSig64 = ( (uint64_t) bSig )<<40;
2338         expDiff -= 64;
2339         while ( 0 < expDiff ) {
2340             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2341             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2342             aSig64 = - ( ( bSig * q64 )<<38 );
2343             expDiff -= 62;
2344         }
2345         expDiff += 64;
2346         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2347         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2348         q = q64>>( 64 - expDiff );
2349         bSig <<= 6;
2350         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2351     }
2352     do {
2353         alternateASig = aSig;
2354         ++q;
2355         aSig -= bSig;
2356     } while ( 0 <= (int32_t) aSig );
2357     sigMean = aSig + alternateASig;
2358     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2359         aSig = alternateASig;
2360     }
2361     zSign = ( (int32_t) aSig < 0 );
2362     if ( zSign ) aSig = - aSig;
2363     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2364
2365 }
2366
2367 /*----------------------------------------------------------------------------
2368 | Returns the result of multiplying the single-precision floating-point values
2369 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2370 | multiplication.  The operation is performed according to the IEC/IEEE
2371 | Standard for Binary Floating-Point Arithmetic 754-2008.
2372 | The flags argument allows the caller to select negation of the
2373 | addend, the intermediate product, or the final result. (The difference
2374 | between this and having the caller do a separate negation is that negating
2375 | externally will flip the sign bit on NaNs.)
2376 *----------------------------------------------------------------------------*/
2377
2378 float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2379 {
2380     flag aSign, bSign, cSign, zSign;
2381     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2382     uint32_t aSig, bSig, cSig;
2383     flag pInf, pZero, pSign;
2384     uint64_t pSig64, cSig64, zSig64;
2385     uint32_t pSig;
2386     int shiftcount;
2387     flag signflip, infzero;
2388
2389     a = float32_squash_input_denormal(a STATUS_VAR);
2390     b = float32_squash_input_denormal(b STATUS_VAR);
2391     c = float32_squash_input_denormal(c STATUS_VAR);
2392     aSig = extractFloat32Frac(a);
2393     aExp = extractFloat32Exp(a);
2394     aSign = extractFloat32Sign(a);
2395     bSig = extractFloat32Frac(b);
2396     bExp = extractFloat32Exp(b);
2397     bSign = extractFloat32Sign(b);
2398     cSig = extractFloat32Frac(c);
2399     cExp = extractFloat32Exp(c);
2400     cSign = extractFloat32Sign(c);
2401
2402     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2403                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2404
2405     /* It is implementation-defined whether the cases of (0,inf,qnan)
2406      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2407      * they return if they do), so we have to hand this information
2408      * off to the target-specific pick-a-NaN routine.
2409      */
2410     if (((aExp == 0xff) && aSig) ||
2411         ((bExp == 0xff) && bSig) ||
2412         ((cExp == 0xff) && cSig)) {
2413         return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2414     }
2415
2416     if (infzero) {
2417         float_raise(float_flag_invalid STATUS_VAR);
2418         return float32_default_nan;
2419     }
2420
2421     if (flags & float_muladd_negate_c) {
2422         cSign ^= 1;
2423     }
2424
2425     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2426
2427     /* Work out the sign and type of the product */
2428     pSign = aSign ^ bSign;
2429     if (flags & float_muladd_negate_product) {
2430         pSign ^= 1;
2431     }
2432     pInf = (aExp == 0xff) || (bExp == 0xff);
2433     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2434
2435     if (cExp == 0xff) {
2436         if (pInf && (pSign ^ cSign)) {
2437             /* addition of opposite-signed infinities => InvalidOperation */
2438             float_raise(float_flag_invalid STATUS_VAR);
2439             return float32_default_nan;
2440         }
2441         /* Otherwise generate an infinity of the same sign */
2442         return packFloat32(cSign ^ signflip, 0xff, 0);
2443     }
2444
2445     if (pInf) {
2446         return packFloat32(pSign ^ signflip, 0xff, 0);
2447     }
2448
2449     if (pZero) {
2450         if (cExp == 0) {
2451             if (cSig == 0) {
2452                 /* Adding two exact zeroes */
2453                 if (pSign == cSign) {
2454                     zSign = pSign;
2455                 } else if (STATUS(float_rounding_mode) == float_round_down) {
2456                     zSign = 1;
2457                 } else {
2458                     zSign = 0;
2459                 }
2460                 return packFloat32(zSign ^ signflip, 0, 0);
2461             }
2462             /* Exact zero plus a denorm */
2463             if (STATUS(flush_to_zero)) {
2464                 float_raise(float_flag_output_denormal STATUS_VAR);
2465                 return packFloat32(cSign ^ signflip, 0, 0);
2466             }
2467         }
2468         /* Zero plus something non-zero : just return the something */
2469         if (flags & float_muladd_halve_result) {
2470             if (cExp == 0) {
2471                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2472             }
2473             /* Subtract one to halve, and one again because roundAndPackFloat32
2474              * wants one less than the true exponent.
2475              */
2476             cExp -= 2;
2477             cSig = (cSig | 0x00800000) << 7;
2478             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig STATUS_VAR);
2479         }
2480         return packFloat32(cSign ^ signflip, cExp, cSig);
2481     }
2482
2483     if (aExp == 0) {
2484         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2485     }
2486     if (bExp == 0) {
2487         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2488     }
2489
2490     /* Calculate the actual result a * b + c */
2491
2492     /* Multiply first; this is easy. */
2493     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2494      * because we want the true exponent, not the "one-less-than"
2495      * flavour that roundAndPackFloat32() takes.
2496      */
2497     pExp = aExp + bExp - 0x7e;
2498     aSig = (aSig | 0x00800000) << 7;
2499     bSig = (bSig | 0x00800000) << 8;
2500     pSig64 = (uint64_t)aSig * bSig;
2501     if ((int64_t)(pSig64 << 1) >= 0) {
2502         pSig64 <<= 1;
2503         pExp--;
2504     }
2505
2506     zSign = pSign ^ signflip;
2507
2508     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2509      * position 62.
2510      */
2511     if (cExp == 0) {
2512         if (!cSig) {
2513             /* Throw out the special case of c being an exact zero now */
2514             shift64RightJamming(pSig64, 32, &pSig64);
2515             pSig = pSig64;
2516             if (flags & float_muladd_halve_result) {
2517                 pExp--;
2518             }
2519             return roundAndPackFloat32(zSign, pExp - 1,
2520                                        pSig STATUS_VAR);
2521         }
2522         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2523     }
2524
2525     cSig64 = (uint64_t)cSig << (62 - 23);
2526     cSig64 |= LIT64(0x4000000000000000);
2527     expDiff = pExp - cExp;
2528
2529     if (pSign == cSign) {
2530         /* Addition */
2531         if (expDiff > 0) {
2532             /* scale c to match p */
2533             shift64RightJamming(cSig64, expDiff, &cSig64);
2534             zExp = pExp;
2535         } else if (expDiff < 0) {
2536             /* scale p to match c */
2537             shift64RightJamming(pSig64, -expDiff, &pSig64);
2538             zExp = cExp;
2539         } else {
2540             /* no scaling needed */
2541             zExp = cExp;
2542         }
2543         /* Add significands and make sure explicit bit ends up in posn 62 */
2544         zSig64 = pSig64 + cSig64;
2545         if ((int64_t)zSig64 < 0) {
2546             shift64RightJamming(zSig64, 1, &zSig64);
2547         } else {
2548             zExp--;
2549         }
2550     } else {
2551         /* Subtraction */
2552         if (expDiff > 0) {
2553             shift64RightJamming(cSig64, expDiff, &cSig64);
2554             zSig64 = pSig64 - cSig64;
2555             zExp = pExp;
2556         } else if (expDiff < 0) {
2557             shift64RightJamming(pSig64, -expDiff, &pSig64);
2558             zSig64 = cSig64 - pSig64;
2559             zExp = cExp;
2560             zSign ^= 1;
2561         } else {
2562             zExp = pExp;
2563             if (cSig64 < pSig64) {
2564                 zSig64 = pSig64 - cSig64;
2565             } else if (pSig64 < cSig64) {
2566                 zSig64 = cSig64 - pSig64;
2567                 zSign ^= 1;
2568             } else {
2569                 /* Exact zero */
2570                 zSign = signflip;
2571                 if (STATUS(float_rounding_mode) == float_round_down) {
2572                     zSign ^= 1;
2573                 }
2574                 return packFloat32(zSign, 0, 0);
2575             }
2576         }
2577         --zExp;
2578         /* Normalize to put the explicit bit back into bit 62. */
2579         shiftcount = countLeadingZeros64(zSig64) - 1;
2580         zSig64 <<= shiftcount;
2581         zExp -= shiftcount;
2582     }
2583     if (flags & float_muladd_halve_result) {
2584         zExp--;
2585     }
2586
2587     shift64RightJamming(zSig64, 32, &zSig64);
2588     return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2589 }
2590
2591
2592 /*----------------------------------------------------------------------------
2593 | Returns the square root of the single-precision floating-point value `a'.
2594 | The operation is performed according to the IEC/IEEE Standard for Binary
2595 | Floating-Point Arithmetic.
2596 *----------------------------------------------------------------------------*/
2597
2598 float32 float32_sqrt( float32 a STATUS_PARAM )
2599 {
2600     flag aSign;
2601     int_fast16_t aExp, zExp;
2602     uint32_t aSig, zSig;
2603     uint64_t rem, term;
2604     a = float32_squash_input_denormal(a STATUS_VAR);
2605
2606     aSig = extractFloat32Frac( a );
2607     aExp = extractFloat32Exp( a );
2608     aSign = extractFloat32Sign( a );
2609     if ( aExp == 0xFF ) {
2610         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2611         if ( ! aSign ) return a;
2612         float_raise( float_flag_invalid STATUS_VAR);
2613         return float32_default_nan;
2614     }
2615     if ( aSign ) {
2616         if ( ( aExp | aSig ) == 0 ) return a;
2617         float_raise( float_flag_invalid STATUS_VAR);
2618         return float32_default_nan;
2619     }
2620     if ( aExp == 0 ) {
2621         if ( aSig == 0 ) return float32_zero;
2622         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2623     }
2624     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2625     aSig = ( aSig | 0x00800000 )<<8;
2626     zSig = estimateSqrt32( aExp, aSig ) + 2;
2627     if ( ( zSig & 0x7F ) <= 5 ) {
2628         if ( zSig < 2 ) {
2629             zSig = 0x7FFFFFFF;
2630             goto roundAndPack;
2631         }
2632         aSig >>= aExp & 1;
2633         term = ( (uint64_t) zSig ) * zSig;
2634         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2635         while ( (int64_t) rem < 0 ) {
2636             --zSig;
2637             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2638         }
2639         zSig |= ( rem != 0 );
2640     }
2641     shift32RightJamming( zSig, 1, &zSig );
2642  roundAndPack:
2643     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2644
2645 }
2646
2647 /*----------------------------------------------------------------------------
2648 | Returns the binary exponential of the single-precision floating-point value
2649 | `a'. The operation is performed according to the IEC/IEEE Standard for
2650 | Binary Floating-Point Arithmetic.
2651 |
2652 | Uses the following identities:
2653 |
2654 | 1. -------------------------------------------------------------------------
2655 |      x    x*ln(2)
2656 |     2  = e
2657 |
2658 | 2. -------------------------------------------------------------------------
2659 |                      2     3     4     5           n
2660 |      x        x     x     x     x     x           x
2661 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2662 |               1!    2!    3!    4!    5!          n!
2663 *----------------------------------------------------------------------------*/
2664
2665 static const float64 float32_exp2_coefficients[15] =
2666 {
2667     const_float64( 0x3ff0000000000000ll ), /*  1 */
2668     const_float64( 0x3fe0000000000000ll ), /*  2 */
2669     const_float64( 0x3fc5555555555555ll ), /*  3 */
2670     const_float64( 0x3fa5555555555555ll ), /*  4 */
2671     const_float64( 0x3f81111111111111ll ), /*  5 */
2672     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2673     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2674     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2675     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2676     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2677     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2678     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2679     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2680     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2681     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2682 };
2683
2684 float32 float32_exp2( float32 a STATUS_PARAM )
2685 {
2686     flag aSign;
2687     int_fast16_t aExp;
2688     uint32_t aSig;
2689     float64 r, x, xn;
2690     int i;
2691     a = float32_squash_input_denormal(a STATUS_VAR);
2692
2693     aSig = extractFloat32Frac( a );
2694     aExp = extractFloat32Exp( a );
2695     aSign = extractFloat32Sign( a );
2696
2697     if ( aExp == 0xFF) {
2698         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2699         return (aSign) ? float32_zero : a;
2700     }
2701     if (aExp == 0) {
2702         if (aSig == 0) return float32_one;
2703     }
2704
2705     float_raise( float_flag_inexact STATUS_VAR);
2706
2707     /* ******************************* */
2708     /* using float64 for approximation */
2709     /* ******************************* */
2710     x = float32_to_float64(a STATUS_VAR);
2711     x = float64_mul(x, float64_ln2 STATUS_VAR);
2712
2713     xn = x;
2714     r = float64_one;
2715     for (i = 0 ; i < 15 ; i++) {
2716         float64 f;
2717
2718         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2719         r = float64_add(r, f STATUS_VAR);
2720
2721         xn = float64_mul(xn, x STATUS_VAR);
2722     }
2723
2724     return float64_to_float32(r, status);
2725 }
2726
2727 /*----------------------------------------------------------------------------
2728 | Returns the binary log of the single-precision floating-point value `a'.
2729 | The operation is performed according to the IEC/IEEE Standard for Binary
2730 | Floating-Point Arithmetic.
2731 *----------------------------------------------------------------------------*/
2732 float32 float32_log2( float32 a STATUS_PARAM )
2733 {
2734     flag aSign, zSign;
2735     int_fast16_t aExp;
2736     uint32_t aSig, zSig, i;
2737
2738     a = float32_squash_input_denormal(a STATUS_VAR);
2739     aSig = extractFloat32Frac( a );
2740     aExp = extractFloat32Exp( a );
2741     aSign = extractFloat32Sign( a );
2742
2743     if ( aExp == 0 ) {
2744         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2745         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2746     }
2747     if ( aSign ) {
2748         float_raise( float_flag_invalid STATUS_VAR);
2749         return float32_default_nan;
2750     }
2751     if ( aExp == 0xFF ) {
2752         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2753         return a;
2754     }
2755
2756     aExp -= 0x7F;
2757     aSig |= 0x00800000;
2758     zSign = aExp < 0;
2759     zSig = aExp << 23;
2760
2761     for (i = 1 << 22; i > 0; i >>= 1) {
2762         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2763         if ( aSig & 0x01000000 ) {
2764             aSig >>= 1;
2765             zSig |= i;
2766         }
2767     }
2768
2769     if ( zSign )
2770         zSig = -zSig;
2771
2772     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2773 }
2774
2775 /*----------------------------------------------------------------------------
2776 | Returns 1 if the single-precision floating-point value `a' is equal to
2777 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2778 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2779 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2780 *----------------------------------------------------------------------------*/
2781
2782 int float32_eq( float32 a, float32 b STATUS_PARAM )
2783 {
2784     uint32_t av, bv;
2785     a = float32_squash_input_denormal(a STATUS_VAR);
2786     b = float32_squash_input_denormal(b STATUS_VAR);
2787
2788     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2789          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2790        ) {
2791         float_raise( float_flag_invalid STATUS_VAR);
2792         return 0;
2793     }
2794     av = float32_val(a);
2795     bv = float32_val(b);
2796     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2797 }
2798
2799 /*----------------------------------------------------------------------------
2800 | Returns 1 if the single-precision floating-point value `a' is less than
2801 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2802 | exception is raised if either operand is a NaN.  The comparison is performed
2803 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2804 *----------------------------------------------------------------------------*/
2805
2806 int float32_le( float32 a, float32 b STATUS_PARAM )
2807 {
2808     flag aSign, bSign;
2809     uint32_t av, bv;
2810     a = float32_squash_input_denormal(a STATUS_VAR);
2811     b = float32_squash_input_denormal(b STATUS_VAR);
2812
2813     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2814          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2815        ) {
2816         float_raise( float_flag_invalid STATUS_VAR);
2817         return 0;
2818     }
2819     aSign = extractFloat32Sign( a );
2820     bSign = extractFloat32Sign( b );
2821     av = float32_val(a);
2822     bv = float32_val(b);
2823     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2824     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2825
2826 }
2827
2828 /*----------------------------------------------------------------------------
2829 | Returns 1 if the single-precision floating-point value `a' is less than
2830 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2831 | raised if either operand is a NaN.  The comparison is performed according
2832 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2833 *----------------------------------------------------------------------------*/
2834
2835 int float32_lt( float32 a, float32 b STATUS_PARAM )
2836 {
2837     flag aSign, bSign;
2838     uint32_t av, bv;
2839     a = float32_squash_input_denormal(a STATUS_VAR);
2840     b = float32_squash_input_denormal(b STATUS_VAR);
2841
2842     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2843          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2844        ) {
2845         float_raise( float_flag_invalid STATUS_VAR);
2846         return 0;
2847     }
2848     aSign = extractFloat32Sign( a );
2849     bSign = extractFloat32Sign( b );
2850     av = float32_val(a);
2851     bv = float32_val(b);
2852     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2853     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2854
2855 }
2856
2857 /*----------------------------------------------------------------------------
2858 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2859 | be compared, and 0 otherwise.  The invalid exception is raised if either
2860 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2861 | Standard for Binary Floating-Point Arithmetic.
2862 *----------------------------------------------------------------------------*/
2863
2864 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2865 {
2866     a = float32_squash_input_denormal(a STATUS_VAR);
2867     b = float32_squash_input_denormal(b STATUS_VAR);
2868
2869     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2870          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2871        ) {
2872         float_raise( float_flag_invalid STATUS_VAR);
2873         return 1;
2874     }
2875     return 0;
2876 }
2877
2878 /*----------------------------------------------------------------------------
2879 | Returns 1 if the single-precision floating-point value `a' is equal to
2880 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2881 | exception.  The comparison is performed according to the IEC/IEEE Standard
2882 | for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2884
2885 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2886 {
2887     a = float32_squash_input_denormal(a STATUS_VAR);
2888     b = float32_squash_input_denormal(b STATUS_VAR);
2889
2890     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2891          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2892        ) {
2893         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2894             float_raise( float_flag_invalid STATUS_VAR);
2895         }
2896         return 0;
2897     }
2898     return ( float32_val(a) == float32_val(b) ) ||
2899             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2900 }
2901
2902 /*----------------------------------------------------------------------------
2903 | Returns 1 if the single-precision floating-point value `a' is less than or
2904 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2905 | cause an exception.  Otherwise, the comparison is performed according to the
2906 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2907 *----------------------------------------------------------------------------*/
2908
2909 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2910 {
2911     flag aSign, bSign;
2912     uint32_t av, bv;
2913     a = float32_squash_input_denormal(a STATUS_VAR);
2914     b = float32_squash_input_denormal(b STATUS_VAR);
2915
2916     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2917          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2918        ) {
2919         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2920             float_raise( float_flag_invalid STATUS_VAR);
2921         }
2922         return 0;
2923     }
2924     aSign = extractFloat32Sign( a );
2925     bSign = extractFloat32Sign( b );
2926     av = float32_val(a);
2927     bv = float32_val(b);
2928     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2929     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2930
2931 }
2932
2933 /*----------------------------------------------------------------------------
2934 | Returns 1 if the single-precision floating-point value `a' is less than
2935 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2936 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2937 | Standard for Binary Floating-Point Arithmetic.
2938 *----------------------------------------------------------------------------*/
2939
2940 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2941 {
2942     flag aSign, bSign;
2943     uint32_t av, bv;
2944     a = float32_squash_input_denormal(a STATUS_VAR);
2945     b = float32_squash_input_denormal(b STATUS_VAR);
2946
2947     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2948          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2949        ) {
2950         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2951             float_raise( float_flag_invalid STATUS_VAR);
2952         }
2953         return 0;
2954     }
2955     aSign = extractFloat32Sign( a );
2956     bSign = extractFloat32Sign( b );
2957     av = float32_val(a);
2958     bv = float32_val(b);
2959     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2960     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2961
2962 }
2963
2964 /*----------------------------------------------------------------------------
2965 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2966 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2967 | comparison is performed according to the IEC/IEEE Standard for Binary
2968 | Floating-Point Arithmetic.
2969 *----------------------------------------------------------------------------*/
2970
2971 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2972 {
2973     a = float32_squash_input_denormal(a STATUS_VAR);
2974     b = float32_squash_input_denormal(b STATUS_VAR);
2975
2976     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2977          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2978        ) {
2979         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2980             float_raise( float_flag_invalid STATUS_VAR);
2981         }
2982         return 1;
2983     }
2984     return 0;
2985 }
2986
2987 /*----------------------------------------------------------------------------
2988 | Returns the result of converting the double-precision floating-point value
2989 | `a' to the 32-bit two's complement integer format.  The conversion is
2990 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2991 | Arithmetic---which means in particular that the conversion is rounded
2992 | according to the current rounding mode.  If `a' is a NaN, the largest
2993 | positive integer is returned.  Otherwise, if the conversion overflows, the
2994 | largest integer with the same sign as `a' is returned.
2995 *----------------------------------------------------------------------------*/
2996
2997 int32 float64_to_int32( float64 a STATUS_PARAM )
2998 {
2999     flag aSign;
3000     int_fast16_t aExp, shiftCount;
3001     uint64_t aSig;
3002     a = float64_squash_input_denormal(a STATUS_VAR);
3003
3004     aSig = extractFloat64Frac( a );
3005     aExp = extractFloat64Exp( a );
3006     aSign = extractFloat64Sign( a );
3007     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3008     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3009     shiftCount = 0x42C - aExp;
3010     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3011     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3012
3013 }
3014
3015 /*----------------------------------------------------------------------------
3016 | Returns the result of converting the double-precision floating-point value
3017 | `a' to the 32-bit two's complement integer format.  The conversion is
3018 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3019 | Arithmetic, except that the conversion is always rounded toward zero.
3020 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3021 | the conversion overflows, the largest integer with the same sign as `a' is
3022 | returned.
3023 *----------------------------------------------------------------------------*/
3024
3025 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
3026 {
3027     flag aSign;
3028     int_fast16_t aExp, shiftCount;
3029     uint64_t aSig, savedASig;
3030     int32_t z;
3031     a = float64_squash_input_denormal(a STATUS_VAR);
3032
3033     aSig = extractFloat64Frac( a );
3034     aExp = extractFloat64Exp( a );
3035     aSign = extractFloat64Sign( a );
3036     if ( 0x41E < aExp ) {
3037         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3038         goto invalid;
3039     }
3040     else if ( aExp < 0x3FF ) {
3041         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3042         return 0;
3043     }
3044     aSig |= LIT64( 0x0010000000000000 );
3045     shiftCount = 0x433 - aExp;
3046     savedASig = aSig;
3047     aSig >>= shiftCount;
3048     z = aSig;
3049     if ( aSign ) z = - z;
3050     if ( ( z < 0 ) ^ aSign ) {
3051  invalid:
3052         float_raise( float_flag_invalid STATUS_VAR);
3053         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3054     }
3055     if ( ( aSig<<shiftCount ) != savedASig ) {
3056         STATUS(float_exception_flags) |= float_flag_inexact;
3057     }
3058     return z;
3059
3060 }
3061
3062 /*----------------------------------------------------------------------------
3063 | Returns the result of converting the double-precision floating-point value
3064 | `a' to the 16-bit two's complement integer format.  The conversion is
3065 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3066 | Arithmetic, except that the conversion is always rounded toward zero.
3067 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3068 | the conversion overflows, the largest integer with the same sign as `a' is
3069 | returned.
3070 *----------------------------------------------------------------------------*/
3071
3072 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
3073 {
3074     flag aSign;
3075     int_fast16_t aExp, shiftCount;
3076     uint64_t aSig, savedASig;
3077     int32 z;
3078
3079     aSig = extractFloat64Frac( a );
3080     aExp = extractFloat64Exp( a );
3081     aSign = extractFloat64Sign( a );
3082     if ( 0x40E < aExp ) {
3083         if ( ( aExp == 0x7FF ) && aSig ) {
3084             aSign = 0;
3085         }
3086         goto invalid;
3087     }
3088     else if ( aExp < 0x3FF ) {
3089         if ( aExp || aSig ) {
3090             STATUS(float_exception_flags) |= float_flag_inexact;
3091         }
3092         return 0;
3093     }
3094     aSig |= LIT64( 0x0010000000000000 );
3095     shiftCount = 0x433 - aExp;
3096     savedASig = aSig;
3097     aSig >>= shiftCount;
3098     z = aSig;
3099     if ( aSign ) {
3100         z = - z;
3101     }
3102     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3103  invalid:
3104         float_raise( float_flag_invalid STATUS_VAR);
3105         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3106     }
3107     if ( ( aSig<<shiftCount ) != savedASig ) {
3108         STATUS(float_exception_flags) |= float_flag_inexact;
3109     }
3110     return z;
3111 }
3112
3113 /*----------------------------------------------------------------------------
3114 | Returns the result of converting the double-precision floating-point value
3115 | `a' to the 64-bit two's complement integer format.  The conversion is
3116 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3117 | Arithmetic---which means in particular that the conversion is rounded
3118 | according to the current rounding mode.  If `a' is a NaN, the largest
3119 | positive integer is returned.  Otherwise, if the conversion overflows, the
3120 | largest integer with the same sign as `a' is returned.
3121 *----------------------------------------------------------------------------*/
3122
3123 int64 float64_to_int64( float64 a STATUS_PARAM )
3124 {
3125     flag aSign;
3126     int_fast16_t aExp, shiftCount;
3127     uint64_t aSig, aSigExtra;
3128     a = float64_squash_input_denormal(a STATUS_VAR);
3129
3130     aSig = extractFloat64Frac( a );
3131     aExp = extractFloat64Exp( a );
3132     aSign = extractFloat64Sign( a );
3133     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3134     shiftCount = 0x433 - aExp;
3135     if ( shiftCount <= 0 ) {
3136         if ( 0x43E < aExp ) {
3137             float_raise( float_flag_invalid STATUS_VAR);
3138             if (    ! aSign
3139                  || (    ( aExp == 0x7FF )
3140                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3141                ) {
3142                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3143             }
3144             return (int64_t) LIT64( 0x8000000000000000 );
3145         }
3146         aSigExtra = 0;
3147         aSig <<= - shiftCount;
3148     }
3149     else {
3150         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3151     }
3152     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3153
3154 }
3155
3156 /*----------------------------------------------------------------------------
3157 | Returns the result of converting the double-precision floating-point value
3158 | `a' to the 64-bit two's complement integer format.  The conversion is
3159 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3160 | Arithmetic, except that the conversion is always rounded toward zero.
3161 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3162 | the conversion overflows, the largest integer with the same sign as `a' is
3163 | returned.
3164 *----------------------------------------------------------------------------*/
3165
3166 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3167 {
3168     flag aSign;
3169     int_fast16_t aExp, shiftCount;
3170     uint64_t aSig;
3171     int64 z;
3172     a = float64_squash_input_denormal(a STATUS_VAR);
3173
3174     aSig = extractFloat64Frac( a );
3175     aExp = extractFloat64Exp( a );
3176     aSign = extractFloat64Sign( a );
3177     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3178     shiftCount = aExp - 0x433;
3179     if ( 0 <= shiftCount ) {
3180         if ( 0x43E <= aExp ) {
3181             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3182                 float_raise( float_flag_invalid STATUS_VAR);
3183                 if (    ! aSign
3184                      || (    ( aExp == 0x7FF )
3185                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3186                    ) {
3187                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3188                 }
3189             }
3190             return (int64_t) LIT64( 0x8000000000000000 );
3191         }
3192         z = aSig<<shiftCount;
3193     }
3194     else {
3195         if ( aExp < 0x3FE ) {
3196             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3197             return 0;
3198         }
3199         z = aSig>>( - shiftCount );
3200         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3201             STATUS(float_exception_flags) |= float_flag_inexact;
3202         }
3203     }
3204     if ( aSign ) z = - z;
3205     return z;
3206
3207 }
3208
3209 /*----------------------------------------------------------------------------
3210 | Returns the result of converting the double-precision floating-point value
3211 | `a' to the single-precision floating-point format.  The conversion is
3212 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3213 | Arithmetic.
3214 *----------------------------------------------------------------------------*/
3215
3216 float32 float64_to_float32( float64 a STATUS_PARAM )
3217 {
3218     flag aSign;
3219     int_fast16_t aExp;
3220     uint64_t aSig;
3221     uint32_t zSig;
3222     a = float64_squash_input_denormal(a STATUS_VAR);
3223
3224     aSig = extractFloat64Frac( a );
3225     aExp = extractFloat64Exp( a );
3226     aSign = extractFloat64Sign( a );
3227     if ( aExp == 0x7FF ) {
3228         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3229         return packFloat32( aSign, 0xFF, 0 );
3230     }
3231     shift64RightJamming( aSig, 22, &aSig );
3232     zSig = aSig;
3233     if ( aExp || zSig ) {
3234         zSig |= 0x40000000;
3235         aExp -= 0x381;
3236     }
3237     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3238
3239 }
3240
3241
3242 /*----------------------------------------------------------------------------
3243 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3244 | half-precision floating-point value, returning the result.  After being
3245 | shifted into the proper positions, the three fields are simply added
3246 | together to form the result.  This means that any integer portion of `zSig'
3247 | will be added into the exponent.  Since a properly normalized significand
3248 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3249 | than the desired result exponent whenever `zSig' is a complete, normalized
3250 | significand.
3251 *----------------------------------------------------------------------------*/
3252 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3253 {
3254     return make_float16(
3255         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3256 }
3257
3258 /*----------------------------------------------------------------------------
3259 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3260 | and significand `zSig', and returns the proper half-precision floating-
3261 | point value corresponding to the abstract input.  Ordinarily, the abstract
3262 | value is simply rounded and packed into the half-precision format, with
3263 | the inexact exception raised if the abstract input cannot be represented
3264 | exactly.  However, if the abstract value is too large, the overflow and
3265 | inexact exceptions are raised and an infinity or maximal finite value is
3266 | returned.  If the abstract value is too small, the input value is rounded to
3267 | a subnormal number, and the underflow and inexact exceptions are raised if
3268 | the abstract input cannot be represented exactly as a subnormal half-
3269 | precision floating-point number.
3270 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3271 | ARM-style "alternative representation", which omits the NaN and Inf
3272 | encodings in order to raise the maximum representable exponent by one.
3273 |     The input significand `zSig' has its binary point between bits 22
3274 | and 23, which is 13 bits to the left of the usual location.  This shifted
3275 | significand must be normalized or smaller.  If `zSig' is not normalized,
3276 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3277 | and it must not require rounding.  In the usual case that `zSig' is
3278 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3279 | Note the slightly odd position of the binary point in zSig compared with the
3280 | other roundAndPackFloat functions. This should probably be fixed if we
3281 | need to implement more float16 routines than just conversion.
3282 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3283 | Binary Floating-Point Arithmetic.
3284 *----------------------------------------------------------------------------*/
3285
3286 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3287                                    uint32_t zSig, flag ieee STATUS_PARAM)
3288 {
3289     int maxexp = ieee ? 29 : 30;
3290     uint32_t mask;
3291     uint32_t increment;
3292     bool rounding_bumps_exp;
3293     bool is_tiny = false;
3294
3295     /* Calculate the mask of bits of the mantissa which are not
3296      * representable in half-precision and will be lost.
3297      */
3298     if (zExp < 1) {
3299         /* Will be denormal in halfprec */
3300         mask = 0x00ffffff;
3301         if (zExp >= -11) {
3302             mask >>= 11 + zExp;
3303         }
3304     } else {
3305         /* Normal number in halfprec */
3306         mask = 0x00001fff;
3307     }
3308
3309     switch (STATUS(float_rounding_mode)) {
3310     case float_round_nearest_even:
3311         increment = (mask + 1) >> 1;
3312         if ((zSig & mask) == increment) {
3313             increment = zSig & (increment << 1);
3314         }
3315         break;
3316     case float_round_ties_away:
3317         increment = (mask + 1) >> 1;
3318         break;
3319     case float_round_up:
3320         increment = zSign ? 0 : mask;
3321         break;
3322     case float_round_down:
3323         increment = zSign ? mask : 0;
3324         break;
3325     default: /* round_to_zero */
3326         increment = 0;
3327         break;
3328     }
3329
3330     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3331
3332     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3333         if (ieee) {
3334             float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3335             return packFloat16(zSign, 0x1f, 0);
3336         } else {
3337             float_raise(float_flag_invalid STATUS_VAR);
3338             return packFloat16(zSign, 0x1f, 0x3ff);
3339         }
3340     }
3341
3342     if (zExp < 0) {
3343         /* Note that flush-to-zero does not affect half-precision results */
3344         is_tiny =
3345             (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3346             || (zExp < -1)
3347             || (!rounding_bumps_exp);
3348     }
3349     if (zSig & mask) {
3350         float_raise(float_flag_inexact STATUS_VAR);
3351         if (is_tiny) {
3352             float_raise(float_flag_underflow STATUS_VAR);
3353         }
3354     }
3355
3356     zSig += increment;
3357     if (rounding_bumps_exp) {
3358         zSig >>= 1;
3359         zExp++;
3360     }
3361
3362     if (zExp < -10) {
3363         return packFloat16(zSign, 0, 0);
3364     }
3365     if (zExp < 0) {
3366         zSig >>= -zExp;
3367         zExp = 0;
3368     }
3369     return packFloat16(zSign, zExp, zSig >> 13);
3370 }
3371
3372 static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
3373                                       uint32_t *zSigPtr)
3374 {
3375     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3376     *zSigPtr = aSig << shiftCount;
3377     *zExpPtr = 1 - shiftCount;
3378 }
3379
3380 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3381    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3382
3383 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3384 {
3385     flag aSign;
3386     int_fast16_t aExp;
3387     uint32_t aSig;
3388
3389     aSign = extractFloat16Sign(a);
3390     aExp = extractFloat16Exp(a);
3391     aSig = extractFloat16Frac(a);
3392
3393     if (aExp == 0x1f && ieee) {
3394         if (aSig) {
3395             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3396         }
3397         return packFloat32(aSign, 0xff, 0);
3398     }
3399     if (aExp == 0) {
3400         if (aSig == 0) {
3401             return packFloat32(aSign, 0, 0);
3402         }
3403
3404         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3405         aExp--;
3406     }
3407     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3408 }
3409
3410 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3411 {
3412     flag aSign;
3413     int_fast16_t aExp;
3414     uint32_t aSig;
3415
3416     a = float32_squash_input_denormal(a STATUS_VAR);
3417
3418     aSig = extractFloat32Frac( a );
3419     aExp = extractFloat32Exp( a );
3420     aSign = extractFloat32Sign( a );
3421     if ( aExp == 0xFF ) {
3422         if (aSig) {
3423             /* Input is a NaN */
3424             if (!ieee) {
3425                 float_raise(float_flag_invalid STATUS_VAR);
3426                 return packFloat16(aSign, 0, 0);
3427             }
3428             return commonNaNToFloat16(
3429                 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3430         }
3431         /* Infinity */
3432         if (!ieee) {
3433             float_raise(float_flag_invalid STATUS_VAR);
3434             return packFloat16(aSign, 0x1f, 0x3ff);
3435         }
3436         return packFloat16(aSign, 0x1f, 0);
3437     }
3438     if (aExp == 0 && aSig == 0) {
3439         return packFloat16(aSign, 0, 0);
3440     }
3441     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3442      * even if the input is denormal; however this is harmless because
3443      * the largest possible single-precision denormal is still smaller
3444      * than the smallest representable half-precision denormal, and so we
3445      * will end up ignoring aSig and returning via the "always return zero"
3446      * codepath.
3447      */
3448     aSig |= 0x00800000;
3449     aExp -= 0x71;
3450
3451     return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
3452 }
3453
3454 float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM)
3455 {
3456     flag aSign;
3457     int_fast16_t aExp;
3458     uint32_t aSig;
3459
3460     aSign = extractFloat16Sign(a);
3461     aExp = extractFloat16Exp(a);
3462     aSig = extractFloat16Frac(a);
3463
3464     if (aExp == 0x1f && ieee) {
3465         if (aSig) {
3466             return commonNaNToFloat64(
3467                 float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3468         }
3469         return packFloat64(aSign, 0x7ff, 0);
3470     }
3471     if (aExp == 0) {
3472         if (aSig == 0) {
3473             return packFloat64(aSign, 0, 0);
3474         }
3475
3476         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3477         aExp--;
3478     }
3479     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3480 }
3481
3482 float16 float64_to_float16(float64 a, flag ieee STATUS_PARAM)
3483 {
3484     flag aSign;
3485     int_fast16_t aExp;
3486     uint64_t aSig;
3487     uint32_t zSig;
3488
3489     a = float64_squash_input_denormal(a STATUS_VAR);
3490
3491     aSig = extractFloat64Frac(a);
3492     aExp = extractFloat64Exp(a);
3493     aSign = extractFloat64Sign(a);
3494     if (aExp == 0x7FF) {
3495         if (aSig) {
3496             /* Input is a NaN */
3497             if (!ieee) {
3498                 float_raise(float_flag_invalid STATUS_VAR);
3499                 return packFloat16(aSign, 0, 0);
3500             }
3501             return commonNaNToFloat16(
3502                 float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3503         }
3504         /* Infinity */
3505         if (!ieee) {
3506             float_raise(float_flag_invalid STATUS_VAR);
3507             return packFloat16(aSign, 0x1f, 0x3ff);
3508         }
3509         return packFloat16(aSign, 0x1f, 0);
3510     }
3511     shift64RightJamming(aSig, 29, &aSig);
3512     zSig = aSig;
3513     if (aExp == 0 && zSig == 0) {
3514         return packFloat16(aSign, 0, 0);
3515     }
3516     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3517      * even if the input is denormal; however this is harmless because
3518      * the largest possible single-precision denormal is still smaller
3519      * than the smallest representable half-precision denormal, and so we
3520      * will end up ignoring aSig and returning via the "always return zero"
3521      * codepath.
3522      */
3523     zSig |= 0x00800000;
3524     aExp -= 0x3F1;
3525
3526     return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR);
3527 }
3528
3529 /*----------------------------------------------------------------------------
3530 | Returns the result of converting the double-precision floating-point value
3531 | `a' to the extended double-precision floating-point format.  The conversion
3532 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3533 | Arithmetic.
3534 *----------------------------------------------------------------------------*/
3535
3536 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3537 {
3538     flag aSign;
3539     int_fast16_t aExp;
3540     uint64_t aSig;
3541
3542     a = float64_squash_input_denormal(a STATUS_VAR);
3543     aSig = extractFloat64Frac( a );
3544     aExp = extractFloat64Exp( a );
3545     aSign = extractFloat64Sign( a );
3546     if ( aExp == 0x7FF ) {
3547         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3548         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3549     }
3550     if ( aExp == 0 ) {
3551         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3552         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3553     }
3554     return
3555         packFloatx80(
3556             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3557
3558 }
3559
3560 /*----------------------------------------------------------------------------
3561 | Returns the result of converting the double-precision floating-point value
3562 | `a' to the quadruple-precision floating-point format.  The conversion is
3563 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3564 | Arithmetic.
3565 *----------------------------------------------------------------------------*/
3566
3567 float128 float64_to_float128( float64 a STATUS_PARAM )
3568 {
3569     flag aSign;
3570     int_fast16_t aExp;
3571     uint64_t aSig, zSig0, zSig1;
3572
3573     a = float64_squash_input_denormal(a STATUS_VAR);
3574     aSig = extractFloat64Frac( a );
3575     aExp = extractFloat64Exp( a );
3576     aSign = extractFloat64Sign( a );
3577     if ( aExp == 0x7FF ) {
3578         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3579         return packFloat128( aSign, 0x7FFF, 0, 0 );
3580     }
3581     if ( aExp == 0 ) {
3582         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3583         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3584         --aExp;
3585     }
3586     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3587     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3588
3589 }
3590
3591 /*----------------------------------------------------------------------------
3592 | Rounds the double-precision floating-point value `a' to an integer, and
3593 | returns the result as a double-precision floating-point value.  The
3594 | operation is performed according to the IEC/IEEE Standard for Binary
3595 | Floating-Point Arithmetic.
3596 *----------------------------------------------------------------------------*/
3597
3598 float64 float64_round_to_int( float64 a STATUS_PARAM )
3599 {
3600     flag aSign;
3601     int_fast16_t aExp;
3602     uint64_t lastBitMask, roundBitsMask;
3603     uint64_t z;
3604     a = float64_squash_input_denormal(a STATUS_VAR);
3605
3606     aExp = extractFloat64Exp( a );
3607     if ( 0x433 <= aExp ) {
3608         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3609             return propagateFloat64NaN( a, a STATUS_VAR );
3610         }
3611         return a;
3612     }
3613     if ( aExp < 0x3FF ) {
3614         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3615         STATUS(float_exception_flags) |= float_flag_inexact;
3616         aSign = extractFloat64Sign( a );
3617         switch ( STATUS(float_rounding_mode) ) {
3618          case float_round_nearest_even:
3619             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3620                 return packFloat64( aSign, 0x3FF, 0 );
3621             }
3622             break;
3623         case float_round_ties_away:
3624             if (aExp == 0x3FE) {
3625                 return packFloat64(aSign, 0x3ff, 0);
3626             }
3627             break;
3628          case float_round_down:
3629             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3630          case float_round_up:
3631             return make_float64(
3632             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3633         }
3634         return packFloat64( aSign, 0, 0 );
3635     }
3636     lastBitMask = 1;
3637     lastBitMask <<= 0x433 - aExp;
3638     roundBitsMask = lastBitMask - 1;
3639     z = float64_val(a);
3640     switch (STATUS(float_rounding_mode)) {
3641     case float_round_nearest_even:
3642         z += lastBitMask >> 1;
3643         if ((z & roundBitsMask) == 0) {
3644             z &= ~lastBitMask;
3645         }
3646         break;
3647     case float_round_ties_away:
3648         z += lastBitMask >> 1;
3649         break;
3650     case float_round_to_zero:
3651         break;
3652     case float_round_up:
3653         if (!extractFloat64Sign(make_float64(z))) {
3654             z += roundBitsMask;
3655         }
3656         break;
3657     case float_round_down:
3658         if (extractFloat64Sign(make_float64(z))) {
3659             z += roundBitsMask;
3660         }
3661         break;
3662     default:
3663         abort();
3664     }
3665     z &= ~ roundBitsMask;
3666     if ( z != float64_val(a) )
3667         STATUS(float_exception_flags) |= float_flag_inexact;
3668     return make_float64(z);
3669
3670 }
3671
3672 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3673 {
3674     int oldmode;
3675     float64 res;
3676     oldmode = STATUS(float_rounding_mode);
3677     STATUS(float_rounding_mode) = float_round_to_zero;
3678     res = float64_round_to_int(a STATUS_VAR);
3679     STATUS(float_rounding_mode) = oldmode;
3680     return res;
3681 }
3682
3683 /*----------------------------------------------------------------------------
3684 | Returns the result of adding the absolute values of the double-precision
3685 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3686 | before being returned.  `zSign' is ignored if the result is a NaN.
3687 | The addition is performed according to the IEC/IEEE Standard for Binary
3688 | Floating-Point Arithmetic.
3689 *----------------------------------------------------------------------------*/
3690
3691 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3692 {
3693     int_fast16_t aExp, bExp, zExp;
3694     uint64_t aSig, bSig, zSig;
3695     int_fast16_t expDiff;
3696
3697     aSig = extractFloat64Frac( a );
3698     aExp = extractFloat64Exp( a );
3699     bSig = extractFloat64Frac( b );
3700     bExp = extractFloat64Exp( b );
3701     expDiff = aExp - bExp;
3702     aSig <<= 9;
3703     bSig <<= 9;
3704     if ( 0 < expDiff ) {
3705         if ( aExp == 0x7FF ) {
3706             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3707             return a;
3708         }
3709         if ( bExp == 0 ) {
3710             --expDiff;
3711         }
3712         else {
3713             bSig |= LIT64( 0x2000000000000000 );
3714         }
3715         shift64RightJamming( bSig, expDiff, &bSig );
3716         zExp = aExp;
3717     }
3718     else if ( expDiff < 0 ) {
3719         if ( bExp == 0x7FF ) {
3720             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3721             return packFloat64( zSign, 0x7FF, 0 );
3722         }
3723         if ( aExp == 0 ) {
3724             ++expDiff;
3725         }
3726         else {
3727             aSig |= LIT64( 0x2000000000000000 );
3728         }
3729         shift64RightJamming( aSig, - expDiff, &aSig );
3730         zExp = bExp;
3731     }
3732     else {
3733         if ( aExp == 0x7FF ) {
3734             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3735             return a;
3736         }
3737         if ( aExp == 0 ) {
3738             if (STATUS(flush_to_zero)) {
3739                 if (aSig | bSig) {
3740                     float_raise(float_flag_output_denormal STATUS_VAR);
3741                 }
3742                 return packFloat64(zSign, 0, 0);
3743             }
3744             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3745         }
3746         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3747         zExp = aExp;
3748         goto roundAndPack;
3749     }
3750     aSig |= LIT64( 0x2000000000000000 );
3751     zSig = ( aSig + bSig )<<1;
3752     --zExp;
3753     if ( (int64_t) zSig < 0 ) {
3754         zSig = aSig + bSig;
3755         ++zExp;
3756     }
3757  roundAndPack:
3758     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3759
3760 }
3761
3762 /*----------------------------------------------------------------------------
3763 | Returns the result of subtracting the absolute values of the double-
3764 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3765 | difference is negated before being returned.  `zSign' is ignored if the
3766 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3767 | Standard for Binary Floating-Point Arithmetic.
3768 *----------------------------------------------------------------------------*/
3769
3770 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3771 {
3772     int_fast16_t aExp, bExp, zExp;
3773     uint64_t aSig, bSig, zSig;
3774     int_fast16_t expDiff;
3775
3776     aSig = extractFloat64Frac( a );
3777     aExp = extractFloat64Exp( a );
3778     bSig = extractFloat64Frac( b );
3779     bExp = extractFloat64Exp( b );
3780     expDiff = aExp - bExp;
3781     aSig <<= 10;
3782     bSig <<= 10;
3783     if ( 0 < expDiff ) goto aExpBigger;
3784     if ( expDiff < 0 ) goto bExpBigger;
3785     if ( aExp == 0x7FF ) {
3786         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3787         float_raise( float_flag_invalid STATUS_VAR);
3788         return float64_default_nan;
3789     }
3790     if ( aExp == 0 ) {
3791         aExp = 1;
3792         bExp = 1;
3793     }
3794     if ( bSig < aSig ) goto aBigger;
3795     if ( aSig < bSig ) goto bBigger;
3796     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3797  bExpBigger:
3798     if ( bExp == 0x7FF ) {
3799         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3800         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3801     }
3802     if ( aExp == 0 ) {
3803         ++expDiff;
3804     }
3805     else {
3806         aSig |= LIT64( 0x4000000000000000 );
3807     }
3808     shift64RightJamming( aSig, - expDiff, &aSig );
3809     bSig |= LIT64( 0x4000000000000000 );
3810  bBigger:
3811     zSig = bSig - aSig;
3812     zExp = bExp;
3813     zSign ^= 1;
3814     goto normalizeRoundAndPack;
3815  aExpBigger:
3816     if ( aExp == 0x7FF ) {
3817         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3818         return a;
3819     }
3820     if ( bExp == 0 ) {
3821         --expDiff;
3822     }
3823     else {
3824         bSig |= LIT64( 0x4000000000000000 );
3825     }
3826     shift64RightJamming( bSig, expDiff, &bSig );
3827     aSig |= LIT64( 0x4000000000000000 );
3828  aBigger:
3829     zSig = aSig - bSig;
3830     zExp = aExp;
3831  normalizeRoundAndPack:
3832     --zExp;
3833     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3834
3835 }
3836
3837 /*----------------------------------------------------------------------------
3838 | Returns the result of adding the double-precision floating-point values `a'
3839 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3840 | Binary Floating-Point Arithmetic.
3841 *----------------------------------------------------------------------------*/
3842
3843 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3844 {
3845     flag aSign, bSign;
3846     a = float64_squash_input_denormal(a STATUS_VAR);
3847     b = float64_squash_input_denormal(b STATUS_VAR);
3848
3849     aSign = extractFloat64Sign( a );
3850     bSign = extractFloat64Sign( b );
3851     if ( aSign == bSign ) {
3852         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3853     }
3854     else {
3855         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3856     }
3857
3858 }
3859
3860 /*----------------------------------------------------------------------------
3861 | Returns the result of subtracting the double-precision floating-point values
3862 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3863 | for Binary Floating-Point Arithmetic.
3864 *----------------------------------------------------------------------------*/
3865
3866 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3867 {
3868     flag aSign, bSign;
3869     a = float64_squash_input_denormal(a STATUS_VAR);
3870     b = float64_squash_input_denormal(b STATUS_VAR);
3871
3872     aSign = extractFloat64Sign( a );
3873     bSign = extractFloat64Sign( b );
3874     if ( aSign == bSign ) {
3875         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3876     }
3877     else {
3878         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3879     }
3880
3881 }
3882
3883 /*----------------------------------------------------------------------------
3884 | Returns the result of multiplying the double-precision floating-point values
3885 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3886 | for Binary Floating-Point Arithmetic.
3887 *----------------------------------------------------------------------------*/
3888
3889 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3890 {
3891     flag aSign, bSign, zSign;
3892     int_fast16_t aExp, bExp, zExp;
3893     uint64_t aSig, bSig, zSig0, zSig1;
3894
3895     a = float64_squash_input_denormal(a STATUS_VAR);
3896     b = float64_squash_input_denormal(b STATUS_VAR);
3897
3898     aSig = extractFloat64Frac( a );
3899     aExp = extractFloat64Exp( a );
3900     aSign = extractFloat64Sign( a );
3901     bSig = extractFloat64Frac( b );
3902     bExp = extractFloat64Exp( b );
3903     bSign = extractFloat64Sign( b );
3904     zSign = aSign ^ bSign;
3905     if ( aExp == 0x7FF ) {
3906         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3907             return propagateFloat64NaN( a, b STATUS_VAR );
3908         }
3909         if ( ( bExp | bSig ) == 0 ) {
3910             float_raise( float_flag_invalid STATUS_VAR);
3911             return float64_default_nan;
3912         }
3913         return packFloat64( zSign, 0x7FF, 0 );
3914     }
3915     if ( bExp == 0x7FF ) {
3916         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3917         if ( ( aExp | aSig ) == 0 ) {
3918             float_raise( float_flag_invalid STATUS_VAR);
3919             return float64_default_nan;
3920         }
3921         return packFloat64( zSign, 0x7FF, 0 );
3922     }
3923     if ( aExp == 0 ) {
3924         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3925         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3926     }
3927     if ( bExp == 0 ) {
3928         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3929         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3930     }
3931     zExp = aExp + bExp - 0x3FF;
3932     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3933     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3934     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3935     zSig0 |= ( zSig1 != 0 );
3936     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3937         zSig0 <<= 1;
3938         --zExp;
3939     }
3940     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3941
3942 }
3943
3944 /*----------------------------------------------------------------------------
3945 | Returns the result of dividing the double-precision floating-point value `a'
3946 | by the corresponding value `b'.  The operation is performed according to
3947 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3948 *----------------------------------------------------------------------------*/
3949
3950 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3951 {
3952     flag aSign, bSign, zSign;
3953     int_fast16_t aExp, bExp, zExp;
3954     uint64_t aSig, bSig, zSig;
3955     uint64_t rem0, rem1;
3956     uint64_t term0, term1;
3957     a = float64_squash_input_denormal(a STATUS_VAR);
3958     b = float64_squash_input_denormal(b STATUS_VAR);
3959
3960     aSig = extractFloat64Frac( a );
3961     aExp = extractFloat64Exp( a );
3962     aSign = extractFloat64Sign( a );
3963     bSig = extractFloat64Frac( b );
3964     bExp = extractFloat64Exp( b );
3965     bSign = extractFloat64Sign( b );
3966     zSign = aSign ^ bSign;
3967     if ( aExp == 0x7FF ) {
3968         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3969         if ( bExp == 0x7FF ) {
3970             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3971             float_raise( float_flag_invalid STATUS_VAR);
3972             return float64_default_nan;
3973         }
3974         return packFloat64( zSign, 0x7FF, 0 );
3975     }
3976     if ( bExp == 0x7FF ) {
3977         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3978         return packFloat64( zSign, 0, 0 );
3979     }
3980     if ( bExp == 0 ) {
3981         if ( bSig == 0 ) {
3982             if ( ( aExp | aSig ) == 0 ) {
3983                 float_raise( float_flag_invalid STATUS_VAR);
3984                 return float64_default_nan;
3985             }
3986             float_raise( float_flag_divbyzero STATUS_VAR);
3987             return packFloat64( zSign, 0x7FF, 0 );
3988         }
3989         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3990     }
3991     if ( aExp == 0 ) {
3992         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3993         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3994     }
3995     zExp = aExp - bExp + 0x3FD;
3996     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3997     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3998     if ( bSig <= ( aSig + aSig ) ) {
3999         aSig >>= 1;
4000         ++zExp;
4001     }
4002     zSig = estimateDiv128To64( aSig, 0, bSig );
4003     if ( ( zSig & 0x1FF ) <= 2 ) {
4004         mul64To128( bSig, zSig, &term0, &term1 );
4005         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4006         while ( (int64_t) rem0 < 0 ) {
4007             --zSig;
4008             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4009         }
4010         zSig |= ( rem1 != 0 );
4011     }
4012     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
4013
4014 }
4015
4016 /*----------------------------------------------------------------------------
4017 | Returns the remainder of the double-precision floating-point value `a'
4018 | with respect to the corresponding value `b'.  The operation is performed
4019 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4020 *----------------------------------------------------------------------------*/
4021
4022 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
4023 {
4024     flag aSign, zSign;
4025     int_fast16_t aExp, bExp, expDiff;
4026     uint64_t aSig, bSig;
4027     uint64_t q, alternateASig;
4028     int64_t sigMean;
4029
4030     a = float64_squash_input_denormal(a STATUS_VAR);
4031     b = float64_squash_input_denormal(b STATUS_VAR);
4032     aSig = extractFloat64Frac( a );
4033     aExp = extractFloat64Exp( a );
4034     aSign = extractFloat64Sign( a );
4035     bSig = extractFloat64Frac( b );
4036     bExp = extractFloat64Exp( b );
4037     if ( aExp == 0x7FF ) {
4038         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4039             return propagateFloat64NaN( a, b STATUS_VAR );
4040         }
4041         float_raise( float_flag_invalid STATUS_VAR);
4042         return float64_default_nan;
4043     }
4044     if ( bExp == 0x7FF ) {
4045         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
4046         return a;
4047     }
4048     if ( bExp == 0 ) {
4049         if ( bSig == 0 ) {
4050             float_raise( float_flag_invalid STATUS_VAR);
4051             return float64_default_nan;
4052         }
4053         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4054     }
4055     if ( aExp == 0 ) {
4056         if ( aSig == 0 ) return a;
4057         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4058     }
4059     expDiff = aExp - bExp;
4060     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4061     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4062     if ( expDiff < 0 ) {
4063         if ( expDiff < -1 ) return a;
4064         aSig >>= 1;
4065     }
4066     q = ( bSig <= aSig );
4067     if ( q ) aSig -= bSig;
4068     expDiff -= 64;
4069     while ( 0 < expDiff ) {
4070         q = estimateDiv128To64( aSig, 0, bSig );
4071         q = ( 2 < q ) ? q - 2 : 0;
4072         aSig = - ( ( bSig>>2 ) * q );
4073         expDiff -= 62;
4074     }
4075     expDiff += 64;
4076     if ( 0 < expDiff ) {
4077         q = estimateDiv128To64( aSig, 0, bSig );
4078         q = ( 2 < q ) ? q - 2 : 0;
4079         q >>= 64 - expDiff;
4080         bSig >>= 2;
4081         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4082     }
4083     else {
4084         aSig >>= 2;
4085         bSig >>= 2;
4086     }
4087     do {
4088         alternateASig = aSig;
4089         ++q;
4090         aSig -= bSig;
4091     } while ( 0 <= (int64_t) aSig );
4092     sigMean = aSig + alternateASig;
4093     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4094         aSig = alternateASig;
4095     }
4096     zSign = ( (int64_t) aSig < 0 );
4097     if ( zSign ) aSig = - aSig;
4098     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
4099
4100 }
4101
4102 /*----------------------------------------------------------------------------
4103 | Returns the result of multiplying the double-precision floating-point values
4104 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4105 | multiplication.  The operation is performed according to the IEC/IEEE
4106 | Standard for Binary Floating-Point Arithmetic 754-2008.
4107 | The flags argument allows the caller to select negation of the
4108 | addend, the intermediate product, or the final result. (The difference
4109 | between this and having the caller do a separate negation is that negating
4110 | externally will flip the sign bit on NaNs.)
4111 *----------------------------------------------------------------------------*/
4112
4113 float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
4114 {
4115     flag aSign, bSign, cSign, zSign;
4116     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
4117     uint64_t aSig, bSig, cSig;
4118     flag pInf, pZero, pSign;
4119     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4120     int shiftcount;
4121     flag signflip, infzero;
4122
4123     a = float64_squash_input_denormal(a STATUS_VAR);
4124     b = float64_squash_input_denormal(b STATUS_VAR);
4125     c = float64_squash_input_denormal(c STATUS_VAR);
4126     aSig = extractFloat64Frac(a);
4127     aExp = extractFloat64Exp(a);
4128     aSign = extractFloat64Sign(a);
4129     bSig = extractFloat64Frac(b);
4130     bExp = extractFloat64Exp(b);
4131     bSign = extractFloat64Sign(b);
4132     cSig = extractFloat64Frac(c);
4133     cExp = extractFloat64Exp(c);
4134     cSign = extractFloat64Sign(c);
4135
4136     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4137                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4138
4139     /* It is implementation-defined whether the cases of (0,inf,qnan)
4140      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4141      * they return if they do), so we have to hand this information
4142      * off to the target-specific pick-a-NaN routine.
4143      */
4144     if (((aExp == 0x7ff) && aSig) ||
4145         ((bExp == 0x7ff) && bSig) ||
4146         ((cExp == 0x7ff) && cSig)) {
4147         return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
4148     }
4149
4150     if (infzero) {
4151         float_raise(float_flag_invalid STATUS_VAR);
4152         return float64_default_nan;
4153     }
4154
4155     if (flags & float_muladd_negate_c) {
4156         cSign ^= 1;
4157     }
4158
4159     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4160
4161     /* Work out the sign and type of the product */
4162     pSign = aSign ^ bSign;
4163     if (flags & float_muladd_negate_product) {
4164         pSign ^= 1;
4165     }
4166     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4167     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4168
4169     if (cExp == 0x7ff) {
4170         if (pInf && (pSign ^ cSign)) {
4171             /* addition of opposite-signed infinities => InvalidOperation */
4172             float_raise(float_flag_invalid STATUS_VAR);
4173             return float64_default_nan;
4174         }
4175         /* Otherwise generate an infinity of the same sign */
4176         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4177     }
4178
4179     if (pInf) {
4180         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4181     }
4182
4183     if (pZero) {
4184         if (cExp == 0) {
4185             if (cSig == 0) {
4186                 /* Adding two exact zeroes */
4187                 if (pSign == cSign) {
4188                     zSign = pSign;
4189                 } else if (STATUS(float_rounding_mode) == float_round_down) {
4190                     zSign = 1;
4191                 } else {
4192                     zSign = 0;
4193                 }
4194                 return packFloat64(zSign ^ signflip, 0, 0);
4195             }
4196             /* Exact zero plus a denorm */
4197             if (STATUS(flush_to_zero)) {
4198                 float_raise(float_flag_output_denormal STATUS_VAR);
4199                 return packFloat64(cSign ^ signflip, 0, 0);
4200             }
4201         }
4202         /* Zero plus something non-zero : just return the something */
4203         if (flags & float_muladd_halve_result) {
4204             if (cExp == 0) {
4205                 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4206             }
4207             /* Subtract one to halve, and one again because roundAndPackFloat64
4208              * wants one less than the true exponent.
4209              */
4210             cExp -= 2;
4211             cSig = (cSig | 0x0010000000000000ULL) << 10;
4212             return roundAndPackFloat64(cSign ^ signflip, cExp, cSig STATUS_VAR);
4213         }
4214         return packFloat64(cSign ^ signflip, cExp, cSig);
4215     }
4216
4217     if (aExp == 0) {
4218         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4219     }
4220     if (bExp == 0) {
4221         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4222     }
4223
4224     /* Calculate the actual result a * b + c */
4225
4226     /* Multiply first; this is easy. */
4227     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4228      * because we want the true exponent, not the "one-less-than"
4229      * flavour that roundAndPackFloat64() takes.
4230      */
4231     pExp = aExp + bExp - 0x3fe;
4232     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4233     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4234     mul64To128(aSig, bSig, &pSig0, &pSig1);
4235     if ((int64_t)(pSig0 << 1) >= 0) {
4236         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4237         pExp--;
4238     }
4239
4240     zSign = pSign ^ signflip;
4241
4242     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4243      * bit in position 126.
4244      */
4245     if (cExp == 0) {
4246         if (!cSig) {
4247             /* Throw out the special case of c being an exact zero now */
4248             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4249             if (flags & float_muladd_halve_result) {
4250                 pExp--;
4251             }
4252             return roundAndPackFloat64(zSign, pExp - 1,
4253                                        pSig1 STATUS_VAR);
4254         }
4255         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4256     }
4257
4258     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4259      * significand of the addend, with the explicit bit in position 126.
4260      */
4261     cSig0 = cSig << (126 - 64 - 52);
4262     cSig1 = 0;
4263     cSig0 |= LIT64(0x4000000000000000);
4264     expDiff = pExp - cExp;
4265
4266     if (pSign == cSign) {
4267         /* Addition */
4268         if (expDiff > 0) {
4269             /* scale c to match p */
4270             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4271             zExp = pExp;
4272         } else if (expDiff < 0) {
4273             /* scale p to match c */
4274             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4275             zExp = cExp;
4276         } else {
4277             /* no scaling needed */
4278             zExp = cExp;
4279         }
4280         /* Add significands and make sure explicit bit ends up in posn 126 */
4281         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4282         if ((int64_t)zSig0 < 0) {
4283             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4284         } else {
4285             zExp--;
4286         }
4287         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4288         if (flags & float_muladd_halve_result) {
4289             zExp--;
4290         }
4291         return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
4292     } else {
4293         /* Subtraction */
4294         if (expDiff > 0) {
4295             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4296             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4297             zExp = pExp;
4298         } else if (expDiff < 0) {
4299             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4300             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4301             zExp = cExp;
4302             zSign ^= 1;
4303         } else {
4304             zExp = pExp;
4305             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4306                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4307             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4308                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4309                 zSign ^= 1;
4310             } else {
4311                 /* Exact zero */
4312                 zSign = signflip;
4313                 if (STATUS(float_rounding_mode) == float_round_down) {
4314                     zSign ^= 1;
4315                 }
4316                 return packFloat64(zSign, 0, 0);
4317             }
4318         }
4319         --zExp;
4320         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4321          * starting with the significand in a pair of uint64_t.
4322          */
4323         if (zSig0) {
4324             shiftcount = countLeadingZeros64(zSig0) - 1;
4325             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4326             if (zSig1) {
4327                 zSig0 |= 1;
4328             }
4329             zExp -= shiftcount;
4330         } else {
4331             shiftcount = countLeadingZeros64(zSig1);
4332             if (shiftcount == 0) {
4333                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4334                 zExp -= 63;
4335             } else {
4336                 shiftcount--;
4337                 zSig0 = zSig1 << shiftcount;
4338                 zExp -= (shiftcount + 64);
4339             }
4340         }
4341         if (flags & float_muladd_halve_result) {
4342             zExp--;
4343         }
4344         return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4345     }
4346 }
4347
4348 /*----------------------------------------------------------------------------
4349 | Returns the square root of the double-precision floating-point value `a'.
4350 | The operation is performed according to the IEC/IEEE Standard for Binary
4351 | Floating-Point Arithmetic.
4352 *----------------------------------------------------------------------------*/
4353
4354 float64 float64_sqrt( float64 a STATUS_PARAM )
4355 {
4356     flag aSign;
4357     int_fast16_t aExp, zExp;
4358     uint64_t aSig, zSig, doubleZSig;
4359     uint64_t rem0, rem1, term0, term1;
4360     a = float64_squash_input_denormal(a STATUS_VAR);
4361
4362     aSig = extractFloat64Frac( a );
4363     aExp = extractFloat64Exp( a );
4364     aSign = extractFloat64Sign( a );
4365     if ( aExp == 0x7FF ) {
4366         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4367         if ( ! aSign ) return a;
4368         float_raise( float_flag_invalid STATUS_VAR);
4369         return float64_default_nan;
4370     }
4371     if ( aSign ) {
4372         if ( ( aExp | aSig ) == 0 ) return a;
4373         float_raise( float_flag_invalid STATUS_VAR);
4374         return float64_default_nan;
4375     }
4376     if ( aExp == 0 ) {
4377         if ( aSig == 0 ) return float64_zero;
4378         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4379     }
4380     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4381     aSig |= LIT64( 0x0010000000000000 );
4382     zSig = estimateSqrt32( aExp, aSig>>21 );
4383     aSig <<= 9 - ( aExp & 1 );
4384     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4385     if ( ( zSig & 0x1FF ) <= 5 ) {
4386         doubleZSig = zSig<<1;
4387         mul64To128( zSig, zSig, &term0, &term1 );
4388         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4389         while ( (int64_t) rem0 < 0 ) {
4390             --zSig;
4391             doubleZSig -= 2;
4392             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4393         }
4394         zSig |= ( ( rem0 | rem1 ) != 0 );
4395     }
4396     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4397
4398 }
4399
4400 /*----------------------------------------------------------------------------
4401 | Returns the binary log of the double-precision floating-point value `a'.
4402 | The operation is performed according to the IEC/IEEE Standard for Binary
4403 | Floating-Point Arithmetic.
4404 *----------------------------------------------------------------------------*/
4405 float64 float64_log2( float64 a STATUS_PARAM )
4406 {
4407     flag aSign, zSign;
4408     int_fast16_t aExp;
4409     uint64_t aSig, aSig0, aSig1, zSig, i;
4410     a = float64_squash_input_denormal(a STATUS_VAR);
4411
4412     aSig = extractFloat64Frac( a );
4413     aExp = extractFloat64Exp( a );
4414     aSign = extractFloat64Sign( a );
4415
4416     if ( aExp == 0 ) {
4417         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4418         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4419     }
4420     if ( aSign ) {
4421         float_raise( float_flag_invalid STATUS_VAR);
4422         return float64_default_nan;
4423     }
4424     if ( aExp == 0x7FF ) {
4425         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4426         return a;
4427     }
4428
4429     aExp -= 0x3FF;
4430     aSig |= LIT64( 0x0010000000000000 );
4431     zSign = aExp < 0;
4432     zSig = (uint64_t)aExp << 52;
4433     for (i = 1LL << 51; i > 0; i >>= 1) {
4434         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4435         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4436         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4437             aSig >>= 1;
4438             zSig |= i;
4439         }
4440     }
4441
4442     if ( zSign )
4443         zSig = -zSig;
4444     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4445 }
4446
4447 /*----------------------------------------------------------------------------
4448 | Returns 1 if the double-precision floating-point value `a' is equal to the
4449 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4450 | if either operand is a NaN.  Otherwise, the comparison is performed
4451 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4452 *----------------------------------------------------------------------------*/
4453
4454 int float64_eq( float64 a, float64 b STATUS_PARAM )
4455 {
4456     uint64_t av, bv;
4457     a = float64_squash_input_denormal(a STATUS_VAR);
4458     b = float64_squash_input_denormal(b STATUS_VAR);
4459
4460     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4461          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4462        ) {
4463         float_raise( float_flag_invalid STATUS_VAR);
4464         return 0;
4465     }
4466     av = float64_val(a);
4467     bv = float64_val(b);
4468     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4469
4470 }
4471
4472 /*----------------------------------------------------------------------------
4473 | Returns 1 if the double-precision floating-point value `a' is less than or
4474 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4475 | exception is raised if either operand is a NaN.  The comparison is performed
4476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4477 *----------------------------------------------------------------------------*/
4478
4479 int float64_le( float64 a, float64 b STATUS_PARAM )
4480 {
4481     flag aSign, bSign;
4482     uint64_t av, bv;
4483     a = float64_squash_input_denormal(a STATUS_VAR);
4484     b = float64_squash_input_denormal(b STATUS_VAR);
4485
4486     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4487          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4488        ) {
4489         float_raise( float_flag_invalid STATUS_VAR);
4490         return 0;
4491     }
4492     aSign = extractFloat64Sign( a );
4493     bSign = extractFloat64Sign( b );
4494     av = float64_val(a);
4495     bv = float64_val(b);
4496     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4497     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4498
4499 }
4500
4501 /*----------------------------------------------------------------------------
4502 | Returns 1 if the double-precision floating-point value `a' is less than
4503 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4504 | raised if either operand is a NaN.  The comparison is performed according
4505 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4506 *----------------------------------------------------------------------------*/
4507
4508 int float64_lt( float64 a, float64 b STATUS_PARAM )
4509 {
4510     flag aSign, bSign;
4511     uint64_t av, bv;
4512
4513     a = float64_squash_input_denormal(a STATUS_VAR);
4514     b = float64_squash_input_denormal(b STATUS_VAR);
4515     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4516          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4517        ) {
4518         float_raise( float_flag_invalid STATUS_VAR);
4519         return 0;
4520     }
4521     aSign = extractFloat64Sign( a );
4522     bSign = extractFloat64Sign( b );
4523     av = float64_val(a);
4524     bv = float64_val(b);
4525     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4526     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4527
4528 }
4529
4530 /*----------------------------------------------------------------------------
4531 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4532 | be compared, and 0 otherwise.  The invalid exception is raised if either
4533 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4534 | Standard for Binary Floating-Point Arithmetic.
4535 *----------------------------------------------------------------------------*/
4536
4537 int float64_unordered( float64 a, float64 b STATUS_PARAM )
4538 {
4539     a = float64_squash_input_denormal(a STATUS_VAR);
4540     b = float64_squash_input_denormal(b STATUS_VAR);
4541
4542     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4543          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4544        ) {
4545         float_raise( float_flag_invalid STATUS_VAR);
4546         return 1;
4547     }
4548     return 0;
4549 }
4550
4551 /*----------------------------------------------------------------------------
4552 | Returns 1 if the double-precision floating-point value `a' is equal to the
4553 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4554 | exception.The comparison is performed according to the IEC/IEEE Standard
4555 | for Binary Floating-Point Arithmetic.
4556 *----------------------------------------------------------------------------*/
4557
4558 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4559 {
4560     uint64_t av, bv;
4561     a = float64_squash_input_denormal(a STATUS_VAR);
4562     b = float64_squash_input_denormal(b STATUS_VAR);
4563
4564     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4565          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4566        ) {
4567         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4568             float_raise( float_flag_invalid STATUS_VAR);
4569         }
4570         return 0;
4571     }
4572     av = float64_val(a);
4573     bv = float64_val(b);
4574     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4575
4576 }
4577
4578 /*----------------------------------------------------------------------------
4579 | Returns 1 if the double-precision floating-point value `a' is less than or
4580 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4581 | cause an exception.  Otherwise, the comparison is performed according to the
4582 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4583 *----------------------------------------------------------------------------*/
4584
4585 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4586 {
4587     flag aSign, bSign;
4588     uint64_t av, bv;
4589     a = float64_squash_input_denormal(a STATUS_VAR);
4590     b = float64_squash_input_denormal(b STATUS_VAR);
4591
4592     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4593          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4594        ) {
4595         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4596             float_raise( float_flag_invalid STATUS_VAR);
4597         }
4598         return 0;
4599     }
4600     aSign = extractFloat64Sign( a );
4601     bSign = extractFloat64Sign( b );
4602     av = float64_val(a);
4603     bv = float64_val(b);
4604     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4605     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4606
4607 }
4608
4609 /*----------------------------------------------------------------------------
4610 | Returns 1 if the double-precision floating-point value `a' is less than
4611 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4612 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4613 | Standard for Binary Floating-Point Arithmetic.
4614 *----------------------------------------------------------------------------*/
4615
4616 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4617 {
4618     flag aSign, bSign;
4619     uint64_t av, bv;
4620     a = float64_squash_input_denormal(a STATUS_VAR);
4621     b = float64_squash_input_denormal(b STATUS_VAR);
4622
4623     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4624          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4625        ) {
4626         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4627             float_raise( float_flag_invalid STATUS_VAR);
4628         }
4629         return 0;
4630     }
4631     aSign = extractFloat64Sign( a );
4632     bSign = extractFloat64Sign( b );
4633     av = float64_val(a);
4634     bv = float64_val(b);
4635     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4636     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4637
4638 }
4639
4640 /*----------------------------------------------------------------------------
4641 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4642 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4643 | comparison is performed according to the IEC/IEEE Standard for Binary
4644 | Floating-Point Arithmetic.
4645 *----------------------------------------------------------------------------*/
4646
4647 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4648 {
4649     a = float64_squash_input_denormal(a STATUS_VAR);
4650     b = float64_squash_input_denormal(b STATUS_VAR);
4651
4652     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4653          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4654        ) {
4655         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4656             float_raise( float_flag_invalid STATUS_VAR);
4657         }
4658         return 1;
4659     }
4660     return 0;
4661 }
4662
4663 /*----------------------------------------------------------------------------
4664 | Returns the result of converting the extended double-precision floating-
4665 | point value `a' to the 32-bit two's complement integer format.  The
4666 | conversion is performed according to the IEC/IEEE Standard for Binary
4667 | Floating-Point Arithmetic---which means in particular that the conversion
4668 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4669 | largest positive integer is returned.  Otherwise, if the conversion
4670 | overflows, the largest integer with the same sign as `a' is returned.
4671 *----------------------------------------------------------------------------*/
4672
4673 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4674 {
4675     flag aSign;
4676     int32 aExp, shiftCount;
4677     uint64_t aSig;
4678
4679     aSig = extractFloatx80Frac( a );
4680     aExp = extractFloatx80Exp( a );
4681     aSign = extractFloatx80Sign( a );
4682     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4683     shiftCount = 0x4037 - aExp;
4684     if ( shiftCount <= 0 ) shiftCount = 1;
4685     shift64RightJamming( aSig, shiftCount, &aSig );
4686     return roundAndPackInt32( aSign, aSig STATUS_VAR );
4687
4688 }
4689
4690 /*----------------------------------------------------------------------------
4691 | Returns the result of converting the extended double-precision floating-
4692 | point value `a' to the 32-bit two's complement integer format.  The
4693 | conversion is performed according to the IEC/IEEE Standard for Binary
4694 | Floating-Point Arithmetic, except that the conversion is always rounded
4695 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4696 | Otherwise, if the conversion overflows, the largest integer with the same
4697 | sign as `a' is returned.
4698 *----------------------------------------------------------------------------*/
4699
4700 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4701 {
4702     flag aSign;
4703     int32 aExp, shiftCount;
4704     uint64_t aSig, savedASig;
4705     int32_t z;
4706
4707     aSig = extractFloatx80Frac( a );
4708     aExp = extractFloatx80Exp( a );
4709     aSign = extractFloatx80Sign( a );
4710     if ( 0x401E < aExp ) {
4711         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4712         goto invalid;
4713     }
4714     else if ( aExp < 0x3FFF ) {
4715         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4716         return 0;
4717     }
4718     shiftCount = 0x403E - aExp;
4719     savedASig = aSig;
4720     aSig >>= shiftCount;
4721     z = aSig;
4722     if ( aSign ) z = - z;
4723     if ( ( z < 0 ) ^ aSign ) {
4724  invalid:
4725         float_raise( float_flag_invalid STATUS_VAR);
4726         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4727     }
4728     if ( ( aSig<<shiftCount ) != savedASig ) {
4729         STATUS(float_exception_flags) |= float_flag_inexact;
4730     }
4731     return z;
4732
4733 }
4734
4735 /*----------------------------------------------------------------------------
4736 | Returns the result of converting the extended double-precision floating-
4737 | point value `a' to the 64-bit two's complement integer format.  The
4738 | conversion is performed according to the IEC/IEEE Standard for Binary
4739 | Floating-Point Arithmetic---which means in particular that the conversion
4740 | is rounded according to the current rounding mode.  If `a' is a NaN,
4741 | the largest positive integer is returned.  Otherwise, if the conversion
4742 | overflows, the largest integer with the same sign as `a' is returned.
4743 *----------------------------------------------------------------------------*/
4744
4745 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4746 {
4747     flag aSign;
4748     int32 aExp, shiftCount;
4749     uint64_t aSig, aSigExtra;
4750
4751     aSig = extractFloatx80Frac( a );
4752     aExp = extractFloatx80Exp( a );
4753     aSign = extractFloatx80Sign( a );
4754     shiftCount = 0x403E - aExp;
4755     if ( shiftCount <= 0 ) {
4756         if ( shiftCount ) {
4757             float_raise( float_flag_invalid STATUS_VAR);
4758             if (    ! aSign
4759                  || (    ( aExp == 0x7FFF )
4760                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4761                ) {
4762                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4763             }
4764             return (int64_t) LIT64( 0x8000000000000000 );
4765         }
4766         aSigExtra = 0;
4767     }
4768     else {
4769         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4770     }
4771     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4772
4773 }
4774
4775 /*----------------------------------------------------------------------------
4776 | Returns the result of converting the extended double-precision floating-
4777 | point value `a' to the 64-bit two's complement integer format.  The
4778 | conversion is performed according to the IEC/IEEE Standard for Binary
4779 | Floating-Point Arithmetic, except that the conversion is always rounded
4780 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4781 | Otherwise, if the conversion overflows, the largest integer with the same
4782 | sign as `a' is returned.
4783 *----------------------------------------------------------------------------*/
4784
4785 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4786 {
4787     flag aSign;
4788     int32 aExp, shiftCount;
4789     uint64_t aSig;
4790     int64 z;
4791
4792     aSig = extractFloatx80Frac( a );
4793     aExp = extractFloatx80Exp( a );
4794     aSign = extractFloatx80Sign( a );
4795     shiftCount = aExp - 0x403E;
4796     if ( 0 <= shiftCount ) {
4797         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4798         if ( ( a.high != 0xC03E ) || aSig ) {
4799             float_raise( float_flag_invalid STATUS_VAR);
4800             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4801                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4802             }
4803         }
4804         return (int64_t) LIT64( 0x8000000000000000 );
4805     }
4806     else if ( aExp < 0x3FFF ) {
4807         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4808         return 0;
4809     }
4810     z = aSig>>( - shiftCount );
4811     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4812         STATUS(float_exception_flags) |= float_flag_inexact;
4813     }
4814     if ( aSign ) z = - z;
4815     return z;
4816
4817 }
4818
4819 /*----------------------------------------------------------------------------
4820 | Returns the result of converting the extended double-precision floating-
4821 | point value `a' to the single-precision floating-point format.  The
4822 | conversion is performed according to the IEC/IEEE Standard for Binary
4823 | Floating-Point Arithmetic.
4824 *----------------------------------------------------------------------------*/
4825
4826 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4827 {
4828     flag aSign;
4829     int32 aExp;
4830     uint64_t aSig;
4831
4832     aSig = extractFloatx80Frac( a );
4833     aExp = extractFloatx80Exp( a );
4834     aSign = extractFloatx80Sign( a );
4835     if ( aExp == 0x7FFF ) {
4836         if ( (uint64_t) ( aSig<<1 ) ) {
4837             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4838         }
4839         return packFloat32( aSign, 0xFF, 0 );
4840     }
4841     shift64RightJamming( aSig, 33, &aSig );
4842     if ( aExp || aSig ) aExp -= 0x3F81;
4843     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4844
4845 }
4846
4847 /*----------------------------------------------------------------------------
4848 | Returns the result of converting the extended double-precision floating-
4849 | point value `a' to the double-precision floating-point format.  The
4850 | conversion is performed according to the IEC/IEEE Standard for Binary
4851 | Floating-Point Arithmetic.
4852 *----------------------------------------------------------------------------*/
4853
4854 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4855 {
4856     flag aSign;
4857     int32 aExp;
4858     uint64_t aSig, zSig;
4859
4860     aSig = extractFloatx80Frac( a );
4861     aExp = extractFloatx80Exp( a );
4862     aSign = extractFloatx80Sign( a );
4863     if ( aExp == 0x7FFF ) {
4864         if ( (uint64_t) ( aSig<<1 ) ) {
4865             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4866         }
4867         return packFloat64( aSign, 0x7FF, 0 );
4868     }
4869     shift64RightJamming( aSig, 1, &zSig );
4870     if ( aExp || aSig ) aExp -= 0x3C01;
4871     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4872
4873 }
4874
4875 /*----------------------------------------------------------------------------
4876 | Returns the result of converting the extended double-precision floating-
4877 | point value `a' to the quadruple-precision floating-point format.  The
4878 | conversion is performed according to the IEC/IEEE Standard for Binary
4879 | Floating-Point Arithmetic.
4880 *----------------------------------------------------------------------------*/
4881
4882 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4883 {
4884     flag aSign;
4885     int_fast16_t aExp;
4886     uint64_t aSig, zSig0, zSig1;
4887
4888     aSig = extractFloatx80Frac( a );
4889     aExp = extractFloatx80Exp( a );
4890     aSign = extractFloatx80Sign( a );
4891     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4892         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4893     }
4894     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4895     return packFloat128( aSign, aExp, zSig0, zSig1 );
4896
4897 }
4898
4899 /*----------------------------------------------------------------------------
4900 | Rounds the extended double-precision floating-point value `a' to an integer,
4901 | and returns the result as an extended quadruple-precision floating-point
4902 | value.  The operation is performed according to the IEC/IEEE Standard for
4903 | Binary Floating-Point Arithmetic.
4904 *----------------------------------------------------------------------------*/
4905
4906 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4907 {
4908     flag aSign;
4909     int32 aExp;
4910     uint64_t lastBitMask, roundBitsMask;
4911     floatx80 z;
4912
4913     aExp = extractFloatx80Exp( a );
4914     if ( 0x403E <= aExp ) {
4915         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4916             return propagateFloatx80NaN( a, a STATUS_VAR );
4917         }
4918         return a;
4919     }
4920     if ( aExp < 0x3FFF ) {
4921         if (    ( aExp == 0 )
4922              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4923             return a;
4924         }
4925         STATUS(float_exception_flags) |= float_flag_inexact;
4926         aSign = extractFloatx80Sign( a );
4927         switch ( STATUS(float_rounding_mode) ) {
4928          case float_round_nearest_even:
4929             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4930                ) {
4931                 return
4932                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4933             }
4934             break;
4935         case float_round_ties_away:
4936             if (aExp == 0x3FFE) {
4937                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4938             }
4939             break;
4940          case float_round_down:
4941             return
4942                   aSign ?
4943                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4944                 : packFloatx80( 0, 0, 0 );
4945          case float_round_up:
4946             return
4947                   aSign ? packFloatx80( 1, 0, 0 )
4948                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4949         }
4950         return packFloatx80( aSign, 0, 0 );
4951     }
4952     lastBitMask = 1;
4953     lastBitMask <<= 0x403E - aExp;
4954     roundBitsMask = lastBitMask - 1;
4955     z = a;
4956     switch (STATUS(float_rounding_mode)) {
4957     case float_round_nearest_even:
4958         z.low += lastBitMask>>1;
4959         if ((z.low & roundBitsMask) == 0) {
4960             z.low &= ~lastBitMask;
4961         }
4962         break;
4963     case float_round_ties_away:
4964         z.low += lastBitMask >> 1;
4965         break;
4966     case float_round_to_zero:
4967         break;
4968     case float_round_up:
4969         if (!extractFloatx80Sign(z)) {
4970             z.low += roundBitsMask;
4971         }
4972         break;
4973     case float_round_down:
4974         if (extractFloatx80Sign(z)) {
4975             z.low += roundBitsMask;
4976         }
4977         break;
4978     default:
4979         abort();
4980     }
4981     z.low &= ~ roundBitsMask;
4982     if ( z.low == 0 ) {
4983         ++z.high;
4984         z.low = LIT64( 0x8000000000000000 );
4985     }
4986     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4987     return z;
4988
4989 }
4990
4991 /*----------------------------------------------------------------------------
4992 | Returns the result of adding the absolute values of the extended double-
4993 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4994 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4995 | The addition is performed according to the IEC/IEEE Standard for Binary
4996 | Floating-Point Arithmetic.
4997 *----------------------------------------------------------------------------*/
4998
4999 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
5000 {
5001     int32 aExp, bExp, zExp;
5002     uint64_t aSig, bSig, zSig0, zSig1;
5003     int32 expDiff;
5004
5005     aSig = extractFloatx80Frac( a );
5006     aExp = extractFloatx80Exp( a );
5007     bSig = extractFloatx80Frac( b );
5008     bExp = extractFloatx80Exp( b );
5009     expDiff = aExp - bExp;
5010     if ( 0 < expDiff ) {
5011         if ( aExp == 0x7FFF ) {
5012             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5013             return a;
5014         }
5015         if ( bExp == 0 ) --expDiff;
5016         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5017         zExp = aExp;
5018     }
5019     else if ( expDiff < 0 ) {
5020         if ( bExp == 0x7FFF ) {
5021             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5022             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5023         }
5024         if ( aExp == 0 ) ++expDiff;
5025         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5026         zExp = bExp;
5027     }
5028     else {
5029         if ( aExp == 0x7FFF ) {
5030             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5031                 return propagateFloatx80NaN( a, b STATUS_VAR );
5032             }
5033             return a;
5034         }
5035         zSig1 = 0;
5036         zSig0 = aSig + bSig;
5037         if ( aExp == 0 ) {
5038             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5039             goto roundAndPack;
5040         }
5041         zExp = aExp;
5042         goto shiftRight1;
5043     }
5044     zSig0 = aSig + bSig;
5045     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5046  shiftRight1:
5047     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5048     zSig0 |= LIT64( 0x8000000000000000 );
5049     ++zExp;
5050  roundAndPack:
5051     return
5052         roundAndPackFloatx80(
5053             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5054
5055 }
5056
5057 /*----------------------------------------------------------------------------
5058 | Returns the result of subtracting the absolute values of the extended
5059 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5060 | difference is negated before being returned.  `zSign' is ignored if the
5061 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5062 | Standard for Binary Floating-Point Arithmetic.
5063 *----------------------------------------------------------------------------*/
5064
5065 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
5066 {
5067     int32 aExp, bExp, zExp;
5068     uint64_t aSig, bSig, zSig0, zSig1;
5069     int32 expDiff;
5070     floatx80 z;
5071
5072     aSig = extractFloatx80Frac( a );
5073     aExp = extractFloatx80Exp( a );
5074     bSig = extractFloatx80Frac( b );
5075     bExp = extractFloatx80Exp( b );
5076     expDiff = aExp - bExp;
5077     if ( 0 < expDiff ) goto aExpBigger;
5078     if ( expDiff < 0 ) goto bExpBigger;
5079     if ( aExp == 0x7FFF ) {
5080         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5081             return propagateFloatx80NaN( a, b STATUS_VAR );
5082         }
5083         float_raise( float_flag_invalid STATUS_VAR);
5084         z.low = floatx80_default_nan_low;
5085         z.high = floatx80_default_nan_high;
5086         return z;
5087     }
5088     if ( aExp == 0 ) {
5089         aExp = 1;
5090         bExp = 1;
5091     }
5092     zSig1 = 0;
5093     if ( bSig < aSig ) goto aBigger;
5094     if ( aSig < bSig ) goto bBigger;
5095     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
5096  bExpBigger:
5097     if ( bExp == 0x7FFF ) {
5098         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5099         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5100     }
5101     if ( aExp == 0 ) ++expDiff;
5102     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5103  bBigger:
5104     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5105     zExp = bExp;
5106     zSign ^= 1;
5107     goto normalizeRoundAndPack;
5108  aExpBigger:
5109     if ( aExp == 0x7FFF ) {
5110         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5111         return a;
5112     }
5113     if ( bExp == 0 ) --expDiff;
5114     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5115  aBigger:
5116     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5117     zExp = aExp;
5118  normalizeRoundAndPack:
5119     return
5120         normalizeRoundAndPackFloatx80(
5121             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5122
5123 }
5124
5125 /*----------------------------------------------------------------------------
5126 | Returns the result of adding the extended double-precision floating-point
5127 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5128 | Standard for Binary Floating-Point Arithmetic.
5129 *----------------------------------------------------------------------------*/
5130
5131 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
5132 {
5133     flag aSign, bSign;
5134
5135     aSign = extractFloatx80Sign( a );
5136     bSign = extractFloatx80Sign( b );
5137     if ( aSign == bSign ) {
5138         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5139     }
5140     else {
5141         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5142     }
5143
5144 }
5145
5146 /*----------------------------------------------------------------------------
5147 | Returns the result of subtracting the extended double-precision floating-
5148 | point values `a' and `b'.  The operation is performed according to the
5149 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5150 *----------------------------------------------------------------------------*/
5151
5152 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
5153 {
5154     flag aSign, bSign;
5155
5156     aSign = extractFloatx80Sign( a );
5157     bSign = extractFloatx80Sign( b );
5158     if ( aSign == bSign ) {
5159         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5160     }
5161     else {
5162         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5163     }
5164
5165 }
5166
5167 /*----------------------------------------------------------------------------
5168 | Returns the result of multiplying the extended double-precision floating-
5169 | point values `a' and `b'.  The operation is performed according to the
5170 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5172
5173 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
5174 {
5175     flag aSign, bSign, zSign;
5176     int32 aExp, bExp, zExp;
5177     uint64_t aSig, bSig, zSig0, zSig1;
5178     floatx80 z;
5179
5180     aSig = extractFloatx80Frac( a );
5181     aExp = extractFloatx80Exp( a );
5182     aSign = extractFloatx80Sign( a );
5183     bSig = extractFloatx80Frac( b );
5184     bExp = extractFloatx80Exp( b );
5185     bSign = extractFloatx80Sign( b );
5186     zSign = aSign ^ bSign;
5187     if ( aExp == 0x7FFF ) {
5188         if (    (uint64_t) ( aSig<<1 )
5189              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5190             return propagateFloatx80NaN( a, b STATUS_VAR );
5191         }
5192         if ( ( bExp | bSig ) == 0 ) goto invalid;
5193         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5194     }
5195     if ( bExp == 0x7FFF ) {
5196         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5197         if ( ( aExp | aSig ) == 0 ) {
5198  invalid:
5199             float_raise( float_flag_invalid STATUS_VAR);
5200             z.low = floatx80_default_nan_low;
5201             z.high = floatx80_default_nan_high;
5202             return z;
5203         }
5204         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5205     }
5206     if ( aExp == 0 ) {
5207         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5208         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5209     }
5210     if ( bExp == 0 ) {
5211         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5212         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5213     }
5214     zExp = aExp + bExp - 0x3FFE;
5215     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5216     if ( 0 < (int64_t) zSig0 ) {
5217         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5218         --zExp;
5219     }
5220     return
5221         roundAndPackFloatx80(
5222             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5223
5224 }
5225
5226 /*----------------------------------------------------------------------------
5227 | Returns the result of dividing the extended double-precision floating-point
5228 | value `a' by the corresponding value `b'.  The operation is performed
5229 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5230 *----------------------------------------------------------------------------*/
5231
5232 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
5233 {
5234     flag aSign, bSign, zSign;
5235     int32 aExp, bExp, zExp;
5236     uint64_t aSig, bSig, zSig0, zSig1;
5237     uint64_t rem0, rem1, rem2, term0, term1, term2;
5238     floatx80 z;
5239
5240     aSig = extractFloatx80Frac( a );
5241     aExp = extractFloatx80Exp( a );
5242     aSign = extractFloatx80Sign( a );
5243     bSig = extractFloatx80Frac( b );
5244     bExp = extractFloatx80Exp( b );
5245     bSign = extractFloatx80Sign( b );
5246     zSign = aSign ^ bSign;
5247     if ( aExp == 0x7FFF ) {
5248         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5249         if ( bExp == 0x7FFF ) {
5250             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5251             goto invalid;
5252         }
5253         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5254     }
5255     if ( bExp == 0x7FFF ) {
5256         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5257         return packFloatx80( zSign, 0, 0 );
5258     }
5259     if ( bExp == 0 ) {
5260         if ( bSig == 0 ) {
5261             if ( ( aExp | aSig ) == 0 ) {
5262  invalid:
5263                 float_raise( float_flag_invalid STATUS_VAR);
5264                 z.low = floatx80_default_nan_low;
5265                 z.high = floatx80_default_nan_high;
5266                 return z;
5267             }
5268             float_raise( float_flag_divbyzero STATUS_VAR);
5269             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5270         }
5271         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5272     }
5273     if ( aExp == 0 ) {
5274         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5275         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5276     }
5277     zExp = aExp - bExp + 0x3FFE;
5278     rem1 = 0;
5279     if ( bSig <= aSig ) {
5280         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5281         ++zExp;
5282     }
5283     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5284     mul64To128( bSig, zSig0, &term0, &term1 );
5285     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5286     while ( (int64_t) rem0 < 0 ) {
5287         --zSig0;
5288         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5289     }
5290     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5291     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5292         mul64To128( bSig, zSig1, &term1, &term2 );
5293         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5294         while ( (int64_t) rem1 < 0 ) {
5295             --zSig1;
5296             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5297         }
5298         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5299     }
5300     return
5301         roundAndPackFloatx80(
5302             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5303
5304 }
5305
5306 /*----------------------------------------------------------------------------
5307 | Returns the remainder of the extended double-precision floating-point value
5308 | `a' with respect to the corresponding value `b'.  The operation is performed
5309 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5310 *----------------------------------------------------------------------------*/
5311
5312 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
5313 {
5314     flag aSign, zSign;
5315     int32 aExp, bExp, expDiff;
5316     uint64_t aSig0, aSig1, bSig;
5317     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5318     floatx80 z;
5319
5320     aSig0 = extractFloatx80Frac( a );
5321     aExp = extractFloatx80Exp( a );
5322     aSign = extractFloatx80Sign( a );
5323     bSig = extractFloatx80Frac( b );
5324     bExp = extractFloatx80Exp( b );
5325     if ( aExp == 0x7FFF ) {
5326         if (    (uint64_t) ( aSig0<<1 )
5327              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5328             return propagateFloatx80NaN( a, b STATUS_VAR );
5329         }
5330         goto invalid;
5331     }
5332     if ( bExp == 0x7FFF ) {
5333         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5334         return a;
5335     }
5336     if ( bExp == 0 ) {
5337         if ( bSig == 0 ) {
5338  invalid:
5339             float_raise( float_flag_invalid STATUS_VAR);
5340             z.low = floatx80_default_nan_low;
5341             z.high = floatx80_default_nan_high;
5342             return z;
5343         }
5344         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5345     }
5346     if ( aExp == 0 ) {
5347         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5348         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5349     }
5350     bSig |= LIT64( 0x8000000000000000 );
5351     zSign = aSign;
5352     expDiff = aExp - bExp;
5353     aSig1 = 0;
5354     if ( expDiff < 0 ) {
5355         if ( expDiff < -1 ) return a;
5356         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5357         expDiff = 0;
5358     }
5359     q = ( bSig <= aSig0 );
5360     if ( q ) aSig0 -= bSig;
5361     expDiff -= 64;
5362     while ( 0 < expDiff ) {
5363         q = estimateDiv128To64( aSig0, aSig1, bSig );
5364         q = ( 2 < q ) ? q - 2 : 0;
5365         mul64To128( bSig, q, &term0, &term1 );
5366         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5367         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5368         expDiff -= 62;
5369     }
5370     expDiff += 64;
5371     if ( 0 < expDiff ) {
5372         q = estimateDiv128To64( aSig0, aSig1, bSig );
5373         q = ( 2 < q ) ? q - 2 : 0;
5374         q >>= 64 - expDiff;
5375         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5376         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5377         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5378         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5379             ++q;
5380             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5381         }
5382     }
5383     else {
5384         term1 = 0;
5385         term0 = bSig;
5386     }
5387     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5388     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5389          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5390               && ( q & 1 ) )
5391        ) {
5392         aSig0 = alternateASig0;
5393         aSig1 = alternateASig1;
5394         zSign = ! zSign;
5395     }
5396     return
5397         normalizeRoundAndPackFloatx80(
5398             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5399
5400 }
5401
5402 /*----------------------------------------------------------------------------
5403 | Returns the square root of the extended double-precision floating-point
5404 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5405 | for Binary Floating-Point Arithmetic.
5406 *----------------------------------------------------------------------------*/
5407
5408 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5409 {
5410     flag aSign;
5411     int32 aExp, zExp;
5412     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5413     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5414     floatx80 z;
5415
5416     aSig0 = extractFloatx80Frac( a );
5417     aExp = extractFloatx80Exp( a );
5418     aSign = extractFloatx80Sign( a );
5419     if ( aExp == 0x7FFF ) {
5420         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5421         if ( ! aSign ) return a;
5422         goto invalid;
5423     }
5424     if ( aSign ) {
5425         if ( ( aExp | aSig0 ) == 0 ) return a;
5426  invalid:
5427         float_raise( float_flag_invalid STATUS_VAR);
5428         z.low = floatx80_default_nan_low;
5429         z.high = floatx80_default_nan_high;
5430         return z;
5431     }
5432     if ( aExp == 0 ) {
5433         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5434         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5435     }
5436     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5437     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5438     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5439     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5440     doubleZSig0 = zSig0<<1;
5441     mul64To128( zSig0, zSig0, &term0, &term1 );
5442     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5443     while ( (int64_t) rem0 < 0 ) {
5444         --zSig0;
5445         doubleZSig0 -= 2;
5446         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5447     }
5448     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5449     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5450         if ( zSig1 == 0 ) zSig1 = 1;
5451         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5452         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5453         mul64To128( zSig1, zSig1, &term2, &term3 );
5454         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5455         while ( (int64_t) rem1 < 0 ) {
5456             --zSig1;
5457             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5458             term3 |= 1;
5459             term2 |= doubleZSig0;
5460             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5461         }
5462         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5463     }
5464     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5465     zSig0 |= doubleZSig0;
5466     return
5467         roundAndPackFloatx80(
5468             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5469
5470 }
5471
5472 /*----------------------------------------------------------------------------
5473 | Returns 1 if the extended double-precision floating-point value `a' is equal
5474 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5475 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5477 *----------------------------------------------------------------------------*/
5478
5479 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5480 {
5481
5482     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5483               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5484          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5485               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5486        ) {
5487         float_raise( float_flag_invalid STATUS_VAR);
5488         return 0;
5489     }
5490     return
5491            ( a.low == b.low )
5492         && (    ( a.high == b.high )
5493              || (    ( a.low == 0 )
5494                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5495            );
5496
5497 }
5498
5499 /*----------------------------------------------------------------------------
5500 | Returns 1 if the extended double-precision floating-point value `a' is
5501 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5502 | invalid exception is raised if either operand is a NaN.  The comparison is
5503 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5504 | Arithmetic.
5505 *----------------------------------------------------------------------------*/
5506
5507 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5508 {
5509     flag aSign, bSign;
5510
5511     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5512               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5513          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5514               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5515        ) {
5516         float_raise( float_flag_invalid STATUS_VAR);
5517         return 0;
5518     }
5519     aSign = extractFloatx80Sign( a );
5520     bSign = extractFloatx80Sign( b );
5521     if ( aSign != bSign ) {
5522         return
5523                aSign
5524             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5525                  == 0 );
5526     }
5527     return
5528           aSign ? le128( b.high, b.low, a.high, a.low )
5529         : le128( a.high, a.low, b.high, b.low );
5530
5531 }
5532
5533 /*----------------------------------------------------------------------------
5534 | Returns 1 if the extended double-precision floating-point value `a' is
5535 | less than the corresponding value `b', and 0 otherwise.  The invalid
5536 | exception is raised if either operand is a NaN.  The comparison is performed
5537 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5538 *----------------------------------------------------------------------------*/
5539
5540 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5541 {
5542     flag aSign, bSign;
5543
5544     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5545               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5546          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5547               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5548        ) {
5549         float_raise( float_flag_invalid STATUS_VAR);
5550         return 0;
5551     }
5552     aSign = extractFloatx80Sign( a );
5553     bSign = extractFloatx80Sign( b );
5554     if ( aSign != bSign ) {
5555         return
5556                aSign
5557             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5558                  != 0 );
5559     }
5560     return
5561           aSign ? lt128( b.high, b.low, a.high, a.low )
5562         : lt128( a.high, a.low, b.high, b.low );
5563
5564 }
5565
5566 /*----------------------------------------------------------------------------
5567 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5568 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5569 | either operand is a NaN.   The comparison is performed according to the
5570 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5571 *----------------------------------------------------------------------------*/
5572 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5573 {
5574     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5575               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5576          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5577               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5578        ) {
5579         float_raise( float_flag_invalid STATUS_VAR);
5580         return 1;
5581     }
5582     return 0;
5583 }
5584
5585 /*----------------------------------------------------------------------------
5586 | Returns 1 if the extended double-precision floating-point value `a' is
5587 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5588 | cause an exception.  The comparison is performed according to the IEC/IEEE
5589 | Standard for Binary Floating-Point Arithmetic.
5590 *----------------------------------------------------------------------------*/
5591
5592 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5593 {
5594
5595     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5596               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5597          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5598               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5599        ) {
5600         if (    floatx80_is_signaling_nan( a )
5601              || floatx80_is_signaling_nan( b ) ) {
5602             float_raise( float_flag_invalid STATUS_VAR);
5603         }
5604         return 0;
5605     }
5606     return
5607            ( a.low == b.low )
5608         && (    ( a.high == b.high )
5609              || (    ( a.low == 0 )
5610                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5611            );
5612
5613 }
5614
5615 /*----------------------------------------------------------------------------
5616 | Returns 1 if the extended double-precision floating-point value `a' is less
5617 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5618 | do not cause an exception.  Otherwise, the comparison is performed according
5619 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5620 *----------------------------------------------------------------------------*/
5621
5622 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5623 {
5624     flag aSign, bSign;
5625
5626     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5627               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5628          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5629               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5630        ) {
5631         if (    floatx80_is_signaling_nan( a )
5632              || floatx80_is_signaling_nan( b ) ) {
5633             float_raise( float_flag_invalid STATUS_VAR);
5634         }
5635         return 0;
5636     }
5637     aSign = extractFloatx80Sign( a );
5638     bSign = extractFloatx80Sign( b );
5639     if ( aSign != bSign ) {
5640         return
5641                aSign
5642             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5643                  == 0 );
5644     }
5645     return
5646           aSign ? le128( b.high, b.low, a.high, a.low )
5647         : le128( a.high, a.low, b.high, b.low );
5648
5649 }
5650
5651 /*----------------------------------------------------------------------------
5652 | Returns 1 if the extended double-precision floating-point value `a' is less
5653 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5654 | an exception.  Otherwise, the comparison is performed according to the
5655 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5656 *----------------------------------------------------------------------------*/
5657
5658 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5659 {
5660     flag aSign, bSign;
5661
5662     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5663               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5664          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5665               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5666        ) {
5667         if (    floatx80_is_signaling_nan( a )
5668              || floatx80_is_signaling_nan( b ) ) {
5669             float_raise( float_flag_invalid STATUS_VAR);
5670         }
5671         return 0;
5672     }
5673     aSign = extractFloatx80Sign( a );
5674     bSign = extractFloatx80Sign( b );
5675     if ( aSign != bSign ) {
5676         return
5677                aSign
5678             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5679                  != 0 );
5680     }
5681     return
5682           aSign ? lt128( b.high, b.low, a.high, a.low )
5683         : lt128( a.high, a.low, b.high, b.low );
5684
5685 }
5686
5687 /*----------------------------------------------------------------------------
5688 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5689 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5690 | The comparison is performed according to the IEC/IEEE Standard for Binary
5691 | Floating-Point Arithmetic.
5692 *----------------------------------------------------------------------------*/
5693 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5694 {
5695     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5696               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5697          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5698               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5699        ) {
5700         if (    floatx80_is_signaling_nan( a )
5701              || floatx80_is_signaling_nan( b ) ) {
5702             float_raise( float_flag_invalid STATUS_VAR);
5703         }
5704         return 1;
5705     }
5706     return 0;
5707 }
5708
5709 /*----------------------------------------------------------------------------
5710 | Returns the result of converting the quadruple-precision floating-point
5711 | value `a' to the 32-bit two's complement integer format.  The conversion
5712 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5713 | Arithmetic---which means in particular that the conversion is rounded
5714 | according to the current rounding mode.  If `a' is a NaN, the largest
5715 | positive integer is returned.  Otherwise, if the conversion overflows, the
5716 | largest integer with the same sign as `a' is returned.
5717 *----------------------------------------------------------------------------*/
5718
5719 int32 float128_to_int32( float128 a STATUS_PARAM )
5720 {
5721     flag aSign;
5722     int32 aExp, shiftCount;
5723     uint64_t aSig0, aSig1;
5724
5725     aSig1 = extractFloat128Frac1( a );
5726     aSig0 = extractFloat128Frac0( a );
5727     aExp = extractFloat128Exp( a );
5728     aSign = extractFloat128Sign( a );
5729     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5730     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5731     aSig0 |= ( aSig1 != 0 );
5732     shiftCount = 0x4028 - aExp;
5733     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5734     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5735
5736 }
5737
5738 /*----------------------------------------------------------------------------
5739 | Returns the result of converting the quadruple-precision floating-point
5740 | value `a' to the 32-bit two's complement integer format.  The conversion
5741 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5742 | Arithmetic, except that the conversion is always rounded toward zero.  If
5743 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5744 | conversion overflows, the largest integer with the same sign as `a' is
5745 | returned.
5746 *----------------------------------------------------------------------------*/
5747
5748 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5749 {
5750     flag aSign;
5751     int32 aExp, shiftCount;
5752     uint64_t aSig0, aSig1, savedASig;
5753     int32_t z;
5754
5755     aSig1 = extractFloat128Frac1( a );
5756     aSig0 = extractFloat128Frac0( a );
5757     aExp = extractFloat128Exp( a );
5758     aSign = extractFloat128Sign( a );
5759     aSig0 |= ( aSig1 != 0 );
5760     if ( 0x401E < aExp ) {
5761         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5762         goto invalid;
5763     }
5764     else if ( aExp < 0x3FFF ) {
5765         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5766         return 0;
5767     }
5768     aSig0 |= LIT64( 0x0001000000000000 );
5769     shiftCount = 0x402F - aExp;
5770     savedASig = aSig0;
5771     aSig0 >>= shiftCount;
5772     z = aSig0;
5773     if ( aSign ) z = - z;
5774     if ( ( z < 0 ) ^ aSign ) {
5775  invalid:
5776         float_raise( float_flag_invalid STATUS_VAR);
5777         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5778     }
5779     if ( ( aSig0<<shiftCount ) != savedASig ) {
5780         STATUS(float_exception_flags) |= float_flag_inexact;
5781     }
5782     return z;
5783
5784 }
5785
5786 /*----------------------------------------------------------------------------
5787 | Returns the result of converting the quadruple-precision floating-point
5788 | value `a' to the 64-bit two's complement integer format.  The conversion
5789 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5790 | Arithmetic---which means in particular that the conversion is rounded
5791 | according to the current rounding mode.  If `a' is a NaN, the largest
5792 | positive integer is returned.  Otherwise, if the conversion overflows, the
5793 | largest integer with the same sign as `a' is returned.
5794 *----------------------------------------------------------------------------*/
5795
5796 int64 float128_to_int64( float128 a STATUS_PARAM )
5797 {
5798     flag aSign;
5799     int32 aExp, shiftCount;
5800     uint64_t aSig0, aSig1;
5801
5802     aSig1 = extractFloat128Frac1( a );
5803     aSig0 = extractFloat128Frac0( a );
5804     aExp = extractFloat128Exp( a );
5805     aSign = extractFloat128Sign( a );
5806     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5807     shiftCount = 0x402F - aExp;
5808     if ( shiftCount <= 0 ) {
5809         if ( 0x403E < aExp ) {
5810             float_raise( float_flag_invalid STATUS_VAR);
5811             if (    ! aSign
5812                  || (    ( aExp == 0x7FFF )
5813                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5814                     )
5815                ) {
5816                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5817             }
5818             return (int64_t) LIT64( 0x8000000000000000 );
5819         }
5820         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5821     }
5822     else {
5823         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5824     }
5825     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5826
5827 }
5828
5829 /*----------------------------------------------------------------------------
5830 | Returns the result of converting the quadruple-precision floating-point
5831 | value `a' to the 64-bit two's complement integer format.  The conversion
5832 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5833 | Arithmetic, except that the conversion is always rounded toward zero.
5834 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5835 | the conversion overflows, the largest integer with the same sign as `a' is
5836 | returned.
5837 *----------------------------------------------------------------------------*/
5838
5839 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5840 {
5841     flag aSign;
5842     int32 aExp, shiftCount;
5843     uint64_t aSig0, aSig1;
5844     int64 z;
5845
5846     aSig1 = extractFloat128Frac1( a );
5847     aSig0 = extractFloat128Frac0( a );
5848     aExp = extractFloat128Exp( a );
5849     aSign = extractFloat128Sign( a );
5850     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5851     shiftCount = aExp - 0x402F;
5852     if ( 0 < shiftCount ) {
5853         if ( 0x403E <= aExp ) {
5854             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5855             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5856                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5857                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5858             }
5859             else {
5860                 float_raise( float_flag_invalid STATUS_VAR);
5861                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5862                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5863                 }
5864             }
5865             return (int64_t) LIT64( 0x8000000000000000 );
5866         }
5867         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5868         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5869             STATUS(float_exception_flags) |= float_flag_inexact;
5870         }
5871     }
5872     else {
5873         if ( aExp < 0x3FFF ) {
5874             if ( aExp | aSig0 | aSig1 ) {
5875                 STATUS(float_exception_flags) |= float_flag_inexact;
5876             }
5877             return 0;
5878         }
5879         z = aSig0>>( - shiftCount );
5880         if (    aSig1
5881              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5882             STATUS(float_exception_flags) |= float_flag_inexact;
5883         }
5884     }
5885     if ( aSign ) z = - z;
5886     return z;
5887
5888 }
5889
5890 /*----------------------------------------------------------------------------
5891 | Returns the result of converting the quadruple-precision floating-point
5892 | value `a' to the single-precision floating-point format.  The conversion
5893 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5894 | Arithmetic.
5895 *----------------------------------------------------------------------------*/
5896
5897 float32 float128_to_float32( float128 a STATUS_PARAM )
5898 {
5899     flag aSign;
5900     int32 aExp;
5901     uint64_t aSig0, aSig1;
5902     uint32_t zSig;
5903
5904     aSig1 = extractFloat128Frac1( a );
5905     aSig0 = extractFloat128Frac0( a );
5906     aExp = extractFloat128Exp( a );
5907     aSign = extractFloat128Sign( a );
5908     if ( aExp == 0x7FFF ) {
5909         if ( aSig0 | aSig1 ) {
5910             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5911         }
5912         return packFloat32( aSign, 0xFF, 0 );
5913     }
5914     aSig0 |= ( aSig1 != 0 );
5915     shift64RightJamming( aSig0, 18, &aSig0 );
5916     zSig = aSig0;
5917     if ( aExp || zSig ) {
5918         zSig |= 0x40000000;
5919         aExp -= 0x3F81;
5920     }
5921     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5922
5923 }
5924
5925 /*----------------------------------------------------------------------------
5926 | Returns the result of converting the quadruple-precision floating-point
5927 | value `a' to the double-precision floating-point format.  The conversion
5928 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5929 | Arithmetic.
5930 *----------------------------------------------------------------------------*/
5931
5932 float64 float128_to_float64( float128 a STATUS_PARAM )
5933 {
5934     flag aSign;
5935     int32 aExp;
5936     uint64_t aSig0, aSig1;
5937
5938     aSig1 = extractFloat128Frac1( a );
5939     aSig0 = extractFloat128Frac0( a );
5940     aExp = extractFloat128Exp( a );
5941     aSign = extractFloat128Sign( a );
5942     if ( aExp == 0x7FFF ) {
5943         if ( aSig0 | aSig1 ) {
5944             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5945         }
5946         return packFloat64( aSign, 0x7FF, 0 );
5947     }
5948     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5949     aSig0 |= ( aSig1 != 0 );
5950     if ( aExp || aSig0 ) {
5951         aSig0 |= LIT64( 0x4000000000000000 );
5952         aExp -= 0x3C01;
5953     }
5954     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5955
5956 }
5957
5958 /*----------------------------------------------------------------------------
5959 | Returns the result of converting the quadruple-precision floating-point
5960 | value `a' to the extended double-precision floating-point format.  The
5961 | conversion is performed according to the IEC/IEEE Standard for Binary
5962 | Floating-Point Arithmetic.
5963 *----------------------------------------------------------------------------*/
5964
5965 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5966 {
5967     flag aSign;
5968     int32 aExp;
5969     uint64_t aSig0, aSig1;
5970
5971     aSig1 = extractFloat128Frac1( a );
5972     aSig0 = extractFloat128Frac0( a );
5973     aExp = extractFloat128Exp( a );
5974     aSign = extractFloat128Sign( a );
5975     if ( aExp == 0x7FFF ) {
5976         if ( aSig0 | aSig1 ) {
5977             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5978         }
5979         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5980     }
5981     if ( aExp == 0 ) {
5982         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5983         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5984     }
5985     else {
5986         aSig0 |= LIT64( 0x0001000000000000 );
5987     }
5988     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5989     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5990
5991 }
5992
5993 /*----------------------------------------------------------------------------
5994 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5995 | returns the result as a quadruple-precision floating-point value.  The
5996 | operation is performed according to the IEC/IEEE Standard for Binary
5997 | Floating-Point Arithmetic.
5998 *----------------------------------------------------------------------------*/
5999
6000 float128 float128_round_to_int( float128 a STATUS_PARAM )
6001 {
6002     flag aSign;
6003     int32 aExp;
6004     uint64_t lastBitMask, roundBitsMask;
6005     float128 z;
6006
6007     aExp = extractFloat128Exp( a );
6008     if ( 0x402F <= aExp ) {
6009         if ( 0x406F <= aExp ) {
6010             if (    ( aExp == 0x7FFF )
6011                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6012                ) {
6013                 return propagateFloat128NaN( a, a STATUS_VAR );
6014             }
6015             return a;
6016         }
6017         lastBitMask = 1;
6018         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6019         roundBitsMask = lastBitMask - 1;
6020         z = a;
6021         switch (STATUS(float_rounding_mode)) {
6022         case float_round_nearest_even:
6023             if ( lastBitMask ) {
6024                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6025                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6026             }
6027             else {
6028                 if ( (int64_t) z.low < 0 ) {
6029                     ++z.high;
6030                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6031                 }
6032             }
6033             break;
6034         case float_round_ties_away:
6035             if (lastBitMask) {
6036                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6037             } else {
6038                 if ((int64_t) z.low < 0) {
6039                     ++z.high;
6040                 }
6041             }
6042             break;
6043         case float_round_to_zero:
6044             break;
6045         case float_round_up:
6046             if (!extractFloat128Sign(z)) {
6047                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6048             }
6049             break;
6050         case float_round_down:
6051             if (extractFloat128Sign(z)) {
6052                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6053             }
6054             break;
6055         default:
6056             abort();
6057         }
6058         z.low &= ~ roundBitsMask;
6059     }
6060     else {
6061         if ( aExp < 0x3FFF ) {
6062             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6063             STATUS(float_exception_flags) |= float_flag_inexact;
6064             aSign = extractFloat128Sign( a );
6065             switch ( STATUS(float_rounding_mode) ) {
6066              case float_round_nearest_even:
6067                 if (    ( aExp == 0x3FFE )
6068                      && (   extractFloat128Frac0( a )
6069                           | extractFloat128Frac1( a ) )
6070                    ) {
6071                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6072                 }
6073                 break;
6074             case float_round_ties_away:
6075                 if (aExp == 0x3FFE) {
6076                     return packFloat128(aSign, 0x3FFF, 0, 0);
6077                 }
6078                 break;
6079              case float_round_down:
6080                 return
6081                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6082                     : packFloat128( 0, 0, 0, 0 );
6083              case float_round_up:
6084                 return
6085                       aSign ? packFloat128( 1, 0, 0, 0 )
6086                     : packFloat128( 0, 0x3FFF, 0, 0 );
6087             }
6088             return packFloat128( aSign, 0, 0, 0 );
6089         }
6090         lastBitMask = 1;
6091         lastBitMask <<= 0x402F - aExp;
6092         roundBitsMask = lastBitMask - 1;
6093         z.low = 0;
6094         z.high = a.high;
6095         switch (STATUS(float_rounding_mode)) {
6096         case float_round_nearest_even:
6097             z.high += lastBitMask>>1;
6098             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6099                 z.high &= ~ lastBitMask;
6100             }
6101             break;
6102         case float_round_ties_away:
6103             z.high += lastBitMask>>1;
6104             break;
6105         case float_round_to_zero:
6106             break;
6107         case float_round_up:
6108             if (!extractFloat128Sign(z)) {
6109                 z.high |= ( a.low != 0 );
6110                 z.high += roundBitsMask;
6111             }
6112             break;
6113         case float_round_down:
6114             if (extractFloat128Sign(z)) {
6115                 z.high |= (a.low != 0);
6116                 z.high += roundBitsMask;
6117             }
6118             break;
6119         default:
6120             abort();
6121         }
6122         z.high &= ~ roundBitsMask;
6123     }
6124     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6125         STATUS(float_exception_flags) |= float_flag_inexact;
6126     }
6127     return z;
6128
6129 }
6130
6131 /*----------------------------------------------------------------------------
6132 | Returns the result of adding the absolute values of the quadruple-precision
6133 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6134 | before being returned.  `zSign' is ignored if the result is a NaN.
6135 | The addition is performed according to the IEC/IEEE Standard for Binary
6136 | Floating-Point Arithmetic.
6137 *----------------------------------------------------------------------------*/
6138
6139 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6140 {
6141     int32 aExp, bExp, zExp;
6142     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6143     int32 expDiff;
6144
6145     aSig1 = extractFloat128Frac1( a );
6146     aSig0 = extractFloat128Frac0( a );
6147     aExp = extractFloat128Exp( a );
6148     bSig1 = extractFloat128Frac1( b );
6149     bSig0 = extractFloat128Frac0( b );
6150     bExp = extractFloat128Exp( b );
6151     expDiff = aExp - bExp;
6152     if ( 0 < expDiff ) {
6153         if ( aExp == 0x7FFF ) {
6154             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6155             return a;
6156         }
6157         if ( bExp == 0 ) {
6158             --expDiff;
6159         }
6160         else {
6161             bSig0 |= LIT64( 0x0001000000000000 );
6162         }
6163         shift128ExtraRightJamming(
6164             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6165         zExp = aExp;
6166     }
6167     else if ( expDiff < 0 ) {
6168         if ( bExp == 0x7FFF ) {
6169             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6170             return packFloat128( zSign, 0x7FFF, 0, 0 );
6171         }
6172         if ( aExp == 0 ) {
6173             ++expDiff;
6174         }
6175         else {
6176             aSig0 |= LIT64( 0x0001000000000000 );
6177         }
6178         shift128ExtraRightJamming(
6179             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6180         zExp = bExp;
6181     }
6182     else {
6183         if ( aExp == 0x7FFF ) {
6184             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6185                 return propagateFloat128NaN( a, b STATUS_VAR );
6186             }
6187             return a;
6188         }
6189         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6190         if ( aExp == 0 ) {
6191             if (STATUS(flush_to_zero)) {
6192                 if (zSig0 | zSig1) {
6193                     float_raise(float_flag_output_denormal STATUS_VAR);
6194                 }
6195                 return packFloat128(zSign, 0, 0, 0);
6196             }
6197             return packFloat128( zSign, 0, zSig0, zSig1 );
6198         }
6199         zSig2 = 0;
6200         zSig0 |= LIT64( 0x0002000000000000 );
6201         zExp = aExp;
6202         goto shiftRight1;
6203     }
6204     aSig0 |= LIT64( 0x0001000000000000 );
6205     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6206     --zExp;
6207     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6208     ++zExp;
6209  shiftRight1:
6210     shift128ExtraRightJamming(
6211         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6212  roundAndPack:
6213     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6214
6215 }
6216
6217 /*----------------------------------------------------------------------------
6218 | Returns the result of subtracting the absolute values of the quadruple-
6219 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6220 | difference is negated before being returned.  `zSign' is ignored if the
6221 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6222 | Standard for Binary Floating-Point Arithmetic.
6223 *----------------------------------------------------------------------------*/
6224
6225 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6226 {
6227     int32 aExp, bExp, zExp;
6228     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6229     int32 expDiff;
6230     float128 z;
6231
6232     aSig1 = extractFloat128Frac1( a );
6233     aSig0 = extractFloat128Frac0( a );
6234     aExp = extractFloat128Exp( a );
6235     bSig1 = extractFloat128Frac1( b );
6236     bSig0 = extractFloat128Frac0( b );
6237     bExp = extractFloat128Exp( b );
6238     expDiff = aExp - bExp;
6239     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6240     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6241     if ( 0 < expDiff ) goto aExpBigger;
6242     if ( expDiff < 0 ) goto bExpBigger;
6243     if ( aExp == 0x7FFF ) {
6244         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6245             return propagateFloat128NaN( a, b STATUS_VAR );
6246         }
6247         float_raise( float_flag_invalid STATUS_VAR);
6248         z.low = float128_default_nan_low;
6249         z.high = float128_default_nan_high;
6250         return z;
6251     }
6252     if ( aExp == 0 ) {
6253         aExp = 1;
6254         bExp = 1;
6255     }
6256     if ( bSig0 < aSig0 ) goto aBigger;
6257     if ( aSig0 < bSig0 ) goto bBigger;
6258     if ( bSig1 < aSig1 ) goto aBigger;
6259     if ( aSig1 < bSig1 ) goto bBigger;
6260     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
6261  bExpBigger:
6262     if ( bExp == 0x7FFF ) {
6263         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6264         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6265     }
6266     if ( aExp == 0 ) {
6267         ++expDiff;
6268     }
6269     else {
6270         aSig0 |= LIT64( 0x4000000000000000 );
6271     }
6272     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6273     bSig0 |= LIT64( 0x4000000000000000 );
6274  bBigger:
6275     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6276     zExp = bExp;
6277     zSign ^= 1;
6278     goto normalizeRoundAndPack;
6279  aExpBigger:
6280     if ( aExp == 0x7FFF ) {
6281         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6282         return a;
6283     }
6284     if ( bExp == 0 ) {
6285         --expDiff;
6286     }
6287     else {
6288         bSig0 |= LIT64( 0x4000000000000000 );
6289     }
6290     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6291     aSig0 |= LIT64( 0x4000000000000000 );
6292  aBigger:
6293     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6294     zExp = aExp;
6295  normalizeRoundAndPack:
6296     --zExp;
6297     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
6298
6299 }
6300
6301 /*----------------------------------------------------------------------------
6302 | Returns the result of adding the quadruple-precision floating-point values
6303 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6304 | for Binary Floating-Point Arithmetic.
6305 *----------------------------------------------------------------------------*/
6306
6307 float128 float128_add( float128 a, float128 b STATUS_PARAM )
6308 {
6309     flag aSign, bSign;
6310
6311     aSign = extractFloat128Sign( a );
6312     bSign = extractFloat128Sign( b );
6313     if ( aSign == bSign ) {
6314         return addFloat128Sigs( a, b, aSign STATUS_VAR );
6315     }
6316     else {
6317         return subFloat128Sigs( a, b, aSign STATUS_VAR );
6318     }
6319
6320 }
6321
6322 /*----------------------------------------------------------------------------
6323 | Returns the result of subtracting the quadruple-precision floating-point
6324 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6325 | Standard for Binary Floating-Point Arithmetic.
6326 *----------------------------------------------------------------------------*/
6327
6328 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
6329 {
6330     flag aSign, bSign;
6331
6332     aSign = extractFloat128Sign( a );
6333     bSign = extractFloat128Sign( b );
6334     if ( aSign == bSign ) {
6335         return subFloat128Sigs( a, b, aSign STATUS_VAR );
6336     }
6337     else {
6338         return addFloat128Sigs( a, b, aSign STATUS_VAR );
6339     }
6340
6341 }
6342
6343 /*----------------------------------------------------------------------------
6344 | Returns the result of multiplying the quadruple-precision floating-point
6345 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6346 | Standard for Binary Floating-Point Arithmetic.
6347 *----------------------------------------------------------------------------*/
6348
6349 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
6350 {
6351     flag aSign, bSign, zSign;
6352     int32 aExp, bExp, zExp;
6353     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6354     float128 z;
6355
6356     aSig1 = extractFloat128Frac1( a );
6357     aSig0 = extractFloat128Frac0( a );
6358     aExp = extractFloat128Exp( a );
6359     aSign = extractFloat128Sign( a );
6360     bSig1 = extractFloat128Frac1( b );
6361     bSig0 = extractFloat128Frac0( b );
6362     bExp = extractFloat128Exp( b );
6363     bSign = extractFloat128Sign( b );
6364     zSign = aSign ^ bSign;
6365     if ( aExp == 0x7FFF ) {
6366         if (    ( aSig0 | aSig1 )
6367              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6368             return propagateFloat128NaN( a, b STATUS_VAR );
6369         }
6370         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6371         return packFloat128( zSign, 0x7FFF, 0, 0 );
6372     }
6373     if ( bExp == 0x7FFF ) {
6374         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6375         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6376  invalid:
6377             float_raise( float_flag_invalid STATUS_VAR);
6378             z.low = float128_default_nan_low;
6379             z.high = float128_default_nan_high;
6380             return z;
6381         }
6382         return packFloat128( zSign, 0x7FFF, 0, 0 );
6383     }
6384     if ( aExp == 0 ) {
6385         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6386         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6387     }
6388     if ( bExp == 0 ) {
6389         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6390         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6391     }
6392     zExp = aExp + bExp - 0x4000;
6393     aSig0 |= LIT64( 0x0001000000000000 );
6394     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6395     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6396     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6397     zSig2 |= ( zSig3 != 0 );
6398     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6399         shift128ExtraRightJamming(
6400             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6401         ++zExp;
6402     }
6403     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6404
6405 }
6406
6407 /*----------------------------------------------------------------------------
6408 | Returns the result of dividing the quadruple-precision floating-point value
6409 | `a' by the corresponding value `b'.  The operation is performed according to
6410 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6411 *----------------------------------------------------------------------------*/
6412
6413 float128 float128_div( float128 a, float128 b STATUS_PARAM )
6414 {
6415     flag aSign, bSign, zSign;
6416     int32 aExp, bExp, zExp;
6417     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6418     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6419     float128 z;
6420
6421     aSig1 = extractFloat128Frac1( a );
6422     aSig0 = extractFloat128Frac0( a );
6423     aExp = extractFloat128Exp( a );
6424     aSign = extractFloat128Sign( a );
6425     bSig1 = extractFloat128Frac1( b );
6426     bSig0 = extractFloat128Frac0( b );
6427     bExp = extractFloat128Exp( b );
6428     bSign = extractFloat128Sign( b );
6429     zSign = aSign ^ bSign;
6430     if ( aExp == 0x7FFF ) {
6431         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6432         if ( bExp == 0x7FFF ) {
6433             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6434             goto invalid;
6435         }
6436         return packFloat128( zSign, 0x7FFF, 0, 0 );
6437     }
6438     if ( bExp == 0x7FFF ) {
6439         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6440         return packFloat128( zSign, 0, 0, 0 );
6441     }
6442     if ( bExp == 0 ) {
6443         if ( ( bSig0 | bSig1 ) == 0 ) {
6444             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6445  invalid:
6446                 float_raise( float_flag_invalid STATUS_VAR);
6447                 z.low = float128_default_nan_low;
6448                 z.high = float128_default_nan_high;
6449                 return z;
6450             }
6451             float_raise( float_flag_divbyzero STATUS_VAR);
6452             return packFloat128( zSign, 0x7FFF, 0, 0 );
6453         }
6454         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6455     }
6456     if ( aExp == 0 ) {
6457         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6458         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6459     }
6460     zExp = aExp - bExp + 0x3FFD;
6461     shortShift128Left(
6462         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6463     shortShift128Left(
6464         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6465     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6466         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6467         ++zExp;
6468     }
6469     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6470     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6471     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6472     while ( (int64_t) rem0 < 0 ) {
6473         --zSig0;
6474         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6475     }
6476     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6477     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6478         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6479         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6480         while ( (int64_t) rem1 < 0 ) {
6481             --zSig1;
6482             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6483         }
6484         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6485     }
6486     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6487     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6488
6489 }
6490
6491 /*----------------------------------------------------------------------------
6492 | Returns the remainder of the quadruple-precision floating-point value `a'
6493 | with respect to the corresponding value `b'.  The operation is performed
6494 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6495 *----------------------------------------------------------------------------*/
6496
6497 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6498 {
6499     flag aSign, zSign;
6500     int32 aExp, bExp, expDiff;
6501     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6502     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6503     int64_t sigMean0;
6504     float128 z;
6505
6506     aSig1 = extractFloat128Frac1( a );
6507     aSig0 = extractFloat128Frac0( a );
6508     aExp = extractFloat128Exp( a );
6509     aSign = extractFloat128Sign( a );
6510     bSig1 = extractFloat128Frac1( b );
6511     bSig0 = extractFloat128Frac0( b );
6512     bExp = extractFloat128Exp( b );
6513     if ( aExp == 0x7FFF ) {
6514         if (    ( aSig0 | aSig1 )
6515              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6516             return propagateFloat128NaN( a, b STATUS_VAR );
6517         }
6518         goto invalid;
6519     }
6520     if ( bExp == 0x7FFF ) {
6521         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6522         return a;
6523     }
6524     if ( bExp == 0 ) {
6525         if ( ( bSig0 | bSig1 ) == 0 ) {
6526  invalid:
6527             float_raise( float_flag_invalid STATUS_VAR);
6528             z.low = float128_default_nan_low;
6529             z.high = float128_default_nan_high;
6530             return z;
6531         }
6532         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6533     }
6534     if ( aExp == 0 ) {
6535         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6536         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6537     }
6538     expDiff = aExp - bExp;
6539     if ( expDiff < -1 ) return a;
6540     shortShift128Left(
6541         aSig0 | LIT64( 0x0001000000000000 ),
6542         aSig1,
6543         15 - ( expDiff < 0 ),
6544         &aSig0,
6545         &aSig1
6546     );
6547     shortShift128Left(
6548         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6549     q = le128( bSig0, bSig1, aSig0, aSig1 );
6550     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6551     expDiff -= 64;
6552     while ( 0 < expDiff ) {
6553         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6554         q = ( 4 < q ) ? q - 4 : 0;
6555         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6556         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6557         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6558         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6559         expDiff -= 61;
6560     }
6561     if ( -64 < expDiff ) {
6562         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6563         q = ( 4 < q ) ? q - 4 : 0;
6564         q >>= - expDiff;
6565         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6566         expDiff += 52;
6567         if ( expDiff < 0 ) {
6568             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6569         }
6570         else {
6571             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6572         }
6573         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6574         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6575     }
6576     else {
6577         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6578         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6579     }
6580     do {
6581         alternateASig0 = aSig0;
6582         alternateASig1 = aSig1;
6583         ++q;
6584         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6585     } while ( 0 <= (int64_t) aSig0 );
6586     add128(
6587         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6588     if (    ( sigMean0 < 0 )
6589          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6590         aSig0 = alternateASig0;
6591         aSig1 = alternateASig1;
6592     }
6593     zSign = ( (int64_t) aSig0 < 0 );
6594     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6595     return
6596         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6597
6598 }
6599
6600 /*----------------------------------------------------------------------------
6601 | Returns the square root of the quadruple-precision floating-point value `a'.
6602 | The operation is performed according to the IEC/IEEE Standard for Binary
6603 | Floating-Point Arithmetic.
6604 *----------------------------------------------------------------------------*/
6605
6606 float128 float128_sqrt( float128 a STATUS_PARAM )
6607 {
6608     flag aSign;
6609     int32 aExp, zExp;
6610     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6611     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6612     float128 z;
6613
6614     aSig1 = extractFloat128Frac1( a );
6615     aSig0 = extractFloat128Frac0( a );
6616     aExp = extractFloat128Exp( a );
6617     aSign = extractFloat128Sign( a );
6618     if ( aExp == 0x7FFF ) {
6619         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6620         if ( ! aSign ) return a;
6621         goto invalid;
6622     }
6623     if ( aSign ) {
6624         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6625  invalid:
6626         float_raise( float_flag_invalid STATUS_VAR);
6627         z.low = float128_default_nan_low;
6628         z.high = float128_default_nan_high;
6629         return z;
6630     }
6631     if ( aExp == 0 ) {
6632         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6633         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6634     }
6635     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6636     aSig0 |= LIT64( 0x0001000000000000 );
6637     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6638     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6639     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6640     doubleZSig0 = zSig0<<1;
6641     mul64To128( zSig0, zSig0, &term0, &term1 );
6642     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6643     while ( (int64_t) rem0 < 0 ) {
6644         --zSig0;
6645         doubleZSig0 -= 2;
6646         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6647     }
6648     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6649     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6650         if ( zSig1 == 0 ) zSig1 = 1;
6651         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6652         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6653         mul64To128( zSig1, zSig1, &term2, &term3 );
6654         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6655         while ( (int64_t) rem1 < 0 ) {
6656             --zSig1;
6657             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6658             term3 |= 1;
6659             term2 |= doubleZSig0;
6660             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6661         }
6662         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6663     }
6664     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6665     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6666
6667 }
6668
6669 /*----------------------------------------------------------------------------
6670 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6671 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6672 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6673 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6674 *----------------------------------------------------------------------------*/
6675
6676 int float128_eq( float128 a, float128 b STATUS_PARAM )
6677 {
6678
6679     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6680               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6681          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6682               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6683        ) {
6684         float_raise( float_flag_invalid STATUS_VAR);
6685         return 0;
6686     }
6687     return
6688            ( a.low == b.low )
6689         && (    ( a.high == b.high )
6690              || (    ( a.low == 0 )
6691                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6692            );
6693
6694 }
6695
6696 /*----------------------------------------------------------------------------
6697 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6698 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6699 | exception is raised if either operand is a NaN.  The comparison is performed
6700 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6701 *----------------------------------------------------------------------------*/
6702
6703 int float128_le( float128 a, float128 b STATUS_PARAM )
6704 {
6705     flag aSign, bSign;
6706
6707     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6708               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6709          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6710               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6711        ) {
6712         float_raise( float_flag_invalid STATUS_VAR);
6713         return 0;
6714     }
6715     aSign = extractFloat128Sign( a );
6716     bSign = extractFloat128Sign( b );
6717     if ( aSign != bSign ) {
6718         return
6719                aSign
6720             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6721                  == 0 );
6722     }
6723     return
6724           aSign ? le128( b.high, b.low, a.high, a.low )
6725         : le128( a.high, a.low, b.high, b.low );
6726
6727 }
6728
6729 /*----------------------------------------------------------------------------
6730 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6731 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6732 | raised if either operand is a NaN.  The comparison is performed according
6733 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6734 *----------------------------------------------------------------------------*/
6735
6736 int float128_lt( float128 a, float128 b STATUS_PARAM )
6737 {
6738     flag aSign, bSign;
6739
6740     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6741               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6742          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6743               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6744        ) {
6745         float_raise( float_flag_invalid STATUS_VAR);
6746         return 0;
6747     }
6748     aSign = extractFloat128Sign( a );
6749     bSign = extractFloat128Sign( b );
6750     if ( aSign != bSign ) {
6751         return
6752                aSign
6753             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6754                  != 0 );
6755     }
6756     return
6757           aSign ? lt128( b.high, b.low, a.high, a.low )
6758         : lt128( a.high, a.low, b.high, b.low );
6759
6760 }
6761
6762 /*----------------------------------------------------------------------------
6763 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6764 | be compared, and 0 otherwise.  The invalid exception is raised if either
6765 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6766 | Standard for Binary Floating-Point Arithmetic.
6767 *----------------------------------------------------------------------------*/
6768
6769 int float128_unordered( float128 a, float128 b STATUS_PARAM )
6770 {
6771     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6772               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6773          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6774               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6775        ) {
6776         float_raise( float_flag_invalid STATUS_VAR);
6777         return 1;
6778     }
6779     return 0;
6780 }
6781
6782 /*----------------------------------------------------------------------------
6783 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6784 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6785 | exception.  The comparison is performed according to the IEC/IEEE Standard
6786 | for Binary Floating-Point Arithmetic.
6787 *----------------------------------------------------------------------------*/
6788
6789 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6790 {
6791
6792     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6793               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6794          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6795               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6796        ) {
6797         if (    float128_is_signaling_nan( a )
6798              || float128_is_signaling_nan( b ) ) {
6799             float_raise( float_flag_invalid STATUS_VAR);
6800         }
6801         return 0;
6802     }
6803     return
6804            ( a.low == b.low )
6805         && (    ( a.high == b.high )
6806              || (    ( a.low == 0 )
6807                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6808            );
6809
6810 }
6811
6812 /*----------------------------------------------------------------------------
6813 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6814 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6815 | cause an exception.  Otherwise, the comparison is performed according to the
6816 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6817 *----------------------------------------------------------------------------*/
6818
6819 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6820 {
6821     flag aSign, bSign;
6822
6823     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6824               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6825          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6826               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6827        ) {
6828         if (    float128_is_signaling_nan( a )
6829              || float128_is_signaling_nan( b ) ) {
6830             float_raise( float_flag_invalid STATUS_VAR);
6831         }
6832         return 0;
6833     }
6834     aSign = extractFloat128Sign( a );
6835     bSign = extractFloat128Sign( b );
6836     if ( aSign != bSign ) {
6837         return
6838                aSign
6839             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6840                  == 0 );
6841     }
6842     return
6843           aSign ? le128( b.high, b.low, a.high, a.low )
6844         : le128( a.high, a.low, b.high, b.low );
6845
6846 }
6847
6848 /*----------------------------------------------------------------------------
6849 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6850 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6851 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6852 | Standard for Binary Floating-Point Arithmetic.
6853 *----------------------------------------------------------------------------*/
6854
6855 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6856 {
6857     flag aSign, bSign;
6858
6859     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6860               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6861          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6862               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6863        ) {
6864         if (    float128_is_signaling_nan( a )
6865              || float128_is_signaling_nan( b ) ) {
6866             float_raise( float_flag_invalid STATUS_VAR);
6867         }
6868         return 0;
6869     }
6870     aSign = extractFloat128Sign( a );
6871     bSign = extractFloat128Sign( b );
6872     if ( aSign != bSign ) {
6873         return
6874                aSign
6875             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6876                  != 0 );
6877     }
6878     return
6879           aSign ? lt128( b.high, b.low, a.high, a.low )
6880         : lt128( a.high, a.low, b.high, b.low );
6881
6882 }
6883
6884 /*----------------------------------------------------------------------------
6885 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6886 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6887 | comparison is performed according to the IEC/IEEE Standard for Binary
6888 | Floating-Point Arithmetic.
6889 *----------------------------------------------------------------------------*/
6890
6891 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6892 {
6893     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6894               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6895          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6896               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6897        ) {
6898         if (    float128_is_signaling_nan( a )
6899              || float128_is_signaling_nan( b ) ) {
6900             float_raise( float_flag_invalid STATUS_VAR);
6901         }
6902         return 1;
6903     }
6904     return 0;
6905 }
6906
6907 /* misc functions */
6908 float32 uint32_to_float32(uint32_t a STATUS_PARAM)
6909 {
6910     return int64_to_float32(a STATUS_VAR);
6911 }
6912
6913 float64 uint32_to_float64(uint32_t a STATUS_PARAM)
6914 {
6915     return int64_to_float64(a STATUS_VAR);
6916 }
6917
6918 uint32 float32_to_uint32( float32 a STATUS_PARAM )
6919 {
6920     int64_t v;
6921     uint32 res;
6922     int old_exc_flags = get_float_exception_flags(status);
6923
6924     v = float32_to_int64(a STATUS_VAR);
6925     if (v < 0) {
6926         res = 0;
6927     } else if (v > 0xffffffff) {
6928         res = 0xffffffff;
6929     } else {
6930         return v;
6931     }
6932     set_float_exception_flags(old_exc_flags, status);
6933     float_raise(float_flag_invalid STATUS_VAR);
6934     return res;
6935 }
6936
6937 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6938 {
6939     int64_t v;
6940     uint32 res;
6941     int old_exc_flags = get_float_exception_flags(status);
6942
6943     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6944     if (v < 0) {
6945         res = 0;
6946     } else if (v > 0xffffffff) {
6947         res = 0xffffffff;
6948     } else {
6949         return v;
6950     }
6951     set_float_exception_flags(old_exc_flags, status);
6952     float_raise(float_flag_invalid STATUS_VAR);
6953     return res;
6954 }
6955
6956 int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6957 {
6958     int32_t v;
6959     int_fast16_t res;
6960     int old_exc_flags = get_float_exception_flags(status);
6961
6962     v = float32_to_int32(a STATUS_VAR);
6963     if (v < -0x8000) {
6964         res = -0x8000;
6965     } else if (v > 0x7fff) {
6966         res = 0x7fff;
6967     } else {
6968         return v;
6969     }
6970
6971     set_float_exception_flags(old_exc_flags, status);
6972     float_raise(float_flag_invalid STATUS_VAR);
6973     return res;
6974 }
6975
6976 uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6977 {
6978     int32_t v;
6979     uint_fast16_t res;
6980     int old_exc_flags = get_float_exception_flags(status);
6981
6982     v = float32_to_int32(a STATUS_VAR);
6983     if (v < 0) {
6984         res = 0;
6985     } else if (v > 0xffff) {
6986         res = 0xffff;
6987     } else {
6988         return v;
6989     }
6990
6991     set_float_exception_flags(old_exc_flags, status);
6992     float_raise(float_flag_invalid STATUS_VAR);
6993     return res;
6994 }
6995
6996 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6997 {
6998     int64_t v;
6999     uint_fast16_t res;
7000     int old_exc_flags = get_float_exception_flags(status);
7001
7002     v = float32_to_int64_round_to_zero(a STATUS_VAR);
7003     if (v < 0) {
7004         res = 0;
7005     } else if (v > 0xffff) {
7006         res = 0xffff;
7007     } else {
7008         return v;
7009     }
7010     set_float_exception_flags(old_exc_flags, status);
7011     float_raise(float_flag_invalid STATUS_VAR);
7012     return res;
7013 }
7014
7015 uint32 float64_to_uint32( float64 a STATUS_PARAM )
7016 {
7017     uint64_t v;
7018     uint32 res;
7019     int old_exc_flags = get_float_exception_flags(status);
7020
7021     v = float64_to_uint64(a STATUS_VAR);
7022     if (v > 0xffffffff) {
7023         res = 0xffffffff;
7024     } else {
7025         return v;
7026     }
7027     set_float_exception_flags(old_exc_flags, status);
7028     float_raise(float_flag_invalid STATUS_VAR);
7029     return res;
7030 }
7031
7032 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
7033 {
7034     uint64_t v;
7035     uint32 res;
7036     int old_exc_flags = get_float_exception_flags(status);
7037
7038     v = float64_to_uint64_round_to_zero(a STATUS_VAR);
7039     if (v > 0xffffffff) {
7040         res = 0xffffffff;
7041     } else {
7042         return v;
7043     }
7044     set_float_exception_flags(old_exc_flags, status);
7045     float_raise(float_flag_invalid STATUS_VAR);
7046     return res;
7047 }
7048
7049 int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
7050 {
7051     int64_t v;
7052     int_fast16_t res;
7053     int old_exc_flags = get_float_exception_flags(status);
7054
7055     v = float64_to_int32(a STATUS_VAR);
7056     if (v < -0x8000) {
7057         res = -0x8000;
7058     } else if (v > 0x7fff) {
7059         res = 0x7fff;
7060     } else {
7061         return v;
7062     }
7063
7064     set_float_exception_flags(old_exc_flags, status);
7065     float_raise(float_flag_invalid STATUS_VAR);
7066     return res;
7067 }
7068
7069 uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
7070 {
7071     int64_t v;
7072     uint_fast16_t res;
7073     int old_exc_flags = get_float_exception_flags(status);
7074
7075     v = float64_to_int32(a STATUS_VAR);
7076     if (v < 0) {
7077         res = 0;
7078     } else if (v > 0xffff) {
7079         res = 0xffff;
7080     } else {
7081         return v;
7082     }
7083
7084     set_float_exception_flags(old_exc_flags, status);
7085     float_raise(float_flag_invalid STATUS_VAR);
7086     return res;
7087 }
7088
7089 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
7090 {
7091     int64_t v;
7092     uint_fast16_t res;
7093     int old_exc_flags = get_float_exception_flags(status);
7094
7095     v = float64_to_int64_round_to_zero(a STATUS_VAR);
7096     if (v < 0) {
7097         res = 0;
7098     } else if (v > 0xffff) {
7099         res = 0xffff;
7100     } else {
7101         return v;
7102     }
7103     set_float_exception_flags(old_exc_flags, status);
7104     float_raise(float_flag_invalid STATUS_VAR);
7105     return res;
7106 }
7107
7108 /*----------------------------------------------------------------------------
7109 | Returns the result of converting the double-precision floating-point value
7110 | `a' to the 64-bit unsigned integer format.  The conversion is
7111 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7112 | Arithmetic---which means in particular that the conversion is rounded
7113 | according to the current rounding mode.  If `a' is a NaN, the largest
7114 | positive integer is returned.  If the conversion overflows, the
7115 | largest unsigned integer is returned.  If 'a' is negative, the value is
7116 | rounded and zero is returned; negative values that do not round to zero
7117 | will raise the inexact exception.
7118 *----------------------------------------------------------------------------*/
7119
7120 uint64_t float64_to_uint64(float64 a STATUS_PARAM)
7121 {
7122     flag aSign;
7123     int_fast16_t aExp, shiftCount;
7124     uint64_t aSig, aSigExtra;
7125     a = float64_squash_input_denormal(a STATUS_VAR);
7126
7127     aSig = extractFloat64Frac(a);
7128     aExp = extractFloat64Exp(a);
7129     aSign = extractFloat64Sign(a);
7130     if (aSign && (aExp > 1022)) {
7131         float_raise(float_flag_invalid STATUS_VAR);
7132         if (float64_is_any_nan(a)) {
7133             return LIT64(0xFFFFFFFFFFFFFFFF);
7134         } else {
7135             return 0;
7136         }
7137     }
7138     if (aExp) {
7139         aSig |= LIT64(0x0010000000000000);
7140     }
7141     shiftCount = 0x433 - aExp;
7142     if (shiftCount <= 0) {
7143         if (0x43E < aExp) {
7144             float_raise(float_flag_invalid STATUS_VAR);
7145             return LIT64(0xFFFFFFFFFFFFFFFF);
7146         }
7147         aSigExtra = 0;
7148         aSig <<= -shiftCount;
7149     } else {
7150         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7151     }
7152     return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
7153 }
7154
7155 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
7156 {
7157     signed char current_rounding_mode = STATUS(float_rounding_mode);
7158     set_float_rounding_mode(float_round_to_zero STATUS_VAR);
7159     int64_t v = float64_to_uint64(a STATUS_VAR);
7160     set_float_rounding_mode(current_rounding_mode STATUS_VAR);
7161     return v;
7162 }
7163
7164 #define COMPARE(s, nan_exp)                                                  \
7165 static inline int float ## s ## _compare_internal( float ## s a, float ## s b,      \
7166                                       int is_quiet STATUS_PARAM )            \
7167 {                                                                            \
7168     flag aSign, bSign;                                                       \
7169     uint ## s ## _t av, bv;                                                  \
7170     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
7171     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
7172                                                                              \
7173     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7174          extractFloat ## s ## Frac( a ) ) ||                                 \
7175         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7176           extractFloat ## s ## Frac( b ) )) {                                \
7177         if (!is_quiet ||                                                     \
7178             float ## s ## _is_signaling_nan( a ) ||                          \
7179             float ## s ## _is_signaling_nan( b ) ) {                         \
7180             float_raise( float_flag_invalid STATUS_VAR);                     \
7181         }                                                                    \
7182         return float_relation_unordered;                                     \
7183     }                                                                        \
7184     aSign = extractFloat ## s ## Sign( a );                                  \
7185     bSign = extractFloat ## s ## Sign( b );                                  \
7186     av = float ## s ## _val(a);                                              \
7187     bv = float ## s ## _val(b);                                              \
7188     if ( aSign != bSign ) {                                                  \
7189         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7190             /* zero case */                                                  \
7191             return float_relation_equal;                                     \
7192         } else {                                                             \
7193             return 1 - (2 * aSign);                                          \
7194         }                                                                    \
7195     } else {                                                                 \
7196         if (av == bv) {                                                      \
7197             return float_relation_equal;                                     \
7198         } else {                                                             \
7199             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7200         }                                                                    \
7201     }                                                                        \
7202 }                                                                            \
7203                                                                              \
7204 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
7205 {                                                                            \
7206     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
7207 }                                                                            \
7208                                                                              \
7209 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
7210 {                                                                            \
7211     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
7212 }
7213
7214 COMPARE(32, 0xff)
7215 COMPARE(64, 0x7ff)
7216
7217 static inline int floatx80_compare_internal( floatx80 a, floatx80 b,
7218                                       int is_quiet STATUS_PARAM )
7219 {
7220     flag aSign, bSign;
7221
7222     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7223           ( extractFloatx80Frac( a )<<1 ) ) ||
7224         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7225           ( extractFloatx80Frac( b )<<1 ) )) {
7226         if (!is_quiet ||
7227             floatx80_is_signaling_nan( a ) ||
7228             floatx80_is_signaling_nan( b ) ) {
7229             float_raise( float_flag_invalid STATUS_VAR);
7230         }
7231         return float_relation_unordered;
7232     }
7233     aSign = extractFloatx80Sign( a );
7234     bSign = extractFloatx80Sign( b );
7235     if ( aSign != bSign ) {
7236
7237         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7238              ( ( a.low | b.low ) == 0 ) ) {
7239             /* zero case */
7240             return float_relation_equal;
7241         } else {
7242             return 1 - (2 * aSign);
7243         }
7244     } else {
7245         if (a.low == b.low && a.high == b.high) {
7246             return float_relation_equal;
7247         } else {
7248             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7249         }
7250     }
7251 }
7252
7253 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
7254 {
7255     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
7256 }
7257
7258 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
7259 {
7260     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
7261 }
7262
7263 static inline int float128_compare_internal( float128 a, float128 b,
7264                                       int is_quiet STATUS_PARAM )
7265 {
7266     flag aSign, bSign;
7267
7268     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7269           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7270         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7271           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7272         if (!is_quiet ||
7273             float128_is_signaling_nan( a ) ||
7274             float128_is_signaling_nan( b ) ) {
7275             float_raise( float_flag_invalid STATUS_VAR);
7276         }
7277         return float_relation_unordered;
7278     }
7279     aSign = extractFloat128Sign( a );
7280     bSign = extractFloat128Sign( b );
7281     if ( aSign != bSign ) {
7282         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7283             /* zero case */
7284             return float_relation_equal;
7285         } else {
7286             return 1 - (2 * aSign);
7287         }
7288     } else {
7289         if (a.low == b.low && a.high == b.high) {
7290             return float_relation_equal;
7291         } else {
7292             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7293         }
7294     }
7295 }
7296
7297 int float128_compare( float128 a, float128 b STATUS_PARAM )
7298 {
7299     return float128_compare_internal(a, b, 0 STATUS_VAR);
7300 }
7301
7302 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
7303 {
7304     return float128_compare_internal(a, b, 1 STATUS_VAR);
7305 }
7306
7307 /* min() and max() functions. These can't be implemented as
7308  * 'compare and pick one input' because that would mishandle
7309  * NaNs and +0 vs -0.
7310  *
7311  * minnum() and maxnum() functions. These are similar to the min()
7312  * and max() functions but if one of the arguments is a QNaN and
7313  * the other is numerical then the numerical argument is returned.
7314  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7315  * and maxNum() operations. min() and max() are the typical min/max
7316  * semantics provided by many CPUs which predate that specification.
7317  *
7318  * minnummag() and maxnummag() functions correspond to minNumMag()
7319  * and minNumMag() from the IEEE-754 2008.
7320  */
7321 #define MINMAX(s)                                                       \
7322 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7323                                                int ismin, int isieee,   \
7324                                                int ismag STATUS_PARAM)  \
7325 {                                                                       \
7326     flag aSign, bSign;                                                  \
7327     uint ## s ## _t av, bv, aav, abv;                                   \
7328     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
7329     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
7330     if (float ## s ## _is_any_nan(a) ||                                 \
7331         float ## s ## _is_any_nan(b)) {                                 \
7332         if (isieee) {                                                   \
7333             if (float ## s ## _is_quiet_nan(a) &&                       \
7334                 !float ## s ##_is_any_nan(b)) {                         \
7335                 return b;                                               \
7336             } else if (float ## s ## _is_quiet_nan(b) &&                \
7337                        !float ## s ## _is_any_nan(a)) {                 \
7338                 return a;                                               \
7339             }                                                           \
7340         }                                                               \
7341         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
7342     }                                                                   \
7343     aSign = extractFloat ## s ## Sign(a);                               \
7344     bSign = extractFloat ## s ## Sign(b);                               \
7345     av = float ## s ## _val(a);                                         \
7346     bv = float ## s ## _val(b);                                         \
7347     if (ismag) {                                                        \
7348         aav = float ## s ## _abs(av);                                   \
7349         abv = float ## s ## _abs(bv);                                   \
7350         if (aav != abv) {                                               \
7351             if (ismin) {                                                \
7352                 return (aav < abv) ? a : b;                             \
7353             } else {                                                    \
7354                 return (aav < abv) ? b : a;                             \
7355             }                                                           \
7356         }                                                               \
7357     }                                                                   \
7358     if (aSign != bSign) {                                               \
7359         if (ismin) {                                                    \
7360             return aSign ? a : b;                                       \
7361         } else {                                                        \
7362             return aSign ? b : a;                                       \
7363         }                                                               \
7364     } else {                                                            \
7365         if (ismin) {                                                    \
7366             return (aSign ^ (av < bv)) ? a : b;                         \
7367         } else {                                                        \
7368             return (aSign ^ (av < bv)) ? b : a;                         \
7369         }                                                               \
7370     }                                                                   \
7371 }                                                                       \
7372                                                                         \
7373 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
7374 {                                                                       \
7375     return float ## s ## _minmax(a, b, 1, 0, 0 STATUS_VAR);             \
7376 }                                                                       \
7377                                                                         \
7378 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
7379 {                                                                       \
7380     return float ## s ## _minmax(a, b, 0, 0, 0 STATUS_VAR);             \
7381 }                                                                       \
7382                                                                         \
7383 float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7384 {                                                                       \
7385     return float ## s ## _minmax(a, b, 1, 1, 0 STATUS_VAR);             \
7386 }                                                                       \
7387                                                                         \
7388 float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7389 {                                                                       \
7390     return float ## s ## _minmax(a, b, 0, 1, 0 STATUS_VAR);             \
7391 }                                                                       \
7392                                                                         \
7393 float ## s float ## s ## _minnummag(float ## s a, float ## s b STATUS_PARAM) \
7394 {                                                                       \
7395     return float ## s ## _minmax(a, b, 1, 1, 1 STATUS_VAR);             \
7396 }                                                                       \
7397                                                                         \
7398 float ## s float ## s ## _maxnummag(float ## s a, float ## s b STATUS_PARAM) \
7399 {                                                                       \
7400     return float ## s ## _minmax(a, b, 0, 1, 1 STATUS_VAR);             \
7401 }
7402
7403 MINMAX(32)
7404 MINMAX(64)
7405
7406
7407 /* Multiply A by 2 raised to the power N.  */
7408 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7409 {
7410     flag aSign;
7411     int16_t aExp;
7412     uint32_t aSig;
7413
7414     a = float32_squash_input_denormal(a STATUS_VAR);
7415     aSig = extractFloat32Frac( a );
7416     aExp = extractFloat32Exp( a );
7417     aSign = extractFloat32Sign( a );
7418
7419     if ( aExp == 0xFF ) {
7420         if ( aSig ) {
7421             return propagateFloat32NaN( a, a STATUS_VAR );
7422         }
7423         return a;
7424     }
7425     if (aExp != 0) {
7426         aSig |= 0x00800000;
7427     } else if (aSig == 0) {
7428         return a;
7429     } else {
7430         aExp++;
7431     }
7432
7433     if (n > 0x200) {
7434         n = 0x200;
7435     } else if (n < -0x200) {
7436         n = -0x200;
7437     }
7438
7439     aExp += n - 1;
7440     aSig <<= 7;
7441     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7442 }
7443
7444 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7445 {
7446     flag aSign;
7447     int16_t aExp;
7448     uint64_t aSig;
7449
7450     a = float64_squash_input_denormal(a STATUS_VAR);
7451     aSig = extractFloat64Frac( a );
7452     aExp = extractFloat64Exp( a );
7453     aSign = extractFloat64Sign( a );
7454
7455     if ( aExp == 0x7FF ) {
7456         if ( aSig ) {
7457             return propagateFloat64NaN( a, a STATUS_VAR );
7458         }
7459         return a;
7460     }
7461     if (aExp != 0) {
7462         aSig |= LIT64( 0x0010000000000000 );
7463     } else if (aSig == 0) {
7464         return a;
7465     } else {
7466         aExp++;
7467     }
7468
7469     if (n > 0x1000) {
7470         n = 0x1000;
7471     } else if (n < -0x1000) {
7472         n = -0x1000;
7473     }
7474
7475     aExp += n - 1;
7476     aSig <<= 10;
7477     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7478 }
7479
7480 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7481 {
7482     flag aSign;
7483     int32_t aExp;
7484     uint64_t aSig;
7485
7486     aSig = extractFloatx80Frac( a );
7487     aExp = extractFloatx80Exp( a );
7488     aSign = extractFloatx80Sign( a );
7489
7490     if ( aExp == 0x7FFF ) {
7491         if ( aSig<<1 ) {
7492             return propagateFloatx80NaN( a, a STATUS_VAR );
7493         }
7494         return a;
7495     }
7496
7497     if (aExp == 0) {
7498         if (aSig == 0) {
7499             return a;
7500         }
7501         aExp++;
7502     }
7503
7504     if (n > 0x10000) {
7505         n = 0x10000;
7506     } else if (n < -0x10000) {
7507         n = -0x10000;
7508     }
7509
7510     aExp += n;
7511     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7512                                           aSign, aExp, aSig, 0 STATUS_VAR );
7513 }
7514
7515 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7516 {
7517     flag aSign;
7518     int32_t aExp;
7519     uint64_t aSig0, aSig1;
7520
7521     aSig1 = extractFloat128Frac1( a );
7522     aSig0 = extractFloat128Frac0( a );
7523     aExp = extractFloat128Exp( a );
7524     aSign = extractFloat128Sign( a );
7525     if ( aExp == 0x7FFF ) {
7526         if ( aSig0 | aSig1 ) {
7527             return propagateFloat128NaN( a, a STATUS_VAR );
7528         }
7529         return a;
7530     }
7531     if (aExp != 0) {
7532         aSig0 |= LIT64( 0x0001000000000000 );
7533     } else if (aSig0 == 0 && aSig1 == 0) {
7534         return a;
7535     } else {
7536         aExp++;
7537     }
7538
7539     if (n > 0x10000) {
7540         n = 0x10000;
7541     } else if (n < -0x10000) {
7542         n = -0x10000;
7543     }
7544
7545     aExp += n - 1;
7546     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7547                                           STATUS_VAR );
7548
7549 }