fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include "qemu/bitops.h"
  87 #include "fpu/softfloat.h"
  88
  89 /* We only need stdlib for abort() */
  90
  91 /*----------------------------------------------------------------------------
  92 | Primitive arithmetic functions, including multi-word arithmetic, and
  93 | division and square root approximations.  (Can be specialized to target if
  94 | desired.)
  95 *----------------------------------------------------------------------------*/
  96 #include "fpu/softfloat-macros.h"
  97
  98 /*----------------------------------------------------------------------------
  99 | Functions and definitions to determine:  (1) whether tininess for underflow
 100 | is detected before or after rounding by default, (2) what (if anything)
 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 103 | are propagated from function inputs to output.  These details are target-
 104 | specific.
 105 *----------------------------------------------------------------------------*/
 106 #include "softfloat-specialize.h"
 107
 108 /*----------------------------------------------------------------------------
 109 | Returns the fraction bits of the half-precision floating-point value `a'.
 110 *----------------------------------------------------------------------------*/
 111
 112 static inline uint32_t extractFloat16Frac(float16 a)
 113 {
 114     return float16_val(a) & 0x3ff;
 115 }
 116
 117 /*----------------------------------------------------------------------------
 118 | Returns the exponent bits of the half-precision floating-point value `a'.
 119 *----------------------------------------------------------------------------*/
 120
 121 static inline int extractFloat16Exp(float16 a)
 122 {
 123     return (float16_val(a) >> 10) & 0x1f;
 124 }
 125
 126 /*----------------------------------------------------------------------------
 127 | Returns the sign bit of the single-precision floating-point value `a'.
 128 *----------------------------------------------------------------------------*/
 129
 130 static inline flag extractFloat16Sign(float16 a)
 131 {
 132     return float16_val(a)>>15;
 133 }
 134
 135 /*----------------------------------------------------------------------------
 136 | Returns the fraction bits of the single-precision floating-point value `a'.
 137 *----------------------------------------------------------------------------*/
 138
 139 static inline uint32_t extractFloat32Frac(float32 a)
 140 {
 141     return float32_val(a) & 0x007FFFFF;
 142 }
 143
 144 /*----------------------------------------------------------------------------
 145 | Returns the exponent bits of the single-precision floating-point value `a'.
 146 *----------------------------------------------------------------------------*/
 147
 148 static inline int extractFloat32Exp(float32 a)
 149 {
 150     return (float32_val(a) >> 23) & 0xFF;
 151 }
 152
 153 /*----------------------------------------------------------------------------
 154 | Returns the sign bit of the single-precision floating-point value `a'.
 155 *----------------------------------------------------------------------------*/
 156
 157 static inline flag extractFloat32Sign(float32 a)
 158 {
 159     return float32_val(a) >> 31;
 160 }
 161
 162 /*----------------------------------------------------------------------------
 163 | Returns the fraction bits of the double-precision floating-point value `a'.
 164 *----------------------------------------------------------------------------*/
 165
 166 static inline uint64_t extractFloat64Frac(float64 a)
 167 {
 168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 169 }
 170
 171 /*----------------------------------------------------------------------------
 172 | Returns the exponent bits of the double-precision floating-point value `a'.
 173 *----------------------------------------------------------------------------*/
 174
 175 static inline int extractFloat64Exp(float64 a)
 176 {
 177     return (float64_val(a) >> 52) & 0x7FF;
 178 }
 179
 180 /*----------------------------------------------------------------------------
 181 | Returns the sign bit of the double-precision floating-point value `a'.
 182 *----------------------------------------------------------------------------*/
 183
 184 static inline flag extractFloat64Sign(float64 a)
 185 {
 186     return float64_val(a) >> 63;
 187 }
 188
 189 /*
 190  * Classify a floating point number. Everything above float_class_qnan
 191  * is a NaN so cls >= float_class_qnan is any NaN.
 192  */
 193
 194 typedef enum __attribute__ ((__packed__)) {
 195     float_class_unclassified,
 196     float_class_zero,
 197     float_class_normal,
 198     float_class_inf,
 199     float_class_qnan,  /* all NaNs from here */
 200     float_class_snan,
 201     float_class_dnan,
 202     float_class_msnan, /* maybe silenced */
 203 } FloatClass;
 204
 205 /*
 206  * Structure holding all of the decomposed parts of a float. The
 207  * exponent is unbiased and the fraction is normalized. All
 208  * calculations are done with a 64 bit fraction and then rounded as
 209  * appropriate for the final format.
 210  *
 211  * Thanks to the packed FloatClass a decent compiler should be able to
 212  * fit the whole structure into registers and avoid using the stack
 213  * for parameter passing.
 214  */
 215
 216 typedef struct {
 217     uint64_t frac;
 218     int32_t  exp;
 219     FloatClass cls;
 220     bool sign;
 221 } FloatParts;
 222
 223 #define DECOMPOSED_BINARY_POINT    (64 - 2)
 224 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 225 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 226
 227 /* Structure holding all of the relevant parameters for a format.
 228  *   exp_size: the size of the exponent field
 229  *   exp_bias: the offset applied to the exponent field
 230  *   exp_max: the maximum normalised exponent
 231  *   frac_size: the size of the fraction field
 232  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 233  * The following are computed based the size of fraction
 234  *   frac_lsb: least significant bit of fraction
 235  *   fram_lsbm1: the bit bellow the least significant bit (for rounding)
 236  *   round_mask/roundeven_mask: masks used for rounding
 237  */
 238 typedef struct {
 239     int exp_size;
 240     int exp_bias;
 241     int exp_max;
 242     int frac_size;
 243     int frac_shift;
 244     uint64_t frac_lsb;
 245     uint64_t frac_lsbm1;
 246     uint64_t round_mask;
 247     uint64_t roundeven_mask;
 248 } FloatFmt;
 249
 250 /* Expand fields based on the size of exponent and fraction */
 251 #define FLOAT_PARAMS(E, F)                                           \
 252     .exp_size       = E,                                             \
 253     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 254     .exp_max        = (1 << E) - 1,                                  \
 255     .frac_size      = F,                                             \
 256     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 257     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 258     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 259     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 260     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 261
 262 static const FloatFmt float16_params = {
 263     FLOAT_PARAMS(5, 10)
 264 };
 265
 266 static const FloatFmt float32_params = {
 267     FLOAT_PARAMS(8, 23)
 268 };
 269
 270 static const FloatFmt float64_params = {
 271     FLOAT_PARAMS(11, 52)
 272 };
 273
 274 /* Unpack a float to parts, but do not canonicalize.  */
 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 276 {
 277     const int sign_pos = fmt.frac_size + fmt.exp_size;
 278
 279     return (FloatParts) {
 280         .cls = float_class_unclassified,
 281         .sign = extract64(raw, sign_pos, 1),
 282         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 283         .frac = extract64(raw, 0, fmt.frac_size),
 284     };
 285 }
 286
 287 static inline FloatParts float16_unpack_raw(float16 f)
 288 {
 289     return unpack_raw(float16_params, f);
 290 }
 291
 292 static inline FloatParts float32_unpack_raw(float32 f)
 293 {
 294     return unpack_raw(float32_params, f);
 295 }
 296
 297 static inline FloatParts float64_unpack_raw(float64 f)
 298 {
 299     return unpack_raw(float64_params, f);
 300 }
 301
 302 /* Pack a float from parts, but do not canonicalize.  */
 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 304 {
 305     const int sign_pos = fmt.frac_size + fmt.exp_size;
 306     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 307     return deposit64(ret, sign_pos, 1, p.sign);
 308 }
 309
 310 static inline float16 float16_pack_raw(FloatParts p)
 311 {
 312     return make_float16(pack_raw(float16_params, p));
 313 }
 314
 315 static inline float32 float32_pack_raw(FloatParts p)
 316 {
 317     return make_float32(pack_raw(float32_params, p));
 318 }
 319
 320 static inline float64 float64_pack_raw(FloatParts p)
 321 {
 322     return make_float64(pack_raw(float64_params, p));
 323 }
 324
 325 /* Canonicalize EXP and FRAC, setting CLS.  */
 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
 327                                float_status *status)
 328 {
 329     if (part.exp == parm->exp_max) {
 330         if (part.frac == 0) {
 331             part.cls = float_class_inf;
 332         } else {
 333 #ifdef NO_SIGNALING_NANS
 334             part.cls = float_class_qnan;
 335 #else
 336             int64_t msb = part.frac << (parm->frac_shift + 2);
 337             if ((msb < 0) == status->snan_bit_is_one) {
 338                 part.cls = float_class_snan;
 339             } else {
 340                 part.cls = float_class_qnan;
 341             }
 342 #endif
 343         }
 344     } else if (part.exp == 0) {
 345         if (likely(part.frac == 0)) {
 346             part.cls = float_class_zero;
 347         } else if (status->flush_inputs_to_zero) {
 348             float_raise(float_flag_input_denormal, status);
 349             part.cls = float_class_zero;
 350             part.frac = 0;
 351         } else {
 352             int shift = clz64(part.frac) - 1;
 353             part.cls = float_class_normal;
 354             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 355             part.frac <<= shift;
 356         }
 357     } else {
 358         part.cls = float_class_normal;
 359         part.exp -= parm->exp_bias;
 360         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 361     }
 362     return part;
 363 }
 364
 365 /* Round and uncanonicalize a floating-point number by parts. There
 366  * are FRAC_SHIFT bits that may require rounding at the bottom of the
 367  * fraction; these bits will be removed. The exponent will be biased
 368  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 369  */
 370
 371 static FloatParts round_canonical(FloatParts p, float_status *s,
 372                                   const FloatFmt *parm)
 373 {
 374     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 375     const uint64_t round_mask = parm->round_mask;
 376     const uint64_t roundeven_mask = parm->roundeven_mask;
 377     const int exp_max = parm->exp_max;
 378     const int frac_shift = parm->frac_shift;
 379     uint64_t frac, inc;
 380     int exp, flags = 0;
 381     bool overflow_norm;
 382
 383     frac = p.frac;
 384     exp = p.exp;
 385
 386     switch (p.cls) {
 387     case float_class_normal:
 388         switch (s->float_rounding_mode) {
 389         case float_round_nearest_even:
 390             overflow_norm = false;
 391             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 392             break;
 393         case float_round_ties_away:
 394             overflow_norm = false;
 395             inc = frac_lsbm1;
 396             break;
 397         case float_round_to_zero:
 398             overflow_norm = true;
 399             inc = 0;
 400             break;
 401         case float_round_up:
 402             inc = p.sign ? 0 : round_mask;
 403             overflow_norm = p.sign;
 404             break;
 405         case float_round_down:
 406             inc = p.sign ? round_mask : 0;
 407             overflow_norm = !p.sign;
 408             break;
 409         default:
 410             g_assert_not_reached();
 411         }
 412
 413         exp += parm->exp_bias;
 414         if (likely(exp > 0)) {
 415             if (frac & round_mask) {
 416                 flags |= float_flag_inexact;
 417                 frac += inc;
 418                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
 419                     frac >>= 1;
 420                     exp++;
 421                 }
 422             }
 423             frac >>= frac_shift;
 424
 425             if (unlikely(exp >= exp_max)) {
 426                 flags |= float_flag_overflow | float_flag_inexact;
 427                 if (overflow_norm) {
 428                     exp = exp_max - 1;
 429                     frac = -1;
 430                 } else {
 431                     p.cls = float_class_inf;
 432                     goto do_inf;
 433                 }
 434             }
 435         } else if (s->flush_to_zero) {
 436             flags |= float_flag_output_denormal;
 437             p.cls = float_class_zero;
 438             goto do_zero;
 439         } else {
 440             bool is_tiny = (s->float_detect_tininess
 441                             == float_tininess_before_rounding)
 442                         || (exp < 0)
 443                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 444
 445             shift64RightJamming(frac, 1 - exp, &frac);
 446             if (frac & round_mask) {
 447                 /* Need to recompute round-to-even.  */
 448                 if (s->float_rounding_mode == float_round_nearest_even) {
 449                     inc = ((frac & roundeven_mask) != frac_lsbm1
 450                            ? frac_lsbm1 : 0);
 451                 }
 452                 flags |= float_flag_inexact;
 453                 frac += inc;
 454             }
 455
 456             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 457             frac >>= frac_shift;
 458
 459             if (is_tiny && (flags & float_flag_inexact)) {
 460                 flags |= float_flag_underflow;
 461             }
 462             if (exp == 0 && frac == 0) {
 463                 p.cls = float_class_zero;
 464             }
 465         }
 466         break;
 467
 468     case float_class_zero:
 469     do_zero:
 470         exp = 0;
 471         frac = 0;
 472         break;
 473
 474     case float_class_inf:
 475     do_inf:
 476         exp = exp_max;
 477         frac = 0;
 478         break;
 479
 480     case float_class_qnan:
 481     case float_class_snan:
 482         exp = exp_max;
 483         break;
 484
 485     default:
 486         g_assert_not_reached();
 487     }
 488
 489     float_raise(flags, s);
 490     p.exp = exp;
 491     p.frac = frac;
 492     return p;
 493 }
 494
 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 496 {
 497     return canonicalize(float16_unpack_raw(f), &float16_params, s);
 498 }
 499
 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 501 {
 502     switch (p.cls) {
 503     case float_class_dnan:
 504         return float16_default_nan(s);
 505     case float_class_msnan:
 506         return float16_maybe_silence_nan(float16_pack_raw(p), s);
 507     default:
 508         p = round_canonical(p, s, &float16_params);
 509         return float16_pack_raw(p);
 510     }
 511 }
 512
 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 514 {
 515     return canonicalize(float32_unpack_raw(f), &float32_params, s);
 516 }
 517
 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 519 {
 520     switch (p.cls) {
 521     case float_class_dnan:
 522         return float32_default_nan(s);
 523     case float_class_msnan:
 524         return float32_maybe_silence_nan(float32_pack_raw(p), s);
 525     default:
 526         p = round_canonical(p, s, &float32_params);
 527         return float32_pack_raw(p);
 528     }
 529 }
 530
 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 532 {
 533     return canonicalize(float64_unpack_raw(f), &float64_params, s);
 534 }
 535
 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 537 {
 538     switch (p.cls) {
 539     case float_class_dnan:
 540         return float64_default_nan(s);
 541     case float_class_msnan:
 542         return float64_maybe_silence_nan(float64_pack_raw(p), s);
 543     default:
 544         p = round_canonical(p, s, &float64_params);
 545         return float64_pack_raw(p);
 546     }
 547 }
 548
 549 /* Simple helpers for checking if what NaN we have */
 550 static bool is_nan(FloatClass c)
 551 {
 552     return unlikely(c >= float_class_qnan);
 553 }
 554 static bool is_snan(FloatClass c)
 555 {
 556     return c == float_class_snan;
 557 }
 558 static bool is_qnan(FloatClass c)
 559 {
 560     return c == float_class_qnan;
 561 }
 562
 563 static FloatParts return_nan(FloatParts a, float_status *s)
 564 {
 565     switch (a.cls) {
 566     case float_class_snan:
 567         s->float_exception_flags |= float_flag_invalid;
 568         a.cls = float_class_msnan;
 569         /* fall through */
 570     case float_class_qnan:
 571         if (s->default_nan_mode) {
 572             a.cls = float_class_dnan;
 573         }
 574         break;
 575
 576     default:
 577         g_assert_not_reached();
 578     }
 579     return a;
 580 }
 581
 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 583 {
 584     if (is_snan(a.cls) || is_snan(b.cls)) {
 585         s->float_exception_flags |= float_flag_invalid;
 586     }
 587
 588     if (s->default_nan_mode) {
 589         a.cls = float_class_dnan;
 590     } else {
 591         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
 592                     is_qnan(b.cls), is_snan(b.cls),
 593                     a.frac > b.frac ||
 594                     (a.frac == b.frac && a.sign < b.sign))) {
 595             a = b;
 596         }
 597         a.cls = float_class_msnan;
 598     }
 599     return a;
 600 }
 601
 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 603                                   bool inf_zero, float_status *s)
 604 {
 605     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 606         s->float_exception_flags |= float_flag_invalid;
 607     }
 608
 609     if (s->default_nan_mode) {
 610         a.cls = float_class_dnan;
 611     } else {
 612         switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
 613                               is_qnan(b.cls), is_snan(b.cls),
 614                               is_qnan(c.cls), is_snan(c.cls),
 615                               inf_zero, s)) {
 616         case 0:
 617             break;
 618         case 1:
 619             a = b;
 620             break;
 621         case 2:
 622             a = c;
 623             break;
 624         case 3:
 625             a.cls = float_class_dnan;
 626             return a;
 627         default:
 628             g_assert_not_reached();
 629         }
 630
 631         a.cls = float_class_msnan;
 632     }
 633     return a;
 634 }
 635
 636 /*
 637  * Returns the result of adding or subtracting the values of the
 638  * floating-point values `a' and `b'. The operation is performed
 639  * according to the IEC/IEEE Standard for Binary Floating-Point
 640  * Arithmetic.
 641  */
 642
 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 644                                 float_status *s)
 645 {
 646     bool a_sign = a.sign;
 647     bool b_sign = b.sign ^ subtract;
 648
 649     if (a_sign != b_sign) {
 650         /* Subtraction */
 651
 652         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 653             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 654                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 655                 a.frac = a.frac - b.frac;
 656             } else {
 657                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 658                 a.frac = b.frac - a.frac;
 659                 a.exp = b.exp;
 660                 a_sign ^= 1;
 661             }
 662
 663             if (a.frac == 0) {
 664                 a.cls = float_class_zero;
 665                 a.sign = s->float_rounding_mode == float_round_down;
 666             } else {
 667                 int shift = clz64(a.frac) - 1;
 668                 a.frac = a.frac << shift;
 669                 a.exp = a.exp - shift;
 670                 a.sign = a_sign;
 671             }
 672             return a;
 673         }
 674         if (is_nan(a.cls) || is_nan(b.cls)) {
 675             return pick_nan(a, b, s);
 676         }
 677         if (a.cls == float_class_inf) {
 678             if (b.cls == float_class_inf) {
 679                 float_raise(float_flag_invalid, s);
 680                 a.cls = float_class_dnan;
 681             }
 682             return a;
 683         }
 684         if (a.cls == float_class_zero && b.cls == float_class_zero) {
 685             a.sign = s->float_rounding_mode == float_round_down;
 686             return a;
 687         }
 688         if (a.cls == float_class_zero || b.cls == float_class_inf) {
 689             b.sign = a_sign ^ 1;
 690             return b;
 691         }
 692         if (b.cls == float_class_zero) {
 693             return a;
 694         }
 695     } else {
 696         /* Addition */
 697         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 698             if (a.exp > b.exp) {
 699                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 700             } else if (a.exp < b.exp) {
 701                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 702                 a.exp = b.exp;
 703             }
 704             a.frac += b.frac;
 705             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
 706                 a.frac >>= 1;
 707                 a.exp += 1;
 708             }
 709             return a;
 710         }
 711         if (is_nan(a.cls) || is_nan(b.cls)) {
 712             return pick_nan(a, b, s);
 713         }
 714         if (a.cls == float_class_inf || b.cls == float_class_zero) {
 715             return a;
 716         }
 717         if (b.cls == float_class_inf || a.cls == float_class_zero) {
 718             b.sign = b_sign;
 719             return b;
 720         }
 721     }
 722     g_assert_not_reached();
 723 }
 724
 725 /*
 726  * Returns the result of adding or subtracting the floating-point
 727  * values `a' and `b'. The operation is performed according to the
 728  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 729  */
 730
 731 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
 732                                               float_status *status)
 733 {
 734     FloatParts pa = float16_unpack_canonical(a, status);
 735     FloatParts pb = float16_unpack_canonical(b, status);
 736     FloatParts pr = addsub_floats(pa, pb, false, status);
 737
 738     return float16_round_pack_canonical(pr, status);
 739 }
 740
 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
 742                                              float_status *status)
 743 {
 744     FloatParts pa = float32_unpack_canonical(a, status);
 745     FloatParts pb = float32_unpack_canonical(b, status);
 746     FloatParts pr = addsub_floats(pa, pb, false, status);
 747
 748     return float32_round_pack_canonical(pr, status);
 749 }
 750
 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
 752                                              float_status *status)
 753 {
 754     FloatParts pa = float64_unpack_canonical(a, status);
 755     FloatParts pb = float64_unpack_canonical(b, status);
 756     FloatParts pr = addsub_floats(pa, pb, false, status);
 757
 758     return float64_round_pack_canonical(pr, status);
 759 }
 760
 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
 762                                              float_status *status)
 763 {
 764     FloatParts pa = float16_unpack_canonical(a, status);
 765     FloatParts pb = float16_unpack_canonical(b, status);
 766     FloatParts pr = addsub_floats(pa, pb, true, status);
 767
 768     return float16_round_pack_canonical(pr, status);
 769 }
 770
 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
 772                                              float_status *status)
 773 {
 774     FloatParts pa = float32_unpack_canonical(a, status);
 775     FloatParts pb = float32_unpack_canonical(b, status);
 776     FloatParts pr = addsub_floats(pa, pb, true, status);
 777
 778     return float32_round_pack_canonical(pr, status);
 779 }
 780
 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
 782                                              float_status *status)
 783 {
 784     FloatParts pa = float64_unpack_canonical(a, status);
 785     FloatParts pb = float64_unpack_canonical(b, status);
 786     FloatParts pr = addsub_floats(pa, pb, true, status);
 787
 788     return float64_round_pack_canonical(pr, status);
 789 }
 790
 791 /*
 792  * Returns the result of multiplying the floating-point values `a' and
 793  * `b'. The operation is performed according to the IEC/IEEE Standard
 794  * for Binary Floating-Point Arithmetic.
 795  */
 796
 797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
 798 {
 799     bool sign = a.sign ^ b.sign;
 800
 801     if (a.cls == float_class_normal && b.cls == float_class_normal) {
 802         uint64_t hi, lo;
 803         int exp = a.exp + b.exp;
 804
 805         mul64To128(a.frac, b.frac, &hi, &lo);
 806         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 807         if (lo & DECOMPOSED_OVERFLOW_BIT) {
 808             shift64RightJamming(lo, 1, &lo);
 809             exp += 1;
 810         }
 811
 812         /* Re-use a */
 813         a.exp = exp;
 814         a.sign = sign;
 815         a.frac = lo;
 816         return a;
 817     }
 818     /* handle all the NaN cases */
 819     if (is_nan(a.cls) || is_nan(b.cls)) {
 820         return pick_nan(a, b, s);
 821     }
 822     /* Inf * Zero == NaN */
 823     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
 824         (a.cls == float_class_zero && b.cls == float_class_inf)) {
 825         s->float_exception_flags |= float_flag_invalid;
 826         a.cls = float_class_dnan;
 827         a.sign = sign;
 828         return a;
 829     }
 830     /* Multiply by 0 or Inf */
 831     if (a.cls == float_class_inf || a.cls == float_class_zero) {
 832         a.sign = sign;
 833         return a;
 834     }
 835     if (b.cls == float_class_inf || b.cls == float_class_zero) {
 836         b.sign = sign;
 837         return b;
 838     }
 839     g_assert_not_reached();
 840 }
 841
 842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
 843                                              float_status *status)
 844 {
 845     FloatParts pa = float16_unpack_canonical(a, status);
 846     FloatParts pb = float16_unpack_canonical(b, status);
 847     FloatParts pr = mul_floats(pa, pb, status);
 848
 849     return float16_round_pack_canonical(pr, status);
 850 }
 851
 852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
 853                                              float_status *status)
 854 {
 855     FloatParts pa = float32_unpack_canonical(a, status);
 856     FloatParts pb = float32_unpack_canonical(b, status);
 857     FloatParts pr = mul_floats(pa, pb, status);
 858
 859     return float32_round_pack_canonical(pr, status);
 860 }
 861
 862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
 863                                              float_status *status)
 864 {
 865     FloatParts pa = float64_unpack_canonical(a, status);
 866     FloatParts pb = float64_unpack_canonical(b, status);
 867     FloatParts pr = mul_floats(pa, pb, status);
 868
 869     return float64_round_pack_canonical(pr, status);
 870 }
 871
 872 /*
 873  * Returns the result of multiplying the floating-point values `a' and
 874  * `b' then adding 'c', with no intermediate rounding step after the
 875  * multiplication. The operation is performed according to the
 876  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 877  * The flags argument allows the caller to select negation of the
 878  * addend, the intermediate product, or the final result. (The
 879  * difference between this and having the caller do a separate
 880  * negation is that negating externally will flip the sign bit on
 881  * NaNs.)
 882  */
 883
 884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
 885                                 int flags, float_status *s)
 886 {
 887     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
 888                     ((1 << float_class_inf) | (1 << float_class_zero));
 889     bool p_sign;
 890     bool sign_flip = flags & float_muladd_negate_result;
 891     FloatClass p_class;
 892     uint64_t hi, lo;
 893     int p_exp;
 894
 895     /* It is implementation-defined whether the cases of (0,inf,qnan)
 896      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 897      * they return if they do), so we have to hand this information
 898      * off to the target-specific pick-a-NaN routine.
 899      */
 900     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
 901         return pick_nan_muladd(a, b, c, inf_zero, s);
 902     }
 903
 904     if (inf_zero) {
 905         s->float_exception_flags |= float_flag_invalid;
 906         a.cls = float_class_dnan;
 907         return a;
 908     }
 909
 910     if (flags & float_muladd_negate_c) {
 911         c.sign ^= 1;
 912     }
 913
 914     p_sign = a.sign ^ b.sign;
 915
 916     if (flags & float_muladd_negate_product) {
 917         p_sign ^= 1;
 918     }
 919
 920     if (a.cls == float_class_inf || b.cls == float_class_inf) {
 921         p_class = float_class_inf;
 922     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
 923         p_class = float_class_zero;
 924     } else {
 925         p_class = float_class_normal;
 926     }
 927
 928     if (c.cls == float_class_inf) {
 929         if (p_class == float_class_inf && p_sign != c.sign) {
 930             s->float_exception_flags |= float_flag_invalid;
 931             a.cls = float_class_dnan;
 932         } else {
 933             a.cls = float_class_inf;
 934             a.sign = c.sign ^ sign_flip;
 935         }
 936         return a;
 937     }
 938
 939     if (p_class == float_class_inf) {
 940         a.cls = float_class_inf;
 941         a.sign = p_sign ^ sign_flip;
 942         return a;
 943     }
 944
 945     if (p_class == float_class_zero) {
 946         if (c.cls == float_class_zero) {
 947             if (p_sign != c.sign) {
 948                 p_sign = s->float_rounding_mode == float_round_down;
 949             }
 950             c.sign = p_sign;
 951         } else if (flags & float_muladd_halve_result) {
 952             c.exp -= 1;
 953         }
 954         c.sign ^= sign_flip;
 955         return c;
 956     }
 957
 958     /* a & b should be normals now... */
 959     assert(a.cls == float_class_normal &&
 960            b.cls == float_class_normal);
 961
 962     p_exp = a.exp + b.exp;
 963
 964     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
 965      * result.
 966      */
 967     mul64To128(a.frac, b.frac, &hi, &lo);
 968     /* binary point now at bit 124 */
 969
 970     /* check for overflow */
 971     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
 972         shift128RightJamming(hi, lo, 1, &hi, &lo);
 973         p_exp += 1;
 974     }
 975
 976     /* + add/sub */
 977     if (c.cls == float_class_zero) {
 978         /* move binary point back to 62 */
 979         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 980     } else {
 981         int exp_diff = p_exp - c.exp;
 982         if (p_sign == c.sign) {
 983             /* Addition */
 984             if (exp_diff <= 0) {
 985                 shift128RightJamming(hi, lo,
 986                                      DECOMPOSED_BINARY_POINT - exp_diff,
 987                                      &hi, &lo);
 988                 lo += c.frac;
 989                 p_exp = c.exp;
 990             } else {
 991                 uint64_t c_hi, c_lo;
 992                 /* shift c to the same binary point as the product (124) */
 993                 c_hi = c.frac >> 2;
 994                 c_lo = 0;
 995                 shift128RightJamming(c_hi, c_lo,
 996                                      exp_diff,
 997                                      &c_hi, &c_lo);
 998                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
 999                 /* move binary point back to 62 */
1000                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1001             }
1002
1003             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004                 shift64RightJamming(lo, 1, &lo);
1005                 p_exp += 1;
1006             }
1007
1008         } else {
1009             /* Subtraction */
1010             uint64_t c_hi, c_lo;
1011             /* make C binary point match product at bit 124 */
1012             c_hi = c.frac >> 2;
1013             c_lo = 0;
1014
1015             if (exp_diff <= 0) {
1016                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017                 if (exp_diff == 0
1018                     &&
1019                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021                 } else {
1022                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023                     p_sign ^= 1;
1024                     p_exp = c.exp;
1025                 }
1026             } else {
1027                 shift128RightJamming(c_hi, c_lo,
1028                                      exp_diff,
1029                                      &c_hi, &c_lo);
1030                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1031             }
1032
1033             if (hi == 0 && lo == 0) {
1034                 a.cls = float_class_zero;
1035                 a.sign = s->float_rounding_mode == float_round_down;
1036                 a.sign ^= sign_flip;
1037                 return a;
1038             } else {
1039                 int shift;
1040                 if (hi != 0) {
1041                     shift = clz64(hi);
1042                 } else {
1043                     shift = clz64(lo) + 64;
1044                 }
1045                 /* Normalizing to a binary point of 124 is the
1046                    correct adjust for the exponent.  However since we're
1047                    shifting, we might as well put the binary point back
1048                    at 62 where we really want it.  Therefore shift as
1049                    if we're leaving 1 bit at the top of the word, but
1050                    adjust the exponent as if we're leaving 3 bits.  */
1051                 shift -= 1;
1052                 if (shift >= 64) {
1053                     lo = lo << (shift - 64);
1054                 } else {
1055                     hi = (hi << shift) | (lo >> (64 - shift));
1056                     lo = hi | ((lo << shift) != 0);
1057                 }
1058                 p_exp -= shift - 2;
1059             }
1060         }
1061     }
1062
1063     if (flags & float_muladd_halve_result) {
1064         p_exp -= 1;
1065     }
1066
1067     /* finally prepare our result */
1068     a.cls = float_class_normal;
1069     a.sign = p_sign ^ sign_flip;
1070     a.exp = p_exp;
1071     a.frac = lo;
1072
1073     return a;
1074 }
1075
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077                                                 int flags, float_status *status)
1078 {
1079     FloatParts pa = float16_unpack_canonical(a, status);
1080     FloatParts pb = float16_unpack_canonical(b, status);
1081     FloatParts pc = float16_unpack_canonical(c, status);
1082     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1083
1084     return float16_round_pack_canonical(pr, status);
1085 }
1086
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088                                                 int flags, float_status *status)
1089 {
1090     FloatParts pa = float32_unpack_canonical(a, status);
1091     FloatParts pb = float32_unpack_canonical(b, status);
1092     FloatParts pc = float32_unpack_canonical(c, status);
1093     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1094
1095     return float32_round_pack_canonical(pr, status);
1096 }
1097
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099                                                 int flags, float_status *status)
1100 {
1101     FloatParts pa = float64_unpack_canonical(a, status);
1102     FloatParts pb = float64_unpack_canonical(b, status);
1103     FloatParts pc = float64_unpack_canonical(c, status);
1104     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1105
1106     return float64_round_pack_canonical(pr, status);
1107 }
1108
1109 /*
1110  * Returns the result of dividing the floating-point value `a' by the
1111  * corresponding value `b'. The operation is performed according to
1112  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113  */
1114
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1116 {
1117     bool sign = a.sign ^ b.sign;
1118
1119     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120         uint64_t temp_lo, temp_hi;
1121         int exp = a.exp - b.exp;
1122         if (a.frac < b.frac) {
1123             exp -= 1;
1124             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125                               &temp_hi, &temp_lo);
1126         } else {
1127             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128                               &temp_hi, &temp_lo);
1129         }
1130         /* LSB of quot is set if inexact which roundandpack will use
1131          * to set flags. Yet again we re-use a for the result */
1132         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133         a.sign = sign;
1134         a.exp = exp;
1135         return a;
1136     }
1137     /* handle all the NaN cases */
1138     if (is_nan(a.cls) || is_nan(b.cls)) {
1139         return pick_nan(a, b, s);
1140     }
1141     /* 0/0 or Inf/Inf */
1142     if (a.cls == b.cls
1143         &&
1144         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145         s->float_exception_flags |= float_flag_invalid;
1146         a.cls = float_class_dnan;
1147         return a;
1148     }
1149     /* Inf / x or 0 / x */
1150     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1151         a.sign = sign;
1152         return a;
1153     }
1154     /* Div 0 => Inf */
1155     if (b.cls == float_class_zero) {
1156         s->float_exception_flags |= float_flag_divbyzero;
1157         a.cls = float_class_inf;
1158         a.sign = sign;
1159         return a;
1160     }
1161     /* Div by Inf */
1162     if (b.cls == float_class_inf) {
1163         a.cls = float_class_zero;
1164         a.sign = sign;
1165         return a;
1166     }
1167     g_assert_not_reached();
1168 }
1169
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1171 {
1172     FloatParts pa = float16_unpack_canonical(a, status);
1173     FloatParts pb = float16_unpack_canonical(b, status);
1174     FloatParts pr = div_floats(pa, pb, status);
1175
1176     return float16_round_pack_canonical(pr, status);
1177 }
1178
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1180 {
1181     FloatParts pa = float32_unpack_canonical(a, status);
1182     FloatParts pb = float32_unpack_canonical(b, status);
1183     FloatParts pr = div_floats(pa, pb, status);
1184
1185     return float32_round_pack_canonical(pr, status);
1186 }
1187
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1189 {
1190     FloatParts pa = float64_unpack_canonical(a, status);
1191     FloatParts pb = float64_unpack_canonical(b, status);
1192     FloatParts pr = div_floats(pa, pb, status);
1193
1194     return float64_round_pack_canonical(pr, status);
1195 }
1196
1197 /*
1198  * Rounds the floating-point value `a' to an integer, and returns the
1199  * result as a floating-point value. The operation is performed
1200  * according to the IEC/IEEE Standard for Binary Floating-Point
1201  * Arithmetic.
1202  */
1203
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1205 {
1206     if (is_nan(a.cls)) {
1207         return return_nan(a, s);
1208     }
1209
1210     switch (a.cls) {
1211     case float_class_zero:
1212     case float_class_inf:
1213     case float_class_qnan:
1214         /* already "integral" */
1215         break;
1216     case float_class_normal:
1217         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218             /* already integral */
1219             break;
1220         }
1221         if (a.exp < 0) {
1222             bool one;
1223             /* all fractional */
1224             s->float_exception_flags |= float_flag_inexact;
1225             switch (rounding_mode) {
1226             case float_round_nearest_even:
1227                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228                 break;
1229             case float_round_ties_away:
1230                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231                 break;
1232             case float_round_to_zero:
1233                 one = false;
1234                 break;
1235             case float_round_up:
1236                 one = !a.sign;
1237                 break;
1238             case float_round_down:
1239                 one = a.sign;
1240                 break;
1241             default:
1242                 g_assert_not_reached();
1243             }
1244
1245             if (one) {
1246                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247                 a.exp = 0;
1248             } else {
1249                 a.cls = float_class_zero;
1250             }
1251         } else {
1252             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253             uint64_t frac_lsbm1 = frac_lsb >> 1;
1254             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255             uint64_t rnd_mask = rnd_even_mask >> 1;
1256             uint64_t inc;
1257
1258             switch (rounding_mode) {
1259             case float_round_nearest_even:
1260                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261                 break;
1262             case float_round_ties_away:
1263                 inc = frac_lsbm1;
1264                 break;
1265             case float_round_to_zero:
1266                 inc = 0;
1267                 break;
1268             case float_round_up:
1269                 inc = a.sign ? 0 : rnd_mask;
1270                 break;
1271             case float_round_down:
1272                 inc = a.sign ? rnd_mask : 0;
1273                 break;
1274             default:
1275                 g_assert_not_reached();
1276             }
1277
1278             if (a.frac & rnd_mask) {
1279                 s->float_exception_flags |= float_flag_inexact;
1280                 a.frac += inc;
1281                 a.frac &= ~rnd_mask;
1282                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283                     a.frac >>= 1;
1284                     a.exp++;
1285                 }
1286             }
1287         }
1288         break;
1289     default:
1290         g_assert_not_reached();
1291     }
1292     return a;
1293 }
1294
1295 float16 float16_round_to_int(float16 a, float_status *s)
1296 {
1297     FloatParts pa = float16_unpack_canonical(a, s);
1298     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299     return float16_round_pack_canonical(pr, s);
1300 }
1301
1302 float32 float32_round_to_int(float32 a, float_status *s)
1303 {
1304     FloatParts pa = float32_unpack_canonical(a, s);
1305     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306     return float32_round_pack_canonical(pr, s);
1307 }
1308
1309 float64 float64_round_to_int(float64 a, float_status *s)
1310 {
1311     FloatParts pa = float64_unpack_canonical(a, s);
1312     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313     return float64_round_pack_canonical(pr, s);
1314 }
1315
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1317 {
1318     FloatParts pa = float64_unpack_canonical(a, s);
1319     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320     return float64_round_pack_canonical(pr, s);
1321 }
1322
1323 /*
1324  * Returns the result of converting the floating-point value `a' to
1325  * the two's complement integer format. The conversion is performed
1326  * according to the IEC/IEEE Standard for Binary Floating-Point
1327  * Arithmetic---which means in particular that the conversion is
1328  * rounded according to the current rounding mode. If `a' is a NaN,
1329  * the largest positive integer is returned. Otherwise, if the
1330  * conversion overflows, the largest integer with the same sign as `a'
1331  * is returned.
1332 */
1333
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335                                      int64_t min, int64_t max,
1336                                      float_status *s)
1337 {
1338     uint64_t r;
1339     int orig_flags = get_float_exception_flags(s);
1340     FloatParts p = round_to_int(in, rmode, s);
1341
1342     switch (p.cls) {
1343     case float_class_snan:
1344     case float_class_qnan:
1345     case float_class_dnan:
1346     case float_class_msnan:
1347         s->float_exception_flags = orig_flags | float_flag_invalid;
1348         return max;
1349     case float_class_inf:
1350         s->float_exception_flags = orig_flags | float_flag_invalid;
1351         return p.sign ? min : max;
1352     case float_class_zero:
1353         return 0;
1354     case float_class_normal:
1355         if (p.exp < DECOMPOSED_BINARY_POINT) {
1356             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1357         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1358             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1359         } else {
1360             r = UINT64_MAX;
1361         }
1362         if (p.sign) {
1363             if (r < -(uint64_t) min) {
1364                 return -r;
1365             } else {
1366                 s->float_exception_flags = orig_flags | float_flag_invalid;
1367                 return min;
1368             }
1369         } else {
1370             if (r < max) {
1371                 return r;
1372             } else {
1373                 s->float_exception_flags = orig_flags | float_flag_invalid;
1374                 return max;
1375             }
1376         }
1377     default:
1378         g_assert_not_reached();
1379     }
1380 }
1381
1382 #define FLOAT_TO_INT(fsz, isz)                                          \
1383 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1384                                                 float_status *s)        \
1385 {                                                                       \
1386     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1387     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1388                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1389                                  s);                                    \
1390 }                                                                       \
1391                                                                         \
1392 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1393  (float ## fsz a, float_status *s)                                      \
1394 {                                                                       \
1395     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1396     return round_to_int_and_pack(p, float_round_to_zero,                \
1397                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1398                                  s);                                    \
1399 }
1400
1401 FLOAT_TO_INT(16, 16)
1402 FLOAT_TO_INT(16, 32)
1403 FLOAT_TO_INT(16, 64)
1404
1405 FLOAT_TO_INT(32, 16)
1406 FLOAT_TO_INT(32, 32)
1407 FLOAT_TO_INT(32, 64)
1408
1409 FLOAT_TO_INT(64, 16)
1410 FLOAT_TO_INT(64, 32)
1411 FLOAT_TO_INT(64, 64)
1412
1413 #undef FLOAT_TO_INT
1414
1415 /*
1416  *  Returns the result of converting the floating-point value `a' to
1417  *  the unsigned integer format. The conversion is performed according
1418  *  to the IEC/IEEE Standard for Binary Floating-Point
1419  *  Arithmetic---which means in particular that the conversion is
1420  *  rounded according to the current rounding mode. If `a' is a NaN,
1421  *  the largest unsigned integer is returned. Otherwise, if the
1422  *  conversion overflows, the largest unsigned integer is returned. If
1423  *  the 'a' is negative, the result is rounded and zero is returned;
1424  *  values that do not round to zero will raise the inexact exception
1425  *  flag.
1426  */
1427
1428 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1429                                        float_status *s)
1430 {
1431     int orig_flags = get_float_exception_flags(s);
1432     FloatParts p = round_to_int(in, rmode, s);
1433
1434     switch (p.cls) {
1435     case float_class_snan:
1436     case float_class_qnan:
1437     case float_class_dnan:
1438     case float_class_msnan:
1439         s->float_exception_flags = orig_flags | float_flag_invalid;
1440         return max;
1441     case float_class_inf:
1442         s->float_exception_flags = orig_flags | float_flag_invalid;
1443         return p.sign ? 0 : max;
1444     case float_class_zero:
1445         return 0;
1446     case float_class_normal:
1447     {
1448         uint64_t r;
1449         if (p.sign) {
1450             s->float_exception_flags = orig_flags | float_flag_invalid;
1451             return 0;
1452         }
1453
1454         if (p.exp < DECOMPOSED_BINARY_POINT) {
1455             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1456         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1457             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1458         } else {
1459             s->float_exception_flags = orig_flags | float_flag_invalid;
1460             return max;
1461         }
1462
1463         /* For uint64 this will never trip, but if p.exp is too large
1464          * to shift a decomposed fraction we shall have exited via the
1465          * 3rd leg above.
1466          */
1467         if (r > max) {
1468             s->float_exception_flags = orig_flags | float_flag_invalid;
1469             return max;
1470         } else {
1471             return r;
1472         }
1473     }
1474     default:
1475         g_assert_not_reached();
1476     }
1477 }
1478
1479 #define FLOAT_TO_UINT(fsz, isz) \
1480 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1481                                                   float_status *s)      \
1482 {                                                                       \
1483     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1484     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1485                                  UINT ## isz ## _MAX, s);               \
1486 }                                                                       \
1487                                                                         \
1488 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1489  (float ## fsz a, float_status *s)                                      \
1490 {                                                                       \
1491     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1492     return round_to_uint_and_pack(p, float_round_to_zero,               \
1493                                   UINT ## isz ## _MAX, s);              \
1494 }
1495
1496 FLOAT_TO_UINT(16, 16)
1497 FLOAT_TO_UINT(16, 32)
1498 FLOAT_TO_UINT(16, 64)
1499
1500 FLOAT_TO_UINT(32, 16)
1501 FLOAT_TO_UINT(32, 32)
1502 FLOAT_TO_UINT(32, 64)
1503
1504 FLOAT_TO_UINT(64, 16)
1505 FLOAT_TO_UINT(64, 32)
1506 FLOAT_TO_UINT(64, 64)
1507
1508 #undef FLOAT_TO_UINT
1509
1510 /*
1511  * Integer to float conversions
1512  *
1513  * Returns the result of converting the two's complement integer `a'
1514  * to the floating-point format. The conversion is performed according
1515  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1516  */
1517
1518 static FloatParts int_to_float(int64_t a, float_status *status)
1519 {
1520     FloatParts r;
1521     if (a == 0) {
1522         r.cls = float_class_zero;
1523         r.sign = false;
1524     } else if (a == (1ULL << 63)) {
1525         r.cls = float_class_normal;
1526         r.sign = true;
1527         r.frac = DECOMPOSED_IMPLICIT_BIT;
1528         r.exp = 63;
1529     } else {
1530         uint64_t f;
1531         if (a < 0) {
1532             f = -a;
1533             r.sign = true;
1534         } else {
1535             f = a;
1536             r.sign = false;
1537         }
1538         int shift = clz64(f) - 1;
1539         r.cls = float_class_normal;
1540         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1541         r.frac = f << shift;
1542     }
1543
1544     return r;
1545 }
1546
1547 float16 int64_to_float16(int64_t a, float_status *status)
1548 {
1549     FloatParts pa = int_to_float(a, status);
1550     return float16_round_pack_canonical(pa, status);
1551 }
1552
1553 float16 int32_to_float16(int32_t a, float_status *status)
1554 {
1555     return int64_to_float16(a, status);
1556 }
1557
1558 float16 int16_to_float16(int16_t a, float_status *status)
1559 {
1560     return int64_to_float16(a, status);
1561 }
1562
1563 float32 int64_to_float32(int64_t a, float_status *status)
1564 {
1565     FloatParts pa = int_to_float(a, status);
1566     return float32_round_pack_canonical(pa, status);
1567 }
1568
1569 float32 int32_to_float32(int32_t a, float_status *status)
1570 {
1571     return int64_to_float32(a, status);
1572 }
1573
1574 float32 int16_to_float32(int16_t a, float_status *status)
1575 {
1576     return int64_to_float32(a, status);
1577 }
1578
1579 float64 int64_to_float64(int64_t a, float_status *status)
1580 {
1581     FloatParts pa = int_to_float(a, status);
1582     return float64_round_pack_canonical(pa, status);
1583 }
1584
1585 float64 int32_to_float64(int32_t a, float_status *status)
1586 {
1587     return int64_to_float64(a, status);
1588 }
1589
1590 float64 int16_to_float64(int16_t a, float_status *status)
1591 {
1592     return int64_to_float64(a, status);
1593 }
1594
1595
1596 /*
1597  * Unsigned Integer to float conversions
1598  *
1599  * Returns the result of converting the unsigned integer `a' to the
1600  * floating-point format. The conversion is performed according to the
1601  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1602  */
1603
1604 static FloatParts uint_to_float(uint64_t a, float_status *status)
1605 {
1606     FloatParts r = { .sign = false};
1607
1608     if (a == 0) {
1609         r.cls = float_class_zero;
1610     } else {
1611         int spare_bits = clz64(a) - 1;
1612         r.cls = float_class_normal;
1613         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1614         if (spare_bits < 0) {
1615             shift64RightJamming(a, -spare_bits, &a);
1616             r.frac = a;
1617         } else {
1618             r.frac = a << spare_bits;
1619         }
1620     }
1621
1622     return r;
1623 }
1624
1625 float16 uint64_to_float16(uint64_t a, float_status *status)
1626 {
1627     FloatParts pa = uint_to_float(a, status);
1628     return float16_round_pack_canonical(pa, status);
1629 }
1630
1631 float16 uint32_to_float16(uint32_t a, float_status *status)
1632 {
1633     return uint64_to_float16(a, status);
1634 }
1635
1636 float16 uint16_to_float16(uint16_t a, float_status *status)
1637 {
1638     return uint64_to_float16(a, status);
1639 }
1640
1641 float32 uint64_to_float32(uint64_t a, float_status *status)
1642 {
1643     FloatParts pa = uint_to_float(a, status);
1644     return float32_round_pack_canonical(pa, status);
1645 }
1646
1647 float32 uint32_to_float32(uint32_t a, float_status *status)
1648 {
1649     return uint64_to_float32(a, status);
1650 }
1651
1652 float32 uint16_to_float32(uint16_t a, float_status *status)
1653 {
1654     return uint64_to_float32(a, status);
1655 }
1656
1657 float64 uint64_to_float64(uint64_t a, float_status *status)
1658 {
1659     FloatParts pa = uint_to_float(a, status);
1660     return float64_round_pack_canonical(pa, status);
1661 }
1662
1663 float64 uint32_to_float64(uint32_t a, float_status *status)
1664 {
1665     return uint64_to_float64(a, status);
1666 }
1667
1668 float64 uint16_to_float64(uint16_t a, float_status *status)
1669 {
1670     return uint64_to_float64(a, status);
1671 }
1672
1673 /* Float Min/Max */
1674 /* min() and max() functions. These can't be implemented as
1675  * 'compare and pick one input' because that would mishandle
1676  * NaNs and +0 vs -0.
1677  *
1678  * minnum() and maxnum() functions. These are similar to the min()
1679  * and max() functions but if one of the arguments is a QNaN and
1680  * the other is numerical then the numerical argument is returned.
1681  * SNaNs will get quietened before being returned.
1682  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1683  * and maxNum() operations. min() and max() are the typical min/max
1684  * semantics provided by many CPUs which predate that specification.
1685  *
1686  * minnummag() and maxnummag() functions correspond to minNumMag()
1687  * and minNumMag() from the IEEE-754 2008.
1688  */
1689 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1690                                 bool ieee, bool ismag, float_status *s)
1691 {
1692     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1693         if (ieee) {
1694             /* Takes two floating-point values `a' and `b', one of
1695              * which is a NaN, and returns the appropriate NaN
1696              * result. If either `a' or `b' is a signaling NaN,
1697              * the invalid exception is raised.
1698              */
1699             if (is_snan(a.cls) || is_snan(b.cls)) {
1700                 return pick_nan(a, b, s);
1701             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1702                 return b;
1703             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1704                 return a;
1705             }
1706         }
1707         return pick_nan(a, b, s);
1708     } else {
1709         int a_exp, b_exp;
1710
1711         switch (a.cls) {
1712         case float_class_normal:
1713             a_exp = a.exp;
1714             break;
1715         case float_class_inf:
1716             a_exp = INT_MAX;
1717             break;
1718         case float_class_zero:
1719             a_exp = INT_MIN;
1720             break;
1721         default:
1722             g_assert_not_reached();
1723             break;
1724         }
1725         switch (b.cls) {
1726         case float_class_normal:
1727             b_exp = b.exp;
1728             break;
1729         case float_class_inf:
1730             b_exp = INT_MAX;
1731             break;
1732         case float_class_zero:
1733             b_exp = INT_MIN;
1734             break;
1735         default:
1736             g_assert_not_reached();
1737             break;
1738         }
1739
1740         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1741             bool a_less = a_exp < b_exp;
1742             if (a_exp == b_exp) {
1743                 a_less = a.frac < b.frac;
1744             }
1745             return a_less ^ ismin ? b : a;
1746         }
1747
1748         if (a.sign == b.sign) {
1749             bool a_less = a_exp < b_exp;
1750             if (a_exp == b_exp) {
1751                 a_less = a.frac < b.frac;
1752             }
1753             return a.sign ^ a_less ^ ismin ? b : a;
1754         } else {
1755             return a.sign ^ ismin ? b : a;
1756         }
1757     }
1758 }
1759
1760 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
1761 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1762                                      float_status *s)                   \
1763 {                                                                       \
1764     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1765     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1766     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1767                                                                         \
1768     return float ## sz ## _round_pack_canonical(pr, s);                 \
1769 }
1770
1771 MINMAX(16, min, true, false, false)
1772 MINMAX(16, minnum, true, true, false)
1773 MINMAX(16, minnummag, true, true, true)
1774 MINMAX(16, max, false, false, false)
1775 MINMAX(16, maxnum, false, true, false)
1776 MINMAX(16, maxnummag, false, true, true)
1777
1778 MINMAX(32, min, true, false, false)
1779 MINMAX(32, minnum, true, true, false)
1780 MINMAX(32, minnummag, true, true, true)
1781 MINMAX(32, max, false, false, false)
1782 MINMAX(32, maxnum, false, true, false)
1783 MINMAX(32, maxnummag, false, true, true)
1784
1785 MINMAX(64, min, true, false, false)
1786 MINMAX(64, minnum, true, true, false)
1787 MINMAX(64, minnummag, true, true, true)
1788 MINMAX(64, max, false, false, false)
1789 MINMAX(64, maxnum, false, true, false)
1790 MINMAX(64, maxnummag, false, true, true)
1791
1792 #undef MINMAX
1793
1794 /* Floating point compare */
1795 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1796                           float_status *s)
1797 {
1798     if (is_nan(a.cls) || is_nan(b.cls)) {
1799         if (!is_quiet ||
1800             a.cls == float_class_snan ||
1801             b.cls == float_class_snan) {
1802             s->float_exception_flags |= float_flag_invalid;
1803         }
1804         return float_relation_unordered;
1805     }
1806
1807     if (a.cls == float_class_zero) {
1808         if (b.cls == float_class_zero) {
1809             return float_relation_equal;
1810         }
1811         return b.sign ? float_relation_greater : float_relation_less;
1812     } else if (b.cls == float_class_zero) {
1813         return a.sign ? float_relation_less : float_relation_greater;
1814     }
1815
1816     /* The only really important thing about infinity is its sign. If
1817      * both are infinities the sign marks the smallest of the two.
1818      */
1819     if (a.cls == float_class_inf) {
1820         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1821             return float_relation_equal;
1822         }
1823         return a.sign ? float_relation_less : float_relation_greater;
1824     } else if (b.cls == float_class_inf) {
1825         return b.sign ? float_relation_greater : float_relation_less;
1826     }
1827
1828     if (a.sign != b.sign) {
1829         return a.sign ? float_relation_less : float_relation_greater;
1830     }
1831
1832     if (a.exp == b.exp) {
1833         if (a.frac == b.frac) {
1834             return float_relation_equal;
1835         }
1836         if (a.sign) {
1837             return a.frac > b.frac ?
1838                 float_relation_less : float_relation_greater;
1839         } else {
1840             return a.frac > b.frac ?
1841                 float_relation_greater : float_relation_less;
1842         }
1843     } else {
1844         if (a.sign) {
1845             return a.exp > b.exp ? float_relation_less : float_relation_greater;
1846         } else {
1847             return a.exp > b.exp ? float_relation_greater : float_relation_less;
1848         }
1849     }
1850 }
1851
1852 #define COMPARE(sz)                                                     \
1853 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
1854                             float_status *s)                            \
1855 {                                                                       \
1856     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1857     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1858     return compare_floats(pa, pb, false, s);                            \
1859 }                                                                       \
1860 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
1861                                   float_status *s)                      \
1862 {                                                                       \
1863     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1864     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1865     return compare_floats(pa, pb, true, s);                             \
1866 }
1867
1868 COMPARE(16)
1869 COMPARE(32)
1870 COMPARE(64)
1871
1872 #undef COMPARE
1873
1874 /* Multiply A by 2 raised to the power N.  */
1875 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1876 {
1877     if (unlikely(is_nan(a.cls))) {
1878         return return_nan(a, s);
1879     }
1880     if (a.cls == float_class_normal) {
1881         /* The largest float type (even though not supported by FloatParts)
1882          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
1883          * still allows rounding to infinity, without allowing overflow
1884          * within the int32_t that backs FloatParts.exp.
1885          */
1886         n = MIN(MAX(n, -0x10000), 0x10000);
1887         a.exp += n;
1888     }
1889     return a;
1890 }
1891
1892 float16 float16_scalbn(float16 a, int n, float_status *status)
1893 {
1894     FloatParts pa = float16_unpack_canonical(a, status);
1895     FloatParts pr = scalbn_decomposed(pa, n, status);
1896     return float16_round_pack_canonical(pr, status);
1897 }
1898
1899 float32 float32_scalbn(float32 a, int n, float_status *status)
1900 {
1901     FloatParts pa = float32_unpack_canonical(a, status);
1902     FloatParts pr = scalbn_decomposed(pa, n, status);
1903     return float32_round_pack_canonical(pr, status);
1904 }
1905
1906 float64 float64_scalbn(float64 a, int n, float_status *status)
1907 {
1908     FloatParts pa = float64_unpack_canonical(a, status);
1909     FloatParts pr = scalbn_decomposed(pa, n, status);
1910     return float64_round_pack_canonical(pr, status);
1911 }
1912
1913 /*
1914  * Square Root
1915  *
1916  * The old softfloat code did an approximation step before zeroing in
1917  * on the final result. However for simpleness we just compute the
1918  * square root by iterating down from the implicit bit to enough extra
1919  * bits to ensure we get a correctly rounded result.
1920  *
1921  * This does mean however the calculation is slower than before,
1922  * especially for 64 bit floats.
1923  */
1924
1925 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1926 {
1927     uint64_t a_frac, r_frac, s_frac;
1928     int bit, last_bit;
1929
1930     if (is_nan(a.cls)) {
1931         return return_nan(a, s);
1932     }
1933     if (a.cls == float_class_zero) {
1934         return a;  /* sqrt(+-0) = +-0 */
1935     }
1936     if (a.sign) {
1937         s->float_exception_flags |= float_flag_invalid;
1938         a.cls = float_class_dnan;
1939         return a;
1940     }
1941     if (a.cls == float_class_inf) {
1942         return a;  /* sqrt(+inf) = +inf */
1943     }
1944
1945     assert(a.cls == float_class_normal);
1946
1947     /* We need two overflow bits at the top. Adding room for that is a
1948      * right shift. If the exponent is odd, we can discard the low bit
1949      * by multiplying the fraction by 2; that's a left shift. Combine
1950      * those and we shift right if the exponent is even.
1951      */
1952     a_frac = a.frac;
1953     if (!(a.exp & 1)) {
1954         a_frac >>= 1;
1955     }
1956     a.exp >>= 1;
1957
1958     /* Bit-by-bit computation of sqrt.  */
1959     r_frac = 0;
1960     s_frac = 0;
1961
1962     /* Iterate from implicit bit down to the 3 extra bits to compute a
1963      * properly rounded result. Remember we've inserted one more bit
1964      * at the top, so these positions are one less.
1965      */
1966     bit = DECOMPOSED_BINARY_POINT - 1;
1967     last_bit = MAX(p->frac_shift - 4, 0);
1968     do {
1969         uint64_t q = 1ULL << bit;
1970         uint64_t t_frac = s_frac + q;
1971         if (t_frac <= a_frac) {
1972             s_frac = t_frac + q;
1973             a_frac -= t_frac;
1974             r_frac += q;
1975         }
1976         a_frac <<= 1;
1977     } while (--bit >= last_bit);
1978
1979     /* Undo the right shift done above. If there is any remaining
1980      * fraction, the result is inexact. Set the sticky bit.
1981      */
1982     a.frac = (r_frac << 1) + (a_frac != 0);
1983
1984     return a;
1985 }
1986
1987 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1988 {
1989     FloatParts pa = float16_unpack_canonical(a, status);
1990     FloatParts pr = sqrt_float(pa, status, &float16_params);
1991     return float16_round_pack_canonical(pr, status);
1992 }
1993
1994 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1995 {
1996     FloatParts pa = float32_unpack_canonical(a, status);
1997     FloatParts pr = sqrt_float(pa, status, &float32_params);
1998     return float32_round_pack_canonical(pr, status);
1999 }
2000
2001 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2002 {
2003     FloatParts pa = float64_unpack_canonical(a, status);
2004     FloatParts pr = sqrt_float(pa, status, &float64_params);
2005     return float64_round_pack_canonical(pr, status);
2006 }
2007
2008
2009 /*----------------------------------------------------------------------------
2010 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2011 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2012 | input.  If `zSign' is 1, the input is negated before being converted to an
2013 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2014 | is simply rounded to an integer, with the inexact exception raised if the
2015 | input cannot be represented exactly as an integer.  However, if the fixed-
2016 | point input is too large, the invalid exception is raised and the largest
2017 | positive or negative integer is returned.
2018 *----------------------------------------------------------------------------*/
2019
2020 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2021 {
2022     int8_t roundingMode;
2023     flag roundNearestEven;
2024     int8_t roundIncrement, roundBits;
2025     int32_t z;
2026
2027     roundingMode = status->float_rounding_mode;
2028     roundNearestEven = ( roundingMode == float_round_nearest_even );
2029     switch (roundingMode) {
2030     case float_round_nearest_even:
2031     case float_round_ties_away:
2032         roundIncrement = 0x40;
2033         break;
2034     case float_round_to_zero:
2035         roundIncrement = 0;
2036         break;
2037     case float_round_up:
2038         roundIncrement = zSign ? 0 : 0x7f;
2039         break;
2040     case float_round_down:
2041         roundIncrement = zSign ? 0x7f : 0;
2042         break;
2043     default:
2044         abort();
2045     }
2046     roundBits = absZ & 0x7F;
2047     absZ = ( absZ + roundIncrement )>>7;
2048     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2049     z = absZ;
2050     if ( zSign ) z = - z;
2051     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2052         float_raise(float_flag_invalid, status);
2053         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2054     }
2055     if (roundBits) {
2056         status->float_exception_flags |= float_flag_inexact;
2057     }
2058     return z;
2059
2060 }
2061
2062 /*----------------------------------------------------------------------------
2063 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2064 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2065 | and returns the properly rounded 64-bit integer corresponding to the input.
2066 | If `zSign' is 1, the input is negated before being converted to an integer.
2067 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2068 | the inexact exception raised if the input cannot be represented exactly as
2069 | an integer.  However, if the fixed-point input is too large, the invalid
2070 | exception is raised and the largest positive or negative integer is
2071 | returned.
2072 *----------------------------------------------------------------------------*/
2073
2074 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2075                                float_status *status)
2076 {
2077     int8_t roundingMode;
2078     flag roundNearestEven, increment;
2079     int64_t z;
2080
2081     roundingMode = status->float_rounding_mode;
2082     roundNearestEven = ( roundingMode == float_round_nearest_even );
2083     switch (roundingMode) {
2084     case float_round_nearest_even:
2085     case float_round_ties_away:
2086         increment = ((int64_t) absZ1 < 0);
2087         break;
2088     case float_round_to_zero:
2089         increment = 0;
2090         break;
2091     case float_round_up:
2092         increment = !zSign && absZ1;
2093         break;
2094     case float_round_down:
2095         increment = zSign && absZ1;
2096         break;
2097     default:
2098         abort();
2099     }
2100     if ( increment ) {
2101         ++absZ0;
2102         if ( absZ0 == 0 ) goto overflow;
2103         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2104     }
2105     z = absZ0;
2106     if ( zSign ) z = - z;
2107     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2108  overflow:
2109         float_raise(float_flag_invalid, status);
2110         return
2111               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2112             : LIT64( 0x7FFFFFFFFFFFFFFF );
2113     }
2114     if (absZ1) {
2115         status->float_exception_flags |= float_flag_inexact;
2116     }
2117     return z;
2118
2119 }
2120
2121 /*----------------------------------------------------------------------------
2122 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2123 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2124 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2125 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2126 | with the inexact exception raised if the input cannot be represented exactly
2127 | as an integer.  However, if the fixed-point input is too large, the invalid
2128 | exception is raised and the largest unsigned integer is returned.
2129 *----------------------------------------------------------------------------*/
2130
2131 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2132                                 uint64_t absZ1, float_status *status)
2133 {
2134     int8_t roundingMode;
2135     flag roundNearestEven, increment;
2136
2137     roundingMode = status->float_rounding_mode;
2138     roundNearestEven = (roundingMode == float_round_nearest_even);
2139     switch (roundingMode) {
2140     case float_round_nearest_even:
2141     case float_round_ties_away:
2142         increment = ((int64_t)absZ1 < 0);
2143         break;
2144     case float_round_to_zero:
2145         increment = 0;
2146         break;
2147     case float_round_up:
2148         increment = !zSign && absZ1;
2149         break;
2150     case float_round_down:
2151         increment = zSign && absZ1;
2152         break;
2153     default:
2154         abort();
2155     }
2156     if (increment) {
2157         ++absZ0;
2158         if (absZ0 == 0) {
2159             float_raise(float_flag_invalid, status);
2160             return LIT64(0xFFFFFFFFFFFFFFFF);
2161         }
2162         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2163     }
2164
2165     if (zSign && absZ0) {
2166         float_raise(float_flag_invalid, status);
2167         return 0;
2168     }
2169
2170     if (absZ1) {
2171         status->float_exception_flags |= float_flag_inexact;
2172     }
2173     return absZ0;
2174 }
2175
2176 /*----------------------------------------------------------------------------
2177 | If `a' is denormal and we are in flush-to-zero mode then set the
2178 | input-denormal exception and return zero. Otherwise just return the value.
2179 *----------------------------------------------------------------------------*/
2180 float32 float32_squash_input_denormal(float32 a, float_status *status)
2181 {
2182     if (status->flush_inputs_to_zero) {
2183         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2184             float_raise(float_flag_input_denormal, status);
2185             return make_float32(float32_val(a) & 0x80000000);
2186         }
2187     }
2188     return a;
2189 }
2190
2191 /*----------------------------------------------------------------------------
2192 | Normalizes the subnormal single-precision floating-point value represented
2193 | by the denormalized significand `aSig'.  The normalized exponent and
2194 | significand are stored at the locations pointed to by `zExpPtr' and
2195 | `zSigPtr', respectively.
2196 *----------------------------------------------------------------------------*/
2197
2198 static void
2199  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2200 {
2201     int8_t shiftCount;
2202
2203     shiftCount = countLeadingZeros32( aSig ) - 8;
2204     *zSigPtr = aSig<<shiftCount;
2205     *zExpPtr = 1 - shiftCount;
2206
2207 }
2208
2209 /*----------------------------------------------------------------------------
2210 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2211 | and significand `zSig', and returns the proper single-precision floating-
2212 | point value corresponding to the abstract input.  Ordinarily, the abstract
2213 | value is simply rounded and packed into the single-precision format, with
2214 | the inexact exception raised if the abstract input cannot be represented
2215 | exactly.  However, if the abstract value is too large, the overflow and
2216 | inexact exceptions are raised and an infinity or maximal finite value is
2217 | returned.  If the abstract value is too small, the input value is rounded to
2218 | a subnormal number, and the underflow and inexact exceptions are raised if
2219 | the abstract input cannot be represented exactly as a subnormal single-
2220 | precision floating-point number.
2221 |     The input significand `zSig' has its binary point between bits 30
2222 | and 29, which is 7 bits to the left of the usual location.  This shifted
2223 | significand must be normalized or smaller.  If `zSig' is not normalized,
2224 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2225 | and it must not require rounding.  In the usual case that `zSig' is
2226 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2227 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2228 | Binary Floating-Point Arithmetic.
2229 *----------------------------------------------------------------------------*/
2230
2231 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2232                                    float_status *status)
2233 {
2234     int8_t roundingMode;
2235     flag roundNearestEven;
2236     int8_t roundIncrement, roundBits;
2237     flag isTiny;
2238
2239     roundingMode = status->float_rounding_mode;
2240     roundNearestEven = ( roundingMode == float_round_nearest_even );
2241     switch (roundingMode) {
2242     case float_round_nearest_even:
2243     case float_round_ties_away:
2244         roundIncrement = 0x40;
2245         break;
2246     case float_round_to_zero:
2247         roundIncrement = 0;
2248         break;
2249     case float_round_up:
2250         roundIncrement = zSign ? 0 : 0x7f;
2251         break;
2252     case float_round_down:
2253         roundIncrement = zSign ? 0x7f : 0;
2254         break;
2255     default:
2256         abort();
2257         break;
2258     }
2259     roundBits = zSig & 0x7F;
2260     if ( 0xFD <= (uint16_t) zExp ) {
2261         if (    ( 0xFD < zExp )
2262              || (    ( zExp == 0xFD )
2263                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2264            ) {
2265             float_raise(float_flag_overflow | float_flag_inexact, status);
2266             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2267         }
2268         if ( zExp < 0 ) {
2269             if (status->flush_to_zero) {
2270                 float_raise(float_flag_output_denormal, status);
2271                 return packFloat32(zSign, 0, 0);
2272             }
2273             isTiny =
2274                 (status->float_detect_tininess
2275                  == float_tininess_before_rounding)
2276                 || ( zExp < -1 )
2277                 || ( zSig + roundIncrement < 0x80000000 );
2278             shift32RightJamming( zSig, - zExp, &zSig );
2279             zExp = 0;
2280             roundBits = zSig & 0x7F;
2281             if (isTiny && roundBits) {
2282                 float_raise(float_flag_underflow, status);
2283             }
2284         }
2285     }
2286     if (roundBits) {
2287         status->float_exception_flags |= float_flag_inexact;
2288     }
2289     zSig = ( zSig + roundIncrement )>>7;
2290     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2291     if ( zSig == 0 ) zExp = 0;
2292     return packFloat32( zSign, zExp, zSig );
2293
2294 }
2295
2296 /*----------------------------------------------------------------------------
2297 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2298 | and significand `zSig', and returns the proper single-precision floating-
2299 | point value corresponding to the abstract input.  This routine is just like
2300 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2301 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2302 | floating-point exponent.
2303 *----------------------------------------------------------------------------*/
2304
2305 static float32
2306  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2307                               float_status *status)
2308 {
2309     int8_t shiftCount;
2310
2311     shiftCount = countLeadingZeros32( zSig ) - 1;
2312     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2313                                status);
2314
2315 }
2316
2317 /*----------------------------------------------------------------------------
2318 | If `a' is denormal and we are in flush-to-zero mode then set the
2319 | input-denormal exception and return zero. Otherwise just return the value.
2320 *----------------------------------------------------------------------------*/
2321 float64 float64_squash_input_denormal(float64 a, float_status *status)
2322 {
2323     if (status->flush_inputs_to_zero) {
2324         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2325             float_raise(float_flag_input_denormal, status);
2326             return make_float64(float64_val(a) & (1ULL << 63));
2327         }
2328     }
2329     return a;
2330 }
2331
2332 /*----------------------------------------------------------------------------
2333 | Normalizes the subnormal double-precision floating-point value represented
2334 | by the denormalized significand `aSig'.  The normalized exponent and
2335 | significand are stored at the locations pointed to by `zExpPtr' and
2336 | `zSigPtr', respectively.
2337 *----------------------------------------------------------------------------*/
2338
2339 static void
2340  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2341 {
2342     int8_t shiftCount;
2343
2344     shiftCount = countLeadingZeros64( aSig ) - 11;
2345     *zSigPtr = aSig<<shiftCount;
2346     *zExpPtr = 1 - shiftCount;
2347
2348 }
2349
2350 /*----------------------------------------------------------------------------
2351 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2352 | double-precision floating-point value, returning the result.  After being
2353 | shifted into the proper positions, the three fields are simply added
2354 | together to form the result.  This means that any integer portion of `zSig'
2355 | will be added into the exponent.  Since a properly normalized significand
2356 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2357 | than the desired result exponent whenever `zSig' is a complete, normalized
2358 | significand.
2359 *----------------------------------------------------------------------------*/
2360
2361 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2362 {
2363
2364     return make_float64(
2365         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2366
2367 }
2368
2369 /*----------------------------------------------------------------------------
2370 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2371 | and significand `zSig', and returns the proper double-precision floating-
2372 | point value corresponding to the abstract input.  Ordinarily, the abstract
2373 | value is simply rounded and packed into the double-precision format, with
2374 | the inexact exception raised if the abstract input cannot be represented
2375 | exactly.  However, if the abstract value is too large, the overflow and
2376 | inexact exceptions are raised and an infinity or maximal finite value is
2377 | returned.  If the abstract value is too small, the input value is rounded to
2378 | a subnormal number, and the underflow and inexact exceptions are raised if
2379 | the abstract input cannot be represented exactly as a subnormal double-
2380 | precision floating-point number.
2381 |     The input significand `zSig' has its binary point between bits 62
2382 | and 61, which is 10 bits to the left of the usual location.  This shifted
2383 | significand must be normalized or smaller.  If `zSig' is not normalized,
2384 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2385 | and it must not require rounding.  In the usual case that `zSig' is
2386 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2387 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2388 | Binary Floating-Point Arithmetic.
2389 *----------------------------------------------------------------------------*/
2390
2391 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2392                                    float_status *status)
2393 {
2394     int8_t roundingMode;
2395     flag roundNearestEven;
2396     int roundIncrement, roundBits;
2397     flag isTiny;
2398
2399     roundingMode = status->float_rounding_mode;
2400     roundNearestEven = ( roundingMode == float_round_nearest_even );
2401     switch (roundingMode) {
2402     case float_round_nearest_even:
2403     case float_round_ties_away:
2404         roundIncrement = 0x200;
2405         break;
2406     case float_round_to_zero:
2407         roundIncrement = 0;
2408         break;
2409     case float_round_up:
2410         roundIncrement = zSign ? 0 : 0x3ff;
2411         break;
2412     case float_round_down:
2413         roundIncrement = zSign ? 0x3ff : 0;
2414         break;
2415     case float_round_to_odd:
2416         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2417         break;
2418     default:
2419         abort();
2420     }
2421     roundBits = zSig & 0x3FF;
2422     if ( 0x7FD <= (uint16_t) zExp ) {
2423         if (    ( 0x7FD < zExp )
2424              || (    ( zExp == 0x7FD )
2425                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2426            ) {
2427             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2428                                    roundIncrement != 0;
2429             float_raise(float_flag_overflow | float_flag_inexact, status);
2430             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2431         }
2432         if ( zExp < 0 ) {
2433             if (status->flush_to_zero) {
2434                 float_raise(float_flag_output_denormal, status);
2435                 return packFloat64(zSign, 0, 0);
2436             }
2437             isTiny =
2438                    (status->float_detect_tininess
2439                     == float_tininess_before_rounding)
2440                 || ( zExp < -1 )
2441                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2442             shift64RightJamming( zSig, - zExp, &zSig );
2443             zExp = 0;
2444             roundBits = zSig & 0x3FF;
2445             if (isTiny && roundBits) {
2446                 float_raise(float_flag_underflow, status);
2447             }
2448             if (roundingMode == float_round_to_odd) {
2449                 /*
2450                  * For round-to-odd case, the roundIncrement depends on
2451                  * zSig which just changed.
2452                  */
2453                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2454             }
2455         }
2456     }
2457     if (roundBits) {
2458         status->float_exception_flags |= float_flag_inexact;
2459     }
2460     zSig = ( zSig + roundIncrement )>>10;
2461     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2462     if ( zSig == 0 ) zExp = 0;
2463     return packFloat64( zSign, zExp, zSig );
2464
2465 }
2466
2467 /*----------------------------------------------------------------------------
2468 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2469 | and significand `zSig', and returns the proper double-precision floating-
2470 | point value corresponding to the abstract input.  This routine is just like
2471 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2472 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2473 | floating-point exponent.
2474 *----------------------------------------------------------------------------*/
2475
2476 static float64
2477  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2478                               float_status *status)
2479 {
2480     int8_t shiftCount;
2481
2482     shiftCount = countLeadingZeros64( zSig ) - 1;
2483     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2484                                status);
2485
2486 }
2487
2488 /*----------------------------------------------------------------------------
2489 | Normalizes the subnormal extended double-precision floating-point value
2490 | represented by the denormalized significand `aSig'.  The normalized exponent
2491 | and significand are stored at the locations pointed to by `zExpPtr' and
2492 | `zSigPtr', respectively.
2493 *----------------------------------------------------------------------------*/
2494
2495 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2496                                 uint64_t *zSigPtr)
2497 {
2498     int8_t shiftCount;
2499
2500     shiftCount = countLeadingZeros64( aSig );
2501     *zSigPtr = aSig<<shiftCount;
2502     *zExpPtr = 1 - shiftCount;
2503 }
2504
2505 /*----------------------------------------------------------------------------
2506 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2507 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2508 | and returns the proper extended double-precision floating-point value
2509 | corresponding to the abstract input.  Ordinarily, the abstract value is
2510 | rounded and packed into the extended double-precision format, with the
2511 | inexact exception raised if the abstract input cannot be represented
2512 | exactly.  However, if the abstract value is too large, the overflow and
2513 | inexact exceptions are raised and an infinity or maximal finite value is
2514 | returned.  If the abstract value is too small, the input value is rounded to
2515 | a subnormal number, and the underflow and inexact exceptions are raised if
2516 | the abstract input cannot be represented exactly as a subnormal extended
2517 | double-precision floating-point number.
2518 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2519 | number of bits as single or double precision, respectively.  Otherwise, the
2520 | result is rounded to the full precision of the extended double-precision
2521 | format.
2522 |     The input significand must be normalized or smaller.  If the input
2523 | significand is not normalized, `zExp' must be 0; in that case, the result
2524 | returned is a subnormal number, and it must not require rounding.  The
2525 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2526 | Floating-Point Arithmetic.
2527 *----------------------------------------------------------------------------*/
2528
2529 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2530                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2531                               float_status *status)
2532 {
2533     int8_t roundingMode;
2534     flag roundNearestEven, increment, isTiny;
2535     int64_t roundIncrement, roundMask, roundBits;
2536
2537     roundingMode = status->float_rounding_mode;
2538     roundNearestEven = ( roundingMode == float_round_nearest_even );
2539     if ( roundingPrecision == 80 ) goto precision80;
2540     if ( roundingPrecision == 64 ) {
2541         roundIncrement = LIT64( 0x0000000000000400 );
2542         roundMask = LIT64( 0x00000000000007FF );
2543     }
2544     else if ( roundingPrecision == 32 ) {
2545         roundIncrement = LIT64( 0x0000008000000000 );
2546         roundMask = LIT64( 0x000000FFFFFFFFFF );
2547     }
2548     else {
2549         goto precision80;
2550     }
2551     zSig0 |= ( zSig1 != 0 );
2552     switch (roundingMode) {
2553     case float_round_nearest_even:
2554     case float_round_ties_away:
2555         break;
2556     case float_round_to_zero:
2557         roundIncrement = 0;
2558         break;
2559     case float_round_up:
2560         roundIncrement = zSign ? 0 : roundMask;
2561         break;
2562     case float_round_down:
2563         roundIncrement = zSign ? roundMask : 0;
2564         break;
2565     default:
2566         abort();
2567     }
2568     roundBits = zSig0 & roundMask;
2569     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2570         if (    ( 0x7FFE < zExp )
2571              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2572            ) {
2573             goto overflow;
2574         }
2575         if ( zExp <= 0 ) {
2576             if (status->flush_to_zero) {
2577                 float_raise(float_flag_output_denormal, status);
2578                 return packFloatx80(zSign, 0, 0);
2579             }
2580             isTiny =
2581                    (status->float_detect_tininess
2582                     == float_tininess_before_rounding)
2583                 || ( zExp < 0 )
2584                 || ( zSig0 <= zSig0 + roundIncrement );
2585             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2586             zExp = 0;
2587             roundBits = zSig0 & roundMask;
2588             if (isTiny && roundBits) {
2589                 float_raise(float_flag_underflow, status);
2590             }
2591             if (roundBits) {
2592                 status->float_exception_flags |= float_flag_inexact;
2593             }
2594             zSig0 += roundIncrement;
2595             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2596             roundIncrement = roundMask + 1;
2597             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2598                 roundMask |= roundIncrement;
2599             }
2600             zSig0 &= ~ roundMask;
2601             return packFloatx80( zSign, zExp, zSig0 );
2602         }
2603     }
2604     if (roundBits) {
2605         status->float_exception_flags |= float_flag_inexact;
2606     }
2607     zSig0 += roundIncrement;
2608     if ( zSig0 < roundIncrement ) {
2609         ++zExp;
2610         zSig0 = LIT64( 0x8000000000000000 );
2611     }
2612     roundIncrement = roundMask + 1;
2613     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2614         roundMask |= roundIncrement;
2615     }
2616     zSig0 &= ~ roundMask;
2617     if ( zSig0 == 0 ) zExp = 0;
2618     return packFloatx80( zSign, zExp, zSig0 );
2619  precision80:
2620     switch (roundingMode) {
2621     case float_round_nearest_even:
2622     case float_round_ties_away:
2623         increment = ((int64_t)zSig1 < 0);
2624         break;
2625     case float_round_to_zero:
2626         increment = 0;
2627         break;
2628     case float_round_up:
2629         increment = !zSign && zSig1;
2630         break;
2631     case float_round_down:
2632         increment = zSign && zSig1;
2633         break;
2634     default:
2635         abort();
2636     }
2637     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2638         if (    ( 0x7FFE < zExp )
2639              || (    ( zExp == 0x7FFE )
2640                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2641                   && increment
2642                 )
2643            ) {
2644             roundMask = 0;
2645  overflow:
2646             float_raise(float_flag_overflow | float_flag_inexact, status);
2647             if (    ( roundingMode == float_round_to_zero )
2648                  || ( zSign && ( roundingMode == float_round_up ) )
2649                  || ( ! zSign && ( roundingMode == float_round_down ) )
2650                ) {
2651                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2652             }
2653             return packFloatx80(zSign,
2654                                 floatx80_infinity_high,
2655                                 floatx80_infinity_low);
2656         }
2657         if ( zExp <= 0 ) {
2658             isTiny =
2659                    (status->float_detect_tininess
2660                     == float_tininess_before_rounding)
2661                 || ( zExp < 0 )
2662                 || ! increment
2663                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2664             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2665             zExp = 0;
2666             if (isTiny && zSig1) {
2667                 float_raise(float_flag_underflow, status);
2668             }
2669             if (zSig1) {
2670                 status->float_exception_flags |= float_flag_inexact;
2671             }
2672             switch (roundingMode) {
2673             case float_round_nearest_even:
2674             case float_round_ties_away:
2675                 increment = ((int64_t)zSig1 < 0);
2676                 break;
2677             case float_round_to_zero:
2678                 increment = 0;
2679                 break;
2680             case float_round_up:
2681                 increment = !zSign && zSig1;
2682                 break;
2683             case float_round_down:
2684                 increment = zSign && zSig1;
2685                 break;
2686             default:
2687                 abort();
2688             }
2689             if ( increment ) {
2690                 ++zSig0;
2691                 zSig0 &=
2692                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2693                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2694             }
2695             return packFloatx80( zSign, zExp, zSig0 );
2696         }
2697     }
2698     if (zSig1) {
2699         status->float_exception_flags |= float_flag_inexact;
2700     }
2701     if ( increment ) {
2702         ++zSig0;
2703         if ( zSig0 == 0 ) {
2704             ++zExp;
2705             zSig0 = LIT64( 0x8000000000000000 );
2706         }
2707         else {
2708             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2709         }
2710     }
2711     else {
2712         if ( zSig0 == 0 ) zExp = 0;
2713     }
2714     return packFloatx80( zSign, zExp, zSig0 );
2715
2716 }
2717
2718 /*----------------------------------------------------------------------------
2719 | Takes an abstract floating-point value having sign `zSign', exponent
2720 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2721 | and returns the proper extended double-precision floating-point value
2722 | corresponding to the abstract input.  This routine is just like
2723 | `roundAndPackFloatx80' except that the input significand does not have to be
2724 | normalized.
2725 *----------------------------------------------------------------------------*/
2726
2727 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2728                                        flag zSign, int32_t zExp,
2729                                        uint64_t zSig0, uint64_t zSig1,
2730                                        float_status *status)
2731 {
2732     int8_t shiftCount;
2733
2734     if ( zSig0 == 0 ) {
2735         zSig0 = zSig1;
2736         zSig1 = 0;
2737         zExp -= 64;
2738     }
2739     shiftCount = countLeadingZeros64( zSig0 );
2740     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2741     zExp -= shiftCount;
2742     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2743                                 zSig0, zSig1, status);
2744
2745 }
2746
2747 /*----------------------------------------------------------------------------
2748 | Returns the least-significant 64 fraction bits of the quadruple-precision
2749 | floating-point value `a'.
2750 *----------------------------------------------------------------------------*/
2751
2752 static inline uint64_t extractFloat128Frac1( float128 a )
2753 {
2754
2755     return a.low;
2756
2757 }
2758
2759 /*----------------------------------------------------------------------------
2760 | Returns the most-significant 48 fraction bits of the quadruple-precision
2761 | floating-point value `a'.
2762 *----------------------------------------------------------------------------*/
2763
2764 static inline uint64_t extractFloat128Frac0( float128 a )
2765 {
2766
2767     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2768
2769 }
2770
2771 /*----------------------------------------------------------------------------
2772 | Returns the exponent bits of the quadruple-precision floating-point value
2773 | `a'.
2774 *----------------------------------------------------------------------------*/
2775
2776 static inline int32_t extractFloat128Exp( float128 a )
2777 {
2778
2779     return ( a.high>>48 ) & 0x7FFF;
2780
2781 }
2782
2783 /*----------------------------------------------------------------------------
2784 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2785 *----------------------------------------------------------------------------*/
2786
2787 static inline flag extractFloat128Sign( float128 a )
2788 {
2789
2790     return a.high>>63;
2791
2792 }
2793
2794 /*----------------------------------------------------------------------------
2795 | Normalizes the subnormal quadruple-precision floating-point value
2796 | represented by the denormalized significand formed by the concatenation of
2797 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2798 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2799 | significand are stored at the location pointed to by `zSig0Ptr', and the
2800 | least significant 64 bits of the normalized significand are stored at the
2801 | location pointed to by `zSig1Ptr'.
2802 *----------------------------------------------------------------------------*/
2803
2804 static void
2805  normalizeFloat128Subnormal(
2806      uint64_t aSig0,
2807      uint64_t aSig1,
2808      int32_t *zExpPtr,
2809      uint64_t *zSig0Ptr,
2810      uint64_t *zSig1Ptr
2811  )
2812 {
2813     int8_t shiftCount;
2814
2815     if ( aSig0 == 0 ) {
2816         shiftCount = countLeadingZeros64( aSig1 ) - 15;
2817         if ( shiftCount < 0 ) {
2818             *zSig0Ptr = aSig1>>( - shiftCount );
2819             *zSig1Ptr = aSig1<<( shiftCount & 63 );
2820         }
2821         else {
2822             *zSig0Ptr = aSig1<<shiftCount;
2823             *zSig1Ptr = 0;
2824         }
2825         *zExpPtr = - shiftCount - 63;
2826     }
2827     else {
2828         shiftCount = countLeadingZeros64( aSig0 ) - 15;
2829         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2830         *zExpPtr = 1 - shiftCount;
2831     }
2832
2833 }
2834
2835 /*----------------------------------------------------------------------------
2836 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2837 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2838 | floating-point value, returning the result.  After being shifted into the
2839 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2840 | added together to form the most significant 32 bits of the result.  This
2841 | means that any integer portion of `zSig0' will be added into the exponent.
2842 | Since a properly normalized significand will have an integer portion equal
2843 | to 1, the `zExp' input should be 1 less than the desired result exponent
2844 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2845 | significand.
2846 *----------------------------------------------------------------------------*/
2847
2848 static inline float128
2849  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2850 {
2851     float128 z;
2852
2853     z.low = zSig1;
2854     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2855     return z;
2856
2857 }
2858
2859 /*----------------------------------------------------------------------------
2860 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2861 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2862 | and `zSig2', and returns the proper quadruple-precision floating-point value
2863 | corresponding to the abstract input.  Ordinarily, the abstract value is
2864 | simply rounded and packed into the quadruple-precision format, with the
2865 | inexact exception raised if the abstract input cannot be represented
2866 | exactly.  However, if the abstract value is too large, the overflow and
2867 | inexact exceptions are raised and an infinity or maximal finite value is
2868 | returned.  If the abstract value is too small, the input value is rounded to
2869 | a subnormal number, and the underflow and inexact exceptions are raised if
2870 | the abstract input cannot be represented exactly as a subnormal quadruple-
2871 | precision floating-point number.
2872 |     The input significand must be normalized or smaller.  If the input
2873 | significand is not normalized, `zExp' must be 0; in that case, the result
2874 | returned is a subnormal number, and it must not require rounding.  In the
2875 | usual case that the input significand is normalized, `zExp' must be 1 less
2876 | than the ``true'' floating-point exponent.  The handling of underflow and
2877 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2878 *----------------------------------------------------------------------------*/
2879
2880 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2881                                      uint64_t zSig0, uint64_t zSig1,
2882                                      uint64_t zSig2, float_status *status)
2883 {
2884     int8_t roundingMode;
2885     flag roundNearestEven, increment, isTiny;
2886
2887     roundingMode = status->float_rounding_mode;
2888     roundNearestEven = ( roundingMode == float_round_nearest_even );
2889     switch (roundingMode) {
2890     case float_round_nearest_even:
2891     case float_round_ties_away:
2892         increment = ((int64_t)zSig2 < 0);
2893         break;
2894     case float_round_to_zero:
2895         increment = 0;
2896         break;
2897     case float_round_up:
2898         increment = !zSign && zSig2;
2899         break;
2900     case float_round_down:
2901         increment = zSign && zSig2;
2902         break;
2903     case float_round_to_odd:
2904         increment = !(zSig1 & 0x1) && zSig2;
2905         break;
2906     default:
2907         abort();
2908     }
2909     if ( 0x7FFD <= (uint32_t) zExp ) {
2910         if (    ( 0x7FFD < zExp )
2911              || (    ( zExp == 0x7FFD )
2912                   && eq128(
2913                          LIT64( 0x0001FFFFFFFFFFFF ),
2914                          LIT64( 0xFFFFFFFFFFFFFFFF ),
2915                          zSig0,
2916                          zSig1
2917                      )
2918                   && increment
2919                 )
2920            ) {
2921             float_raise(float_flag_overflow | float_flag_inexact, status);
2922             if (    ( roundingMode == float_round_to_zero )
2923                  || ( zSign && ( roundingMode == float_round_up ) )
2924                  || ( ! zSign && ( roundingMode == float_round_down ) )
2925                  || (roundingMode == float_round_to_odd)
2926                ) {
2927                 return
2928                     packFloat128(
2929                         zSign,
2930                         0x7FFE,
2931                         LIT64( 0x0000FFFFFFFFFFFF ),
2932                         LIT64( 0xFFFFFFFFFFFFFFFF )
2933                     );
2934             }
2935             return packFloat128( zSign, 0x7FFF, 0, 0 );
2936         }
2937         if ( zExp < 0 ) {
2938             if (status->flush_to_zero) {
2939                 float_raise(float_flag_output_denormal, status);
2940                 return packFloat128(zSign, 0, 0, 0);
2941             }
2942             isTiny =
2943                    (status->float_detect_tininess
2944                     == float_tininess_before_rounding)
2945                 || ( zExp < -1 )
2946                 || ! increment
2947                 || lt128(
2948                        zSig0,
2949                        zSig1,
2950                        LIT64( 0x0001FFFFFFFFFFFF ),
2951                        LIT64( 0xFFFFFFFFFFFFFFFF )
2952                    );
2953             shift128ExtraRightJamming(
2954                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2955             zExp = 0;
2956             if (isTiny && zSig2) {
2957                 float_raise(float_flag_underflow, status);
2958             }
2959             switch (roundingMode) {
2960             case float_round_nearest_even:
2961             case float_round_ties_away:
2962                 increment = ((int64_t)zSig2 < 0);
2963                 break;
2964             case float_round_to_zero:
2965                 increment = 0;
2966                 break;
2967             case float_round_up:
2968                 increment = !zSign && zSig2;
2969                 break;
2970             case float_round_down:
2971                 increment = zSign && zSig2;
2972                 break;
2973             case float_round_to_odd:
2974                 increment = !(zSig1 & 0x1) && zSig2;
2975                 break;
2976             default:
2977                 abort();
2978             }
2979         }
2980     }
2981     if (zSig2) {
2982         status->float_exception_flags |= float_flag_inexact;
2983     }
2984     if ( increment ) {
2985         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2986         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2987     }
2988     else {
2989         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2990     }
2991     return packFloat128( zSign, zExp, zSig0, zSig1 );
2992
2993 }
2994
2995 /*----------------------------------------------------------------------------
2996 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2997 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2998 | returns the proper quadruple-precision floating-point value corresponding
2999 | to the abstract input.  This routine is just like `roundAndPackFloat128'
3000 | except that the input significand has fewer bits and does not have to be
3001 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3002 | point exponent.
3003 *----------------------------------------------------------------------------*/
3004
3005 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3006                                               uint64_t zSig0, uint64_t zSig1,
3007                                               float_status *status)
3008 {
3009     int8_t shiftCount;
3010     uint64_t zSig2;
3011
3012     if ( zSig0 == 0 ) {
3013         zSig0 = zSig1;
3014         zSig1 = 0;
3015         zExp -= 64;
3016     }
3017     shiftCount = countLeadingZeros64( zSig0 ) - 15;
3018     if ( 0 <= shiftCount ) {
3019         zSig2 = 0;
3020         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3021     }
3022     else {
3023         shift128ExtraRightJamming(
3024             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3025     }
3026     zExp -= shiftCount;
3027     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3028
3029 }
3030
3031
3032 /*----------------------------------------------------------------------------
3033 | Returns the result of converting the 32-bit two's complement integer `a'
3034 | to the extended double-precision floating-point format.  The conversion
3035 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3036 | Arithmetic.
3037 *----------------------------------------------------------------------------*/
3038
3039 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3040 {
3041     flag zSign;
3042     uint32_t absA;
3043     int8_t shiftCount;
3044     uint64_t zSig;
3045
3046     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3047     zSign = ( a < 0 );
3048     absA = zSign ? - a : a;
3049     shiftCount = countLeadingZeros32( absA ) + 32;
3050     zSig = absA;
3051     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3052
3053 }
3054
3055 /*----------------------------------------------------------------------------
3056 | Returns the result of converting the 32-bit two's complement integer `a' to
3057 | the quadruple-precision floating-point format.  The conversion is performed
3058 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3059 *----------------------------------------------------------------------------*/
3060
3061 float128 int32_to_float128(int32_t a, float_status *status)
3062 {
3063     flag zSign;
3064     uint32_t absA;
3065     int8_t shiftCount;
3066     uint64_t zSig0;
3067
3068     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3069     zSign = ( a < 0 );
3070     absA = zSign ? - a : a;
3071     shiftCount = countLeadingZeros32( absA ) + 17;
3072     zSig0 = absA;
3073     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3074
3075 }
3076
3077 /*----------------------------------------------------------------------------
3078 | Returns the result of converting the 64-bit two's complement integer `a'
3079 | to the extended double-precision floating-point format.  The conversion
3080 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3081 | Arithmetic.
3082 *----------------------------------------------------------------------------*/
3083
3084 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3085 {
3086     flag zSign;
3087     uint64_t absA;
3088     int8_t shiftCount;
3089
3090     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3091     zSign = ( a < 0 );
3092     absA = zSign ? - a : a;
3093     shiftCount = countLeadingZeros64( absA );
3094     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3095
3096 }
3097
3098 /*----------------------------------------------------------------------------
3099 | Returns the result of converting the 64-bit two's complement integer `a' to
3100 | the quadruple-precision floating-point format.  The conversion is performed
3101 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3102 *----------------------------------------------------------------------------*/
3103
3104 float128 int64_to_float128(int64_t a, float_status *status)
3105 {
3106     flag zSign;
3107     uint64_t absA;
3108     int8_t shiftCount;
3109     int32_t zExp;
3110     uint64_t zSig0, zSig1;
3111
3112     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3113     zSign = ( a < 0 );
3114     absA = zSign ? - a : a;
3115     shiftCount = countLeadingZeros64( absA ) + 49;
3116     zExp = 0x406E - shiftCount;
3117     if ( 64 <= shiftCount ) {
3118         zSig1 = 0;
3119         zSig0 = absA;
3120         shiftCount -= 64;
3121     }
3122     else {
3123         zSig1 = absA;
3124         zSig0 = 0;
3125     }
3126     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3127     return packFloat128( zSign, zExp, zSig0, zSig1 );
3128
3129 }
3130
3131 /*----------------------------------------------------------------------------
3132 | Returns the result of converting the 64-bit unsigned integer `a'
3133 | to the quadruple-precision floating-point format.  The conversion is performed
3134 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3135 *----------------------------------------------------------------------------*/
3136
3137 float128 uint64_to_float128(uint64_t a, float_status *status)
3138 {
3139     if (a == 0) {
3140         return float128_zero;
3141     }
3142     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3143 }
3144
3145
3146
3147
3148 /*----------------------------------------------------------------------------
3149 | Returns the result of converting the single-precision floating-point value
3150 | `a' to the double-precision floating-point format.  The conversion is
3151 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3152 | Arithmetic.
3153 *----------------------------------------------------------------------------*/
3154
3155 float64 float32_to_float64(float32 a, float_status *status)
3156 {
3157     flag aSign;
3158     int aExp;
3159     uint32_t aSig;
3160     a = float32_squash_input_denormal(a, status);
3161
3162     aSig = extractFloat32Frac( a );
3163     aExp = extractFloat32Exp( a );
3164     aSign = extractFloat32Sign( a );
3165     if ( aExp == 0xFF ) {
3166         if (aSig) {
3167             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3168         }
3169         return packFloat64( aSign, 0x7FF, 0 );
3170     }
3171     if ( aExp == 0 ) {
3172         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3173         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3174         --aExp;
3175     }
3176     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3177
3178 }
3179
3180 /*----------------------------------------------------------------------------
3181 | Returns the result of converting the single-precision floating-point value
3182 | `a' to the extended double-precision floating-point format.  The conversion
3183 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3184 | Arithmetic.
3185 *----------------------------------------------------------------------------*/
3186
3187 floatx80 float32_to_floatx80(float32 a, float_status *status)
3188 {
3189     flag aSign;
3190     int aExp;
3191     uint32_t aSig;
3192
3193     a = float32_squash_input_denormal(a, status);
3194     aSig = extractFloat32Frac( a );
3195     aExp = extractFloat32Exp( a );
3196     aSign = extractFloat32Sign( a );
3197     if ( aExp == 0xFF ) {
3198         if (aSig) {
3199             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3200         }
3201         return packFloatx80(aSign,
3202                             floatx80_infinity_high,
3203                             floatx80_infinity_low);
3204     }
3205     if ( aExp == 0 ) {
3206         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3207         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3208     }
3209     aSig |= 0x00800000;
3210     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3211
3212 }
3213
3214 /*----------------------------------------------------------------------------
3215 | Returns the result of converting the single-precision floating-point value
3216 | `a' to the double-precision floating-point format.  The conversion is
3217 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3218 | Arithmetic.
3219 *----------------------------------------------------------------------------*/
3220
3221 float128 float32_to_float128(float32 a, float_status *status)
3222 {
3223     flag aSign;
3224     int aExp;
3225     uint32_t aSig;
3226
3227     a = float32_squash_input_denormal(a, status);
3228     aSig = extractFloat32Frac( a );
3229     aExp = extractFloat32Exp( a );
3230     aSign = extractFloat32Sign( a );
3231     if ( aExp == 0xFF ) {
3232         if (aSig) {
3233             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3234         }
3235         return packFloat128( aSign, 0x7FFF, 0, 0 );
3236     }
3237     if ( aExp == 0 ) {
3238         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3239         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3240         --aExp;
3241     }
3242     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3243
3244 }
3245
3246 /*----------------------------------------------------------------------------
3247 | Returns the remainder of the single-precision floating-point value `a'
3248 | with respect to the corresponding value `b'.  The operation is performed
3249 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3250 *----------------------------------------------------------------------------*/
3251
3252 float32 float32_rem(float32 a, float32 b, float_status *status)
3253 {
3254     flag aSign, zSign;
3255     int aExp, bExp, expDiff;
3256     uint32_t aSig, bSig;
3257     uint32_t q;
3258     uint64_t aSig64, bSig64, q64;
3259     uint32_t alternateASig;
3260     int32_t sigMean;
3261     a = float32_squash_input_denormal(a, status);
3262     b = float32_squash_input_denormal(b, status);
3263
3264     aSig = extractFloat32Frac( a );
3265     aExp = extractFloat32Exp( a );
3266     aSign = extractFloat32Sign( a );
3267     bSig = extractFloat32Frac( b );
3268     bExp = extractFloat32Exp( b );
3269     if ( aExp == 0xFF ) {
3270         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3271             return propagateFloat32NaN(a, b, status);
3272         }
3273         float_raise(float_flag_invalid, status);
3274         return float32_default_nan(status);
3275     }
3276     if ( bExp == 0xFF ) {
3277         if (bSig) {
3278             return propagateFloat32NaN(a, b, status);
3279         }
3280         return a;
3281     }
3282     if ( bExp == 0 ) {
3283         if ( bSig == 0 ) {
3284             float_raise(float_flag_invalid, status);
3285             return float32_default_nan(status);
3286         }
3287         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3288     }
3289     if ( aExp == 0 ) {
3290         if ( aSig == 0 ) return a;
3291         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3292     }
3293     expDiff = aExp - bExp;
3294     aSig |= 0x00800000;
3295     bSig |= 0x00800000;
3296     if ( expDiff < 32 ) {
3297         aSig <<= 8;
3298         bSig <<= 8;
3299         if ( expDiff < 0 ) {
3300             if ( expDiff < -1 ) return a;
3301             aSig >>= 1;
3302         }
3303         q = ( bSig <= aSig );
3304         if ( q ) aSig -= bSig;
3305         if ( 0 < expDiff ) {
3306             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3307             q >>= 32 - expDiff;
3308             bSig >>= 2;
3309             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3310         }
3311         else {
3312             aSig >>= 2;
3313             bSig >>= 2;
3314         }
3315     }
3316     else {
3317         if ( bSig <= aSig ) aSig -= bSig;
3318         aSig64 = ( (uint64_t) aSig )<<40;
3319         bSig64 = ( (uint64_t) bSig )<<40;
3320         expDiff -= 64;
3321         while ( 0 < expDiff ) {
3322             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3323             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3324             aSig64 = - ( ( bSig * q64 )<<38 );
3325             expDiff -= 62;
3326         }
3327         expDiff += 64;
3328         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3329         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3330         q = q64>>( 64 - expDiff );
3331         bSig <<= 6;
3332         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3333     }
3334     do {
3335         alternateASig = aSig;
3336         ++q;
3337         aSig -= bSig;
3338     } while ( 0 <= (int32_t) aSig );
3339     sigMean = aSig + alternateASig;
3340     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3341         aSig = alternateASig;
3342     }
3343     zSign = ( (int32_t) aSig < 0 );
3344     if ( zSign ) aSig = - aSig;
3345     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3346 }
3347
3348
3349
3350 /*----------------------------------------------------------------------------
3351 | Returns the binary exponential of the single-precision floating-point value
3352 | `a'. The operation is performed according to the IEC/IEEE Standard for
3353 | Binary Floating-Point Arithmetic.
3354 |
3355 | Uses the following identities:
3356 |
3357 | 1. -------------------------------------------------------------------------
3358 |      x    x*ln(2)
3359 |     2  = e
3360 |
3361 | 2. -------------------------------------------------------------------------
3362 |                      2     3     4     5           n
3363 |      x        x     x     x     x     x           x
3364 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3365 |               1!    2!    3!    4!    5!          n!
3366 *----------------------------------------------------------------------------*/
3367
3368 static const float64 float32_exp2_coefficients[15] =
3369 {
3370     const_float64( 0x3ff0000000000000ll ), /*  1 */
3371     const_float64( 0x3fe0000000000000ll ), /*  2 */
3372     const_float64( 0x3fc5555555555555ll ), /*  3 */
3373     const_float64( 0x3fa5555555555555ll ), /*  4 */
3374     const_float64( 0x3f81111111111111ll ), /*  5 */
3375     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3376     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3377     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3378     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3379     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3380     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3381     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3382     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3383     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3384     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3385 };
3386
3387 float32 float32_exp2(float32 a, float_status *status)
3388 {
3389     flag aSign;
3390     int aExp;
3391     uint32_t aSig;
3392     float64 r, x, xn;
3393     int i;
3394     a = float32_squash_input_denormal(a, status);
3395
3396     aSig = extractFloat32Frac( a );
3397     aExp = extractFloat32Exp( a );
3398     aSign = extractFloat32Sign( a );
3399
3400     if ( aExp == 0xFF) {
3401         if (aSig) {
3402             return propagateFloat32NaN(a, float32_zero, status);
3403         }
3404         return (aSign) ? float32_zero : a;
3405     }
3406     if (aExp == 0) {
3407         if (aSig == 0) return float32_one;
3408     }
3409
3410     float_raise(float_flag_inexact, status);
3411
3412     /* ******************************* */
3413     /* using float64 for approximation */
3414     /* ******************************* */
3415     x = float32_to_float64(a, status);
3416     x = float64_mul(x, float64_ln2, status);
3417
3418     xn = x;
3419     r = float64_one;
3420     for (i = 0 ; i < 15 ; i++) {
3421         float64 f;
3422
3423         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3424         r = float64_add(r, f, status);
3425
3426         xn = float64_mul(xn, x, status);
3427     }
3428
3429     return float64_to_float32(r, status);
3430 }
3431
3432 /*----------------------------------------------------------------------------
3433 | Returns the binary log of the single-precision floating-point value `a'.
3434 | The operation is performed according to the IEC/IEEE Standard for Binary
3435 | Floating-Point Arithmetic.
3436 *----------------------------------------------------------------------------*/
3437 float32 float32_log2(float32 a, float_status *status)
3438 {
3439     flag aSign, zSign;
3440     int aExp;
3441     uint32_t aSig, zSig, i;
3442
3443     a = float32_squash_input_denormal(a, status);
3444     aSig = extractFloat32Frac( a );
3445     aExp = extractFloat32Exp( a );
3446     aSign = extractFloat32Sign( a );
3447
3448     if ( aExp == 0 ) {
3449         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3450         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3451     }
3452     if ( aSign ) {
3453         float_raise(float_flag_invalid, status);
3454         return float32_default_nan(status);
3455     }
3456     if ( aExp == 0xFF ) {
3457         if (aSig) {
3458             return propagateFloat32NaN(a, float32_zero, status);
3459         }
3460         return a;
3461     }
3462
3463     aExp -= 0x7F;
3464     aSig |= 0x00800000;
3465     zSign = aExp < 0;
3466     zSig = aExp << 23;
3467
3468     for (i = 1 << 22; i > 0; i >>= 1) {
3469         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3470         if ( aSig & 0x01000000 ) {
3471             aSig >>= 1;
3472             zSig |= i;
3473         }
3474     }
3475
3476     if ( zSign )
3477         zSig = -zSig;
3478
3479     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3480 }
3481
3482 /*----------------------------------------------------------------------------
3483 | Returns 1 if the single-precision floating-point value `a' is equal to
3484 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3485 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3486 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3487 *----------------------------------------------------------------------------*/
3488
3489 int float32_eq(float32 a, float32 b, float_status *status)
3490 {
3491     uint32_t av, bv;
3492     a = float32_squash_input_denormal(a, status);
3493     b = float32_squash_input_denormal(b, status);
3494
3495     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3496          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3497        ) {
3498         float_raise(float_flag_invalid, status);
3499         return 0;
3500     }
3501     av = float32_val(a);
3502     bv = float32_val(b);
3503     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3504 }
3505
3506 /*----------------------------------------------------------------------------
3507 | Returns 1 if the single-precision floating-point value `a' is less than
3508 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3509 | exception is raised if either operand is a NaN.  The comparison is performed
3510 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3511 *----------------------------------------------------------------------------*/
3512
3513 int float32_le(float32 a, float32 b, float_status *status)
3514 {
3515     flag aSign, bSign;
3516     uint32_t av, bv;
3517     a = float32_squash_input_denormal(a, status);
3518     b = float32_squash_input_denormal(b, status);
3519
3520     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3521          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3522        ) {
3523         float_raise(float_flag_invalid, status);
3524         return 0;
3525     }
3526     aSign = extractFloat32Sign( a );
3527     bSign = extractFloat32Sign( b );
3528     av = float32_val(a);
3529     bv = float32_val(b);
3530     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3531     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3532
3533 }
3534
3535 /*----------------------------------------------------------------------------
3536 | Returns 1 if the single-precision floating-point value `a' is less than
3537 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3538 | raised if either operand is a NaN.  The comparison is performed according
3539 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3540 *----------------------------------------------------------------------------*/
3541
3542 int float32_lt(float32 a, float32 b, float_status *status)
3543 {
3544     flag aSign, bSign;
3545     uint32_t av, bv;
3546     a = float32_squash_input_denormal(a, status);
3547     b = float32_squash_input_denormal(b, status);
3548
3549     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3550          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3551        ) {
3552         float_raise(float_flag_invalid, status);
3553         return 0;
3554     }
3555     aSign = extractFloat32Sign( a );
3556     bSign = extractFloat32Sign( b );
3557     av = float32_val(a);
3558     bv = float32_val(b);
3559     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3560     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3561
3562 }
3563
3564 /*----------------------------------------------------------------------------
3565 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3566 | be compared, and 0 otherwise.  The invalid exception is raised if either
3567 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3568 | Standard for Binary Floating-Point Arithmetic.
3569 *----------------------------------------------------------------------------*/
3570
3571 int float32_unordered(float32 a, float32 b, float_status *status)
3572 {
3573     a = float32_squash_input_denormal(a, status);
3574     b = float32_squash_input_denormal(b, status);
3575
3576     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3577          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3578        ) {
3579         float_raise(float_flag_invalid, status);
3580         return 1;
3581     }
3582     return 0;
3583 }
3584
3585 /*----------------------------------------------------------------------------
3586 | Returns 1 if the single-precision floating-point value `a' is equal to
3587 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3588 | exception.  The comparison is performed according to the IEC/IEEE Standard
3589 | for Binary Floating-Point Arithmetic.
3590 *----------------------------------------------------------------------------*/
3591
3592 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3593 {
3594     a = float32_squash_input_denormal(a, status);
3595     b = float32_squash_input_denormal(b, status);
3596
3597     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3598          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3599        ) {
3600         if (float32_is_signaling_nan(a, status)
3601          || float32_is_signaling_nan(b, status)) {
3602             float_raise(float_flag_invalid, status);
3603         }
3604         return 0;
3605     }
3606     return ( float32_val(a) == float32_val(b) ) ||
3607             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3608 }
3609
3610 /*----------------------------------------------------------------------------
3611 | Returns 1 if the single-precision floating-point value `a' is less than or
3612 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3613 | cause an exception.  Otherwise, the comparison is performed according to the
3614 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3615 *----------------------------------------------------------------------------*/
3616
3617 int float32_le_quiet(float32 a, float32 b, float_status *status)
3618 {
3619     flag aSign, bSign;
3620     uint32_t av, bv;
3621     a = float32_squash_input_denormal(a, status);
3622     b = float32_squash_input_denormal(b, status);
3623
3624     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3625          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3626        ) {
3627         if (float32_is_signaling_nan(a, status)
3628          || float32_is_signaling_nan(b, status)) {
3629             float_raise(float_flag_invalid, status);
3630         }
3631         return 0;
3632     }
3633     aSign = extractFloat32Sign( a );
3634     bSign = extractFloat32Sign( b );
3635     av = float32_val(a);
3636     bv = float32_val(b);
3637     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3638     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3639
3640 }
3641
3642 /*----------------------------------------------------------------------------
3643 | Returns 1 if the single-precision floating-point value `a' is less than
3644 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3645 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3646 | Standard for Binary Floating-Point Arithmetic.
3647 *----------------------------------------------------------------------------*/
3648
3649 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3650 {
3651     flag aSign, bSign;
3652     uint32_t av, bv;
3653     a = float32_squash_input_denormal(a, status);
3654     b = float32_squash_input_denormal(b, status);
3655
3656     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3657          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3658        ) {
3659         if (float32_is_signaling_nan(a, status)
3660          || float32_is_signaling_nan(b, status)) {
3661             float_raise(float_flag_invalid, status);
3662         }
3663         return 0;
3664     }
3665     aSign = extractFloat32Sign( a );
3666     bSign = extractFloat32Sign( b );
3667     av = float32_val(a);
3668     bv = float32_val(b);
3669     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3670     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3671
3672 }
3673
3674 /*----------------------------------------------------------------------------
3675 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3676 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3677 | comparison is performed according to the IEC/IEEE Standard for Binary
3678 | Floating-Point Arithmetic.
3679 *----------------------------------------------------------------------------*/
3680
3681 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3682 {
3683     a = float32_squash_input_denormal(a, status);
3684     b = float32_squash_input_denormal(b, status);
3685
3686     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3687          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3688        ) {
3689         if (float32_is_signaling_nan(a, status)
3690          || float32_is_signaling_nan(b, status)) {
3691             float_raise(float_flag_invalid, status);
3692         }
3693         return 1;
3694     }
3695     return 0;
3696 }
3697
3698
3699 /*----------------------------------------------------------------------------
3700 | Returns the result of converting the double-precision floating-point value
3701 | `a' to the single-precision floating-point format.  The conversion is
3702 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3703 | Arithmetic.
3704 *----------------------------------------------------------------------------*/
3705
3706 float32 float64_to_float32(float64 a, float_status *status)
3707 {
3708     flag aSign;
3709     int aExp;
3710     uint64_t aSig;
3711     uint32_t zSig;
3712     a = float64_squash_input_denormal(a, status);
3713
3714     aSig = extractFloat64Frac( a );
3715     aExp = extractFloat64Exp( a );
3716     aSign = extractFloat64Sign( a );
3717     if ( aExp == 0x7FF ) {
3718         if (aSig) {
3719             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3720         }
3721         return packFloat32( aSign, 0xFF, 0 );
3722     }
3723     shift64RightJamming( aSig, 22, &aSig );
3724     zSig = aSig;
3725     if ( aExp || zSig ) {
3726         zSig |= 0x40000000;
3727         aExp -= 0x381;
3728     }
3729     return roundAndPackFloat32(aSign, aExp, zSig, status);
3730
3731 }
3732
3733
3734 /*----------------------------------------------------------------------------
3735 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3736 | half-precision floating-point value, returning the result.  After being
3737 | shifted into the proper positions, the three fields are simply added
3738 | together to form the result.  This means that any integer portion of `zSig'
3739 | will be added into the exponent.  Since a properly normalized significand
3740 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3741 | than the desired result exponent whenever `zSig' is a complete, normalized
3742 | significand.
3743 *----------------------------------------------------------------------------*/
3744 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3745 {
3746     return make_float16(
3747         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3748 }
3749
3750 /*----------------------------------------------------------------------------
3751 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3752 | and significand `zSig', and returns the proper half-precision floating-
3753 | point value corresponding to the abstract input.  Ordinarily, the abstract
3754 | value is simply rounded and packed into the half-precision format, with
3755 | the inexact exception raised if the abstract input cannot be represented
3756 | exactly.  However, if the abstract value is too large, the overflow and
3757 | inexact exceptions are raised and an infinity or maximal finite value is
3758 | returned.  If the abstract value is too small, the input value is rounded to
3759 | a subnormal number, and the underflow and inexact exceptions are raised if
3760 | the abstract input cannot be represented exactly as a subnormal half-
3761 | precision floating-point number.
3762 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3763 | ARM-style "alternative representation", which omits the NaN and Inf
3764 | encodings in order to raise the maximum representable exponent by one.
3765 |     The input significand `zSig' has its binary point between bits 22
3766 | and 23, which is 13 bits to the left of the usual location.  This shifted
3767 | significand must be normalized or smaller.  If `zSig' is not normalized,
3768 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3769 | and it must not require rounding.  In the usual case that `zSig' is
3770 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3771 | Note the slightly odd position of the binary point in zSig compared with the
3772 | other roundAndPackFloat functions. This should probably be fixed if we
3773 | need to implement more float16 routines than just conversion.
3774 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3775 | Binary Floating-Point Arithmetic.
3776 *----------------------------------------------------------------------------*/
3777
3778 static float16 roundAndPackFloat16(flag zSign, int zExp,
3779                                    uint32_t zSig, flag ieee,
3780                                    float_status *status)
3781 {
3782     int maxexp = ieee ? 29 : 30;
3783     uint32_t mask;
3784     uint32_t increment;
3785     bool rounding_bumps_exp;
3786     bool is_tiny = false;
3787
3788     /* Calculate the mask of bits of the mantissa which are not
3789      * representable in half-precision and will be lost.
3790      */
3791     if (zExp < 1) {
3792         /* Will be denormal in halfprec */
3793         mask = 0x00ffffff;
3794         if (zExp >= -11) {
3795             mask >>= 11 + zExp;
3796         }
3797     } else {
3798         /* Normal number in halfprec */
3799         mask = 0x00001fff;
3800     }
3801
3802     switch (status->float_rounding_mode) {
3803     case float_round_nearest_even:
3804         increment = (mask + 1) >> 1;
3805         if ((zSig & mask) == increment) {
3806             increment = zSig & (increment << 1);
3807         }
3808         break;
3809     case float_round_ties_away:
3810         increment = (mask + 1) >> 1;
3811         break;
3812     case float_round_up:
3813         increment = zSign ? 0 : mask;
3814         break;
3815     case float_round_down:
3816         increment = zSign ? mask : 0;
3817         break;
3818     default: /* round_to_zero */
3819         increment = 0;
3820         break;
3821     }
3822
3823     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3824
3825     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3826         if (ieee) {
3827             float_raise(float_flag_overflow | float_flag_inexact, status);
3828             return packFloat16(zSign, 0x1f, 0);
3829         } else {
3830             float_raise(float_flag_invalid, status);
3831             return packFloat16(zSign, 0x1f, 0x3ff);
3832         }
3833     }
3834
3835     if (zExp < 0) {
3836         /* Note that flush-to-zero does not affect half-precision results */
3837         is_tiny =
3838             (status->float_detect_tininess == float_tininess_before_rounding)
3839             || (zExp < -1)
3840             || (!rounding_bumps_exp);
3841     }
3842     if (zSig & mask) {
3843         float_raise(float_flag_inexact, status);
3844         if (is_tiny) {
3845             float_raise(float_flag_underflow, status);
3846         }
3847     }
3848
3849     zSig += increment;
3850     if (rounding_bumps_exp) {
3851         zSig >>= 1;
3852         zExp++;
3853     }
3854
3855     if (zExp < -10) {
3856         return packFloat16(zSign, 0, 0);
3857     }
3858     if (zExp < 0) {
3859         zSig >>= -zExp;
3860         zExp = 0;
3861     }
3862     return packFloat16(zSign, zExp, zSig >> 13);
3863 }
3864
3865 /*----------------------------------------------------------------------------
3866 | If `a' is denormal and we are in flush-to-zero mode then set the
3867 | input-denormal exception and return zero. Otherwise just return the value.
3868 *----------------------------------------------------------------------------*/
3869 float16 float16_squash_input_denormal(float16 a, float_status *status)
3870 {
3871     if (status->flush_inputs_to_zero) {
3872         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3873             float_raise(float_flag_input_denormal, status);
3874             return make_float16(float16_val(a) & 0x8000);
3875         }
3876     }
3877     return a;
3878 }
3879
3880 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3881                                       uint32_t *zSigPtr)
3882 {
3883     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3884     *zSigPtr = aSig << shiftCount;
3885     *zExpPtr = 1 - shiftCount;
3886 }
3887
3888 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3889    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3890
3891 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3892 {
3893     flag aSign;
3894     int aExp;
3895     uint32_t aSig;
3896
3897     aSign = extractFloat16Sign(a);
3898     aExp = extractFloat16Exp(a);
3899     aSig = extractFloat16Frac(a);
3900
3901     if (aExp == 0x1f && ieee) {
3902         if (aSig) {
3903             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3904         }
3905         return packFloat32(aSign, 0xff, 0);
3906     }
3907     if (aExp == 0) {
3908         if (aSig == 0) {
3909             return packFloat32(aSign, 0, 0);
3910         }
3911
3912         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3913         aExp--;
3914     }
3915     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3916 }
3917
3918 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3919 {
3920     flag aSign;
3921     int aExp;
3922     uint32_t aSig;
3923
3924     a = float32_squash_input_denormal(a, status);
3925
3926     aSig = extractFloat32Frac( a );
3927     aExp = extractFloat32Exp( a );
3928     aSign = extractFloat32Sign( a );
3929     if ( aExp == 0xFF ) {
3930         if (aSig) {
3931             /* Input is a NaN */
3932             if (!ieee) {
3933                 float_raise(float_flag_invalid, status);
3934                 return packFloat16(aSign, 0, 0);
3935             }
3936             return commonNaNToFloat16(
3937                 float32ToCommonNaN(a, status), status);
3938         }
3939         /* Infinity */
3940         if (!ieee) {
3941             float_raise(float_flag_invalid, status);
3942             return packFloat16(aSign, 0x1f, 0x3ff);
3943         }
3944         return packFloat16(aSign, 0x1f, 0);
3945     }
3946     if (aExp == 0 && aSig == 0) {
3947         return packFloat16(aSign, 0, 0);
3948     }
3949     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3950      * even if the input is denormal; however this is harmless because
3951      * the largest possible single-precision denormal is still smaller
3952      * than the smallest representable half-precision denormal, and so we
3953      * will end up ignoring aSig and returning via the "always return zero"
3954      * codepath.
3955      */
3956     aSig |= 0x00800000;
3957     aExp -= 0x71;
3958
3959     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3960 }
3961
3962 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3963 {
3964     flag aSign;
3965     int aExp;
3966     uint32_t aSig;
3967
3968     aSign = extractFloat16Sign(a);
3969     aExp = extractFloat16Exp(a);
3970     aSig = extractFloat16Frac(a);
3971
3972     if (aExp == 0x1f && ieee) {
3973         if (aSig) {
3974             return commonNaNToFloat64(
3975                 float16ToCommonNaN(a, status), status);
3976         }
3977         return packFloat64(aSign, 0x7ff, 0);
3978     }
3979     if (aExp == 0) {
3980         if (aSig == 0) {
3981             return packFloat64(aSign, 0, 0);
3982         }
3983
3984         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3985         aExp--;
3986     }
3987     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3988 }
3989
3990 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3991 {
3992     flag aSign;
3993     int aExp;
3994     uint64_t aSig;
3995     uint32_t zSig;
3996
3997     a = float64_squash_input_denormal(a, status);
3998
3999     aSig = extractFloat64Frac(a);
4000     aExp = extractFloat64Exp(a);
4001     aSign = extractFloat64Sign(a);
4002     if (aExp == 0x7FF) {
4003         if (aSig) {
4004             /* Input is a NaN */
4005             if (!ieee) {
4006                 float_raise(float_flag_invalid, status);
4007                 return packFloat16(aSign, 0, 0);
4008             }
4009             return commonNaNToFloat16(
4010                 float64ToCommonNaN(a, status), status);
4011         }
4012         /* Infinity */
4013         if (!ieee) {
4014             float_raise(float_flag_invalid, status);
4015             return packFloat16(aSign, 0x1f, 0x3ff);
4016         }
4017         return packFloat16(aSign, 0x1f, 0);
4018     }
4019     shift64RightJamming(aSig, 29, &aSig);
4020     zSig = aSig;
4021     if (aExp == 0 && zSig == 0) {
4022         return packFloat16(aSign, 0, 0);
4023     }
4024     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4025      * even if the input is denormal; however this is harmless because
4026      * the largest possible single-precision denormal is still smaller
4027      * than the smallest representable half-precision denormal, and so we
4028      * will end up ignoring aSig and returning via the "always return zero"
4029      * codepath.
4030      */
4031     zSig |= 0x00800000;
4032     aExp -= 0x3F1;
4033
4034     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4035 }
4036
4037 /*----------------------------------------------------------------------------
4038 | Returns the result of converting the double-precision floating-point value
4039 | `a' to the extended double-precision floating-point format.  The conversion
4040 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4041 | Arithmetic.
4042 *----------------------------------------------------------------------------*/
4043
4044 floatx80 float64_to_floatx80(float64 a, float_status *status)
4045 {
4046     flag aSign;
4047     int aExp;
4048     uint64_t aSig;
4049
4050     a = float64_squash_input_denormal(a, status);
4051     aSig = extractFloat64Frac( a );
4052     aExp = extractFloat64Exp( a );
4053     aSign = extractFloat64Sign( a );
4054     if ( aExp == 0x7FF ) {
4055         if (aSig) {
4056             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4057         }
4058         return packFloatx80(aSign,
4059                             floatx80_infinity_high,
4060                             floatx80_infinity_low);
4061     }
4062     if ( aExp == 0 ) {
4063         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4064         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4065     }
4066     return
4067         packFloatx80(
4068             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4069
4070 }
4071
4072 /*----------------------------------------------------------------------------
4073 | Returns the result of converting the double-precision floating-point value
4074 | `a' to the quadruple-precision floating-point format.  The conversion is
4075 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4076 | Arithmetic.
4077 *----------------------------------------------------------------------------*/
4078
4079 float128 float64_to_float128(float64 a, float_status *status)
4080 {
4081     flag aSign;
4082     int aExp;
4083     uint64_t aSig, zSig0, zSig1;
4084
4085     a = float64_squash_input_denormal(a, status);
4086     aSig = extractFloat64Frac( a );
4087     aExp = extractFloat64Exp( a );
4088     aSign = extractFloat64Sign( a );
4089     if ( aExp == 0x7FF ) {
4090         if (aSig) {
4091             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4092         }
4093         return packFloat128( aSign, 0x7FFF, 0, 0 );
4094     }
4095     if ( aExp == 0 ) {
4096         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4097         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4098         --aExp;
4099     }
4100     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4101     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4102
4103 }
4104
4105
4106 /*----------------------------------------------------------------------------
4107 | Returns the remainder of the double-precision floating-point value `a'
4108 | with respect to the corresponding value `b'.  The operation is performed
4109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4110 *----------------------------------------------------------------------------*/
4111
4112 float64 float64_rem(float64 a, float64 b, float_status *status)
4113 {
4114     flag aSign, zSign;
4115     int aExp, bExp, expDiff;
4116     uint64_t aSig, bSig;
4117     uint64_t q, alternateASig;
4118     int64_t sigMean;
4119
4120     a = float64_squash_input_denormal(a, status);
4121     b = float64_squash_input_denormal(b, status);
4122     aSig = extractFloat64Frac( a );
4123     aExp = extractFloat64Exp( a );
4124     aSign = extractFloat64Sign( a );
4125     bSig = extractFloat64Frac( b );
4126     bExp = extractFloat64Exp( b );
4127     if ( aExp == 0x7FF ) {
4128         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4129             return propagateFloat64NaN(a, b, status);
4130         }
4131         float_raise(float_flag_invalid, status);
4132         return float64_default_nan(status);
4133     }
4134     if ( bExp == 0x7FF ) {
4135         if (bSig) {
4136             return propagateFloat64NaN(a, b, status);
4137         }
4138         return a;
4139     }
4140     if ( bExp == 0 ) {
4141         if ( bSig == 0 ) {
4142             float_raise(float_flag_invalid, status);
4143             return float64_default_nan(status);
4144         }
4145         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4146     }
4147     if ( aExp == 0 ) {
4148         if ( aSig == 0 ) return a;
4149         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4150     }
4151     expDiff = aExp - bExp;
4152     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4153     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4154     if ( expDiff < 0 ) {
4155         if ( expDiff < -1 ) return a;
4156         aSig >>= 1;
4157     }
4158     q = ( bSig <= aSig );
4159     if ( q ) aSig -= bSig;
4160     expDiff -= 64;
4161     while ( 0 < expDiff ) {
4162         q = estimateDiv128To64( aSig, 0, bSig );
4163         q = ( 2 < q ) ? q - 2 : 0;
4164         aSig = - ( ( bSig>>2 ) * q );
4165         expDiff -= 62;
4166     }
4167     expDiff += 64;
4168     if ( 0 < expDiff ) {
4169         q = estimateDiv128To64( aSig, 0, bSig );
4170         q = ( 2 < q ) ? q - 2 : 0;
4171         q >>= 64 - expDiff;
4172         bSig >>= 2;
4173         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4174     }
4175     else {
4176         aSig >>= 2;
4177         bSig >>= 2;
4178     }
4179     do {
4180         alternateASig = aSig;
4181         ++q;
4182         aSig -= bSig;
4183     } while ( 0 <= (int64_t) aSig );
4184     sigMean = aSig + alternateASig;
4185     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4186         aSig = alternateASig;
4187     }
4188     zSign = ( (int64_t) aSig < 0 );
4189     if ( zSign ) aSig = - aSig;
4190     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4191
4192 }
4193
4194 /*----------------------------------------------------------------------------
4195 | Returns the binary log of the double-precision floating-point value `a'.
4196 | The operation is performed according to the IEC/IEEE Standard for Binary
4197 | Floating-Point Arithmetic.
4198 *----------------------------------------------------------------------------*/
4199 float64 float64_log2(float64 a, float_status *status)
4200 {
4201     flag aSign, zSign;
4202     int aExp;
4203     uint64_t aSig, aSig0, aSig1, zSig, i;
4204     a = float64_squash_input_denormal(a, status);
4205
4206     aSig = extractFloat64Frac( a );
4207     aExp = extractFloat64Exp( a );
4208     aSign = extractFloat64Sign( a );
4209
4210     if ( aExp == 0 ) {
4211         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4212         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4213     }
4214     if ( aSign ) {
4215         float_raise(float_flag_invalid, status);
4216         return float64_default_nan(status);
4217     }
4218     if ( aExp == 0x7FF ) {
4219         if (aSig) {
4220             return propagateFloat64NaN(a, float64_zero, status);
4221         }
4222         return a;
4223     }
4224
4225     aExp -= 0x3FF;
4226     aSig |= LIT64( 0x0010000000000000 );
4227     zSign = aExp < 0;
4228     zSig = (uint64_t)aExp << 52;
4229     for (i = 1LL << 51; i > 0; i >>= 1) {
4230         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4231         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4232         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4233             aSig >>= 1;
4234             zSig |= i;
4235         }
4236     }
4237
4238     if ( zSign )
4239         zSig = -zSig;
4240     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4241 }
4242
4243 /*----------------------------------------------------------------------------
4244 | Returns 1 if the double-precision floating-point value `a' is equal to the
4245 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4246 | if either operand is a NaN.  Otherwise, the comparison is performed
4247 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4248 *----------------------------------------------------------------------------*/
4249
4250 int float64_eq(float64 a, float64 b, float_status *status)
4251 {
4252     uint64_t av, bv;
4253     a = float64_squash_input_denormal(a, status);
4254     b = float64_squash_input_denormal(b, status);
4255
4256     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4257          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4258        ) {
4259         float_raise(float_flag_invalid, status);
4260         return 0;
4261     }
4262     av = float64_val(a);
4263     bv = float64_val(b);
4264     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4265
4266 }
4267
4268 /*----------------------------------------------------------------------------
4269 | Returns 1 if the double-precision floating-point value `a' is less than or
4270 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4271 | exception is raised if either operand is a NaN.  The comparison is performed
4272 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4273 *----------------------------------------------------------------------------*/
4274
4275 int float64_le(float64 a, float64 b, float_status *status)
4276 {
4277     flag aSign, bSign;
4278     uint64_t av, bv;
4279     a = float64_squash_input_denormal(a, status);
4280     b = float64_squash_input_denormal(b, status);
4281
4282     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4283          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4284        ) {
4285         float_raise(float_flag_invalid, status);
4286         return 0;
4287     }
4288     aSign = extractFloat64Sign( a );
4289     bSign = extractFloat64Sign( b );
4290     av = float64_val(a);
4291     bv = float64_val(b);
4292     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4293     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4294
4295 }
4296
4297 /*----------------------------------------------------------------------------
4298 | Returns 1 if the double-precision floating-point value `a' is less than
4299 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4300 | raised if either operand is a NaN.  The comparison is performed according
4301 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4302 *----------------------------------------------------------------------------*/
4303
4304 int float64_lt(float64 a, float64 b, float_status *status)
4305 {
4306     flag aSign, bSign;
4307     uint64_t av, bv;
4308
4309     a = float64_squash_input_denormal(a, status);
4310     b = float64_squash_input_denormal(b, status);
4311     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4312          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4313        ) {
4314         float_raise(float_flag_invalid, status);
4315         return 0;
4316     }
4317     aSign = extractFloat64Sign( a );
4318     bSign = extractFloat64Sign( b );
4319     av = float64_val(a);
4320     bv = float64_val(b);
4321     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4322     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4323
4324 }
4325
4326 /*----------------------------------------------------------------------------
4327 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4328 | be compared, and 0 otherwise.  The invalid exception is raised if either
4329 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4330 | Standard for Binary Floating-Point Arithmetic.
4331 *----------------------------------------------------------------------------*/
4332
4333 int float64_unordered(float64 a, float64 b, float_status *status)
4334 {
4335     a = float64_squash_input_denormal(a, status);
4336     b = float64_squash_input_denormal(b, status);
4337
4338     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4339          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4340        ) {
4341         float_raise(float_flag_invalid, status);
4342         return 1;
4343     }
4344     return 0;
4345 }
4346
4347 /*----------------------------------------------------------------------------
4348 | Returns 1 if the double-precision floating-point value `a' is equal to the
4349 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4350 | exception.The comparison is performed according to the IEC/IEEE Standard
4351 | for Binary Floating-Point Arithmetic.
4352 *----------------------------------------------------------------------------*/
4353
4354 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4355 {
4356     uint64_t av, bv;
4357     a = float64_squash_input_denormal(a, status);
4358     b = float64_squash_input_denormal(b, status);
4359
4360     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4361          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4362        ) {
4363         if (float64_is_signaling_nan(a, status)
4364          || float64_is_signaling_nan(b, status)) {
4365             float_raise(float_flag_invalid, status);
4366         }
4367         return 0;
4368     }
4369     av = float64_val(a);
4370     bv = float64_val(b);
4371     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4372
4373 }
4374
4375 /*----------------------------------------------------------------------------
4376 | Returns 1 if the double-precision floating-point value `a' is less than or
4377 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4378 | cause an exception.  Otherwise, the comparison is performed according to the
4379 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4380 *----------------------------------------------------------------------------*/
4381
4382 int float64_le_quiet(float64 a, float64 b, float_status *status)
4383 {
4384     flag aSign, bSign;
4385     uint64_t av, bv;
4386     a = float64_squash_input_denormal(a, status);
4387     b = float64_squash_input_denormal(b, status);
4388
4389     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4390          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4391        ) {
4392         if (float64_is_signaling_nan(a, status)
4393          || float64_is_signaling_nan(b, status)) {
4394             float_raise(float_flag_invalid, status);
4395         }
4396         return 0;
4397     }
4398     aSign = extractFloat64Sign( a );
4399     bSign = extractFloat64Sign( b );
4400     av = float64_val(a);
4401     bv = float64_val(b);
4402     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4403     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4404
4405 }
4406
4407 /*----------------------------------------------------------------------------
4408 | Returns 1 if the double-precision floating-point value `a' is less than
4409 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4410 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4411 | Standard for Binary Floating-Point Arithmetic.
4412 *----------------------------------------------------------------------------*/
4413
4414 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4415 {
4416     flag aSign, bSign;
4417     uint64_t av, bv;
4418     a = float64_squash_input_denormal(a, status);
4419     b = float64_squash_input_denormal(b, status);
4420
4421     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4422          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4423        ) {
4424         if (float64_is_signaling_nan(a, status)
4425          || float64_is_signaling_nan(b, status)) {
4426             float_raise(float_flag_invalid, status);
4427         }
4428         return 0;
4429     }
4430     aSign = extractFloat64Sign( a );
4431     bSign = extractFloat64Sign( b );
4432     av = float64_val(a);
4433     bv = float64_val(b);
4434     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4435     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4436
4437 }
4438
4439 /*----------------------------------------------------------------------------
4440 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4441 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4442 | comparison is performed according to the IEC/IEEE Standard for Binary
4443 | Floating-Point Arithmetic.
4444 *----------------------------------------------------------------------------*/
4445
4446 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4447 {
4448     a = float64_squash_input_denormal(a, status);
4449     b = float64_squash_input_denormal(b, status);
4450
4451     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4452          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4453        ) {
4454         if (float64_is_signaling_nan(a, status)
4455          || float64_is_signaling_nan(b, status)) {
4456             float_raise(float_flag_invalid, status);
4457         }
4458         return 1;
4459     }
4460     return 0;
4461 }
4462
4463 /*----------------------------------------------------------------------------
4464 | Returns the result of converting the extended double-precision floating-
4465 | point value `a' to the 32-bit two's complement integer format.  The
4466 | conversion is performed according to the IEC/IEEE Standard for Binary
4467 | Floating-Point Arithmetic---which means in particular that the conversion
4468 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4469 | largest positive integer is returned.  Otherwise, if the conversion
4470 | overflows, the largest integer with the same sign as `a' is returned.
4471 *----------------------------------------------------------------------------*/
4472
4473 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4474 {
4475     flag aSign;
4476     int32_t aExp, shiftCount;
4477     uint64_t aSig;
4478
4479     if (floatx80_invalid_encoding(a)) {
4480         float_raise(float_flag_invalid, status);
4481         return 1 << 31;
4482     }
4483     aSig = extractFloatx80Frac( a );
4484     aExp = extractFloatx80Exp( a );
4485     aSign = extractFloatx80Sign( a );
4486     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4487     shiftCount = 0x4037 - aExp;
4488     if ( shiftCount <= 0 ) shiftCount = 1;
4489     shift64RightJamming( aSig, shiftCount, &aSig );
4490     return roundAndPackInt32(aSign, aSig, status);
4491
4492 }
4493
4494 /*----------------------------------------------------------------------------
4495 | Returns the result of converting the extended double-precision floating-
4496 | point value `a' to the 32-bit two's complement integer format.  The
4497 | conversion is performed according to the IEC/IEEE Standard for Binary
4498 | Floating-Point Arithmetic, except that the conversion is always rounded
4499 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4500 | Otherwise, if the conversion overflows, the largest integer with the same
4501 | sign as `a' is returned.
4502 *----------------------------------------------------------------------------*/
4503
4504 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4505 {
4506     flag aSign;
4507     int32_t aExp, shiftCount;
4508     uint64_t aSig, savedASig;
4509     int32_t z;
4510
4511     if (floatx80_invalid_encoding(a)) {
4512         float_raise(float_flag_invalid, status);
4513         return 1 << 31;
4514     }
4515     aSig = extractFloatx80Frac( a );
4516     aExp = extractFloatx80Exp( a );
4517     aSign = extractFloatx80Sign( a );
4518     if ( 0x401E < aExp ) {
4519         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4520         goto invalid;
4521     }
4522     else if ( aExp < 0x3FFF ) {
4523         if (aExp || aSig) {
4524             status->float_exception_flags |= float_flag_inexact;
4525         }
4526         return 0;
4527     }
4528     shiftCount = 0x403E - aExp;
4529     savedASig = aSig;
4530     aSig >>= shiftCount;
4531     z = aSig;
4532     if ( aSign ) z = - z;
4533     if ( ( z < 0 ) ^ aSign ) {
4534  invalid:
4535         float_raise(float_flag_invalid, status);
4536         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4537     }
4538     if ( ( aSig<<shiftCount ) != savedASig ) {
4539         status->float_exception_flags |= float_flag_inexact;
4540     }
4541     return z;
4542
4543 }
4544
4545 /*----------------------------------------------------------------------------
4546 | Returns the result of converting the extended double-precision floating-
4547 | point value `a' to the 64-bit two's complement integer format.  The
4548 | conversion is performed according to the IEC/IEEE Standard for Binary
4549 | Floating-Point Arithmetic---which means in particular that the conversion
4550 | is rounded according to the current rounding mode.  If `a' is a NaN,
4551 | the largest positive integer is returned.  Otherwise, if the conversion
4552 | overflows, the largest integer with the same sign as `a' is returned.
4553 *----------------------------------------------------------------------------*/
4554
4555 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4556 {
4557     flag aSign;
4558     int32_t aExp, shiftCount;
4559     uint64_t aSig, aSigExtra;
4560
4561     if (floatx80_invalid_encoding(a)) {
4562         float_raise(float_flag_invalid, status);
4563         return 1ULL << 63;
4564     }
4565     aSig = extractFloatx80Frac( a );
4566     aExp = extractFloatx80Exp( a );
4567     aSign = extractFloatx80Sign( a );
4568     shiftCount = 0x403E - aExp;
4569     if ( shiftCount <= 0 ) {
4570         if ( shiftCount ) {
4571             float_raise(float_flag_invalid, status);
4572             if (!aSign || floatx80_is_any_nan(a)) {
4573                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4574             }
4575             return (int64_t) LIT64( 0x8000000000000000 );
4576         }
4577         aSigExtra = 0;
4578     }
4579     else {
4580         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4581     }
4582     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4583
4584 }
4585
4586 /*----------------------------------------------------------------------------
4587 | Returns the result of converting the extended double-precision floating-
4588 | point value `a' to the 64-bit two's complement integer format.  The
4589 | conversion is performed according to the IEC/IEEE Standard for Binary
4590 | Floating-Point Arithmetic, except that the conversion is always rounded
4591 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4592 | Otherwise, if the conversion overflows, the largest integer with the same
4593 | sign as `a' is returned.
4594 *----------------------------------------------------------------------------*/
4595
4596 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4597 {
4598     flag aSign;
4599     int32_t aExp, shiftCount;
4600     uint64_t aSig;
4601     int64_t z;
4602
4603     if (floatx80_invalid_encoding(a)) {
4604         float_raise(float_flag_invalid, status);
4605         return 1ULL << 63;
4606     }
4607     aSig = extractFloatx80Frac( a );
4608     aExp = extractFloatx80Exp( a );
4609     aSign = extractFloatx80Sign( a );
4610     shiftCount = aExp - 0x403E;
4611     if ( 0 <= shiftCount ) {
4612         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4613         if ( ( a.high != 0xC03E ) || aSig ) {
4614             float_raise(float_flag_invalid, status);
4615             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4616                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4617             }
4618         }
4619         return (int64_t) LIT64( 0x8000000000000000 );
4620     }
4621     else if ( aExp < 0x3FFF ) {
4622         if (aExp | aSig) {
4623             status->float_exception_flags |= float_flag_inexact;
4624         }
4625         return 0;
4626     }
4627     z = aSig>>( - shiftCount );
4628     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4629         status->float_exception_flags |= float_flag_inexact;
4630     }
4631     if ( aSign ) z = - z;
4632     return z;
4633
4634 }
4635
4636 /*----------------------------------------------------------------------------
4637 | Returns the result of converting the extended double-precision floating-
4638 | point value `a' to the single-precision floating-point format.  The
4639 | conversion is performed according to the IEC/IEEE Standard for Binary
4640 | Floating-Point Arithmetic.
4641 *----------------------------------------------------------------------------*/
4642
4643 float32 floatx80_to_float32(floatx80 a, float_status *status)
4644 {
4645     flag aSign;
4646     int32_t aExp;
4647     uint64_t aSig;
4648
4649     if (floatx80_invalid_encoding(a)) {
4650         float_raise(float_flag_invalid, status);
4651         return float32_default_nan(status);
4652     }
4653     aSig = extractFloatx80Frac( a );
4654     aExp = extractFloatx80Exp( a );
4655     aSign = extractFloatx80Sign( a );
4656     if ( aExp == 0x7FFF ) {
4657         if ( (uint64_t) ( aSig<<1 ) ) {
4658             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4659         }
4660         return packFloat32( aSign, 0xFF, 0 );
4661     }
4662     shift64RightJamming( aSig, 33, &aSig );
4663     if ( aExp || aSig ) aExp -= 0x3F81;
4664     return roundAndPackFloat32(aSign, aExp, aSig, status);
4665
4666 }
4667
4668 /*----------------------------------------------------------------------------
4669 | Returns the result of converting the extended double-precision floating-
4670 | point value `a' to the double-precision floating-point format.  The
4671 | conversion is performed according to the IEC/IEEE Standard for Binary
4672 | Floating-Point Arithmetic.
4673 *----------------------------------------------------------------------------*/
4674
4675 float64 floatx80_to_float64(floatx80 a, float_status *status)
4676 {
4677     flag aSign;
4678     int32_t aExp;
4679     uint64_t aSig, zSig;
4680
4681     if (floatx80_invalid_encoding(a)) {
4682         float_raise(float_flag_invalid, status);
4683         return float64_default_nan(status);
4684     }
4685     aSig = extractFloatx80Frac( a );
4686     aExp = extractFloatx80Exp( a );
4687     aSign = extractFloatx80Sign( a );
4688     if ( aExp == 0x7FFF ) {
4689         if ( (uint64_t) ( aSig<<1 ) ) {
4690             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4691         }
4692         return packFloat64( aSign, 0x7FF, 0 );
4693     }
4694     shift64RightJamming( aSig, 1, &zSig );
4695     if ( aExp || aSig ) aExp -= 0x3C01;
4696     return roundAndPackFloat64(aSign, aExp, zSig, status);
4697
4698 }
4699
4700 /*----------------------------------------------------------------------------
4701 | Returns the result of converting the extended double-precision floating-
4702 | point value `a' to the quadruple-precision floating-point format.  The
4703 | conversion is performed according to the IEC/IEEE Standard for Binary
4704 | Floating-Point Arithmetic.
4705 *----------------------------------------------------------------------------*/
4706
4707 float128 floatx80_to_float128(floatx80 a, float_status *status)
4708 {
4709     flag aSign;
4710     int aExp;
4711     uint64_t aSig, zSig0, zSig1;
4712
4713     if (floatx80_invalid_encoding(a)) {
4714         float_raise(float_flag_invalid, status);
4715         return float128_default_nan(status);
4716     }
4717     aSig = extractFloatx80Frac( a );
4718     aExp = extractFloatx80Exp( a );
4719     aSign = extractFloatx80Sign( a );
4720     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4721         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4722     }
4723     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4724     return packFloat128( aSign, aExp, zSig0, zSig1 );
4725
4726 }
4727
4728 /*----------------------------------------------------------------------------
4729 | Rounds the extended double-precision floating-point value `a'
4730 | to the precision provided by floatx80_rounding_precision and returns the
4731 | result as an extended double-precision floating-point value.
4732 | The operation is performed according to the IEC/IEEE Standard for Binary
4733 | Floating-Point Arithmetic.
4734 *----------------------------------------------------------------------------*/
4735
4736 floatx80 floatx80_round(floatx80 a, float_status *status)
4737 {
4738     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4739                                 extractFloatx80Sign(a),
4740                                 extractFloatx80Exp(a),
4741                                 extractFloatx80Frac(a), 0, status);
4742 }
4743
4744 /*----------------------------------------------------------------------------
4745 | Rounds the extended double-precision floating-point value `a' to an integer,
4746 | and returns the result as an extended quadruple-precision floating-point
4747 | value.  The operation is performed according to the IEC/IEEE Standard for
4748 | Binary Floating-Point Arithmetic.
4749 *----------------------------------------------------------------------------*/
4750
4751 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4752 {
4753     flag aSign;
4754     int32_t aExp;
4755     uint64_t lastBitMask, roundBitsMask;
4756     floatx80 z;
4757
4758     if (floatx80_invalid_encoding(a)) {
4759         float_raise(float_flag_invalid, status);
4760         return floatx80_default_nan(status);
4761     }
4762     aExp = extractFloatx80Exp( a );
4763     if ( 0x403E <= aExp ) {
4764         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4765             return propagateFloatx80NaN(a, a, status);
4766         }
4767         return a;
4768     }
4769     if ( aExp < 0x3FFF ) {
4770         if (    ( aExp == 0 )
4771              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4772             return a;
4773         }
4774         status->float_exception_flags |= float_flag_inexact;
4775         aSign = extractFloatx80Sign( a );
4776         switch (status->float_rounding_mode) {
4777          case float_round_nearest_even:
4778             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4779                ) {
4780                 return
4781                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4782             }
4783             break;
4784         case float_round_ties_away:
4785             if (aExp == 0x3FFE) {
4786                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4787             }
4788             break;
4789          case float_round_down:
4790             return
4791                   aSign ?
4792                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4793                 : packFloatx80( 0, 0, 0 );
4794          case float_round_up:
4795             return
4796                   aSign ? packFloatx80( 1, 0, 0 )
4797                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4798         }
4799         return packFloatx80( aSign, 0, 0 );
4800     }
4801     lastBitMask = 1;
4802     lastBitMask <<= 0x403E - aExp;
4803     roundBitsMask = lastBitMask - 1;
4804     z = a;
4805     switch (status->float_rounding_mode) {
4806     case float_round_nearest_even:
4807         z.low += lastBitMask>>1;
4808         if ((z.low & roundBitsMask) == 0) {
4809             z.low &= ~lastBitMask;
4810         }
4811         break;
4812     case float_round_ties_away:
4813         z.low += lastBitMask >> 1;
4814         break;
4815     case float_round_to_zero:
4816         break;
4817     case float_round_up:
4818         if (!extractFloatx80Sign(z)) {
4819             z.low += roundBitsMask;
4820         }
4821         break;
4822     case float_round_down:
4823         if (extractFloatx80Sign(z)) {
4824             z.low += roundBitsMask;
4825         }
4826         break;
4827     default:
4828         abort();
4829     }
4830     z.low &= ~ roundBitsMask;
4831     if ( z.low == 0 ) {
4832         ++z.high;
4833         z.low = LIT64( 0x8000000000000000 );
4834     }
4835     if (z.low != a.low) {
4836         status->float_exception_flags |= float_flag_inexact;
4837     }
4838     return z;
4839
4840 }
4841
4842 /*----------------------------------------------------------------------------
4843 | Returns the result of adding the absolute values of the extended double-
4844 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4845 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4846 | The addition is performed according to the IEC/IEEE Standard for Binary
4847 | Floating-Point Arithmetic.
4848 *----------------------------------------------------------------------------*/
4849
4850 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4851                                 float_status *status)
4852 {
4853     int32_t aExp, bExp, zExp;
4854     uint64_t aSig, bSig, zSig0, zSig1;
4855     int32_t expDiff;
4856
4857     aSig = extractFloatx80Frac( a );
4858     aExp = extractFloatx80Exp( a );
4859     bSig = extractFloatx80Frac( b );
4860     bExp = extractFloatx80Exp( b );
4861     expDiff = aExp - bExp;
4862     if ( 0 < expDiff ) {
4863         if ( aExp == 0x7FFF ) {
4864             if ((uint64_t)(aSig << 1)) {
4865                 return propagateFloatx80NaN(a, b, status);
4866             }
4867             return a;
4868         }
4869         if ( bExp == 0 ) --expDiff;
4870         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4871         zExp = aExp;
4872     }
4873     else if ( expDiff < 0 ) {
4874         if ( bExp == 0x7FFF ) {
4875             if ((uint64_t)(bSig << 1)) {
4876                 return propagateFloatx80NaN(a, b, status);
4877             }
4878             return packFloatx80(zSign,
4879                                 floatx80_infinity_high,
4880                                 floatx80_infinity_low);
4881         }
4882         if ( aExp == 0 ) ++expDiff;
4883         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4884         zExp = bExp;
4885     }
4886     else {
4887         if ( aExp == 0x7FFF ) {
4888             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4889                 return propagateFloatx80NaN(a, b, status);
4890             }
4891             return a;
4892         }
4893         zSig1 = 0;
4894         zSig0 = aSig + bSig;
4895         if ( aExp == 0 ) {
4896             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4897             goto roundAndPack;
4898         }
4899         zExp = aExp;
4900         goto shiftRight1;
4901     }
4902     zSig0 = aSig + bSig;
4903     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4904  shiftRight1:
4905     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4906     zSig0 |= LIT64( 0x8000000000000000 );
4907     ++zExp;
4908  roundAndPack:
4909     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4910                                 zSign, zExp, zSig0, zSig1, status);
4911 }
4912
4913 /*----------------------------------------------------------------------------
4914 | Returns the result of subtracting the absolute values of the extended
4915 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4916 | difference is negated before being returned.  `zSign' is ignored if the
4917 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4918 | Standard for Binary Floating-Point Arithmetic.
4919 *----------------------------------------------------------------------------*/
4920
4921 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4922                                 float_status *status)
4923 {
4924     int32_t aExp, bExp, zExp;
4925     uint64_t aSig, bSig, zSig0, zSig1;
4926     int32_t expDiff;
4927
4928     aSig = extractFloatx80Frac( a );
4929     aExp = extractFloatx80Exp( a );
4930     bSig = extractFloatx80Frac( b );
4931     bExp = extractFloatx80Exp( b );
4932     expDiff = aExp - bExp;
4933     if ( 0 < expDiff ) goto aExpBigger;
4934     if ( expDiff < 0 ) goto bExpBigger;
4935     if ( aExp == 0x7FFF ) {
4936         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4937             return propagateFloatx80NaN(a, b, status);
4938         }
4939         float_raise(float_flag_invalid, status);
4940         return floatx80_default_nan(status);
4941     }
4942     if ( aExp == 0 ) {
4943         aExp = 1;
4944         bExp = 1;
4945     }
4946     zSig1 = 0;
4947     if ( bSig < aSig ) goto aBigger;
4948     if ( aSig < bSig ) goto bBigger;
4949     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4950  bExpBigger:
4951     if ( bExp == 0x7FFF ) {
4952         if ((uint64_t)(bSig << 1)) {
4953             return propagateFloatx80NaN(a, b, status);
4954         }
4955         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4956                             floatx80_infinity_low);
4957     }
4958     if ( aExp == 0 ) ++expDiff;
4959     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4960  bBigger:
4961     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4962     zExp = bExp;
4963     zSign ^= 1;
4964     goto normalizeRoundAndPack;
4965  aExpBigger:
4966     if ( aExp == 0x7FFF ) {
4967         if ((uint64_t)(aSig << 1)) {
4968             return propagateFloatx80NaN(a, b, status);
4969         }
4970         return a;
4971     }
4972     if ( bExp == 0 ) --expDiff;
4973     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4974  aBigger:
4975     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4976     zExp = aExp;
4977  normalizeRoundAndPack:
4978     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4979                                          zSign, zExp, zSig0, zSig1, status);
4980 }
4981
4982 /*----------------------------------------------------------------------------
4983 | Returns the result of adding the extended double-precision floating-point
4984 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4985 | Standard for Binary Floating-Point Arithmetic.
4986 *----------------------------------------------------------------------------*/
4987
4988 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4989 {
4990     flag aSign, bSign;
4991
4992     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4993         float_raise(float_flag_invalid, status);
4994         return floatx80_default_nan(status);
4995     }
4996     aSign = extractFloatx80Sign( a );
4997     bSign = extractFloatx80Sign( b );
4998     if ( aSign == bSign ) {
4999         return addFloatx80Sigs(a, b, aSign, status);
5000     }
5001     else {
5002         return subFloatx80Sigs(a, b, aSign, status);
5003     }
5004
5005 }
5006
5007 /*----------------------------------------------------------------------------
5008 | Returns the result of subtracting the extended double-precision floating-
5009 | point values `a' and `b'.  The operation is performed according to the
5010 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5011 *----------------------------------------------------------------------------*/
5012
5013 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5014 {
5015     flag aSign, bSign;
5016
5017     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5018         float_raise(float_flag_invalid, status);
5019         return floatx80_default_nan(status);
5020     }
5021     aSign = extractFloatx80Sign( a );
5022     bSign = extractFloatx80Sign( b );
5023     if ( aSign == bSign ) {
5024         return subFloatx80Sigs(a, b, aSign, status);
5025     }
5026     else {
5027         return addFloatx80Sigs(a, b, aSign, status);
5028     }
5029
5030 }
5031
5032 /*----------------------------------------------------------------------------
5033 | Returns the result of multiplying the extended double-precision floating-
5034 | point values `a' and `b'.  The operation is performed according to the
5035 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5036 *----------------------------------------------------------------------------*/
5037
5038 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5039 {
5040     flag aSign, bSign, zSign;
5041     int32_t aExp, bExp, zExp;
5042     uint64_t aSig, bSig, zSig0, zSig1;
5043
5044     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5045         float_raise(float_flag_invalid, status);
5046         return floatx80_default_nan(status);
5047     }
5048     aSig = extractFloatx80Frac( a );
5049     aExp = extractFloatx80Exp( a );
5050     aSign = extractFloatx80Sign( a );
5051     bSig = extractFloatx80Frac( b );
5052     bExp = extractFloatx80Exp( b );
5053     bSign = extractFloatx80Sign( b );
5054     zSign = aSign ^ bSign;
5055     if ( aExp == 0x7FFF ) {
5056         if (    (uint64_t) ( aSig<<1 )
5057              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5058             return propagateFloatx80NaN(a, b, status);
5059         }
5060         if ( ( bExp | bSig ) == 0 ) goto invalid;
5061         return packFloatx80(zSign, floatx80_infinity_high,
5062                                    floatx80_infinity_low);
5063     }
5064     if ( bExp == 0x7FFF ) {
5065         if ((uint64_t)(bSig << 1)) {
5066             return propagateFloatx80NaN(a, b, status);
5067         }
5068         if ( ( aExp | aSig ) == 0 ) {
5069  invalid:
5070             float_raise(float_flag_invalid, status);
5071             return floatx80_default_nan(status);
5072         }
5073         return packFloatx80(zSign, floatx80_infinity_high,
5074                                    floatx80_infinity_low);
5075     }
5076     if ( aExp == 0 ) {
5077         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5078         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5079     }
5080     if ( bExp == 0 ) {
5081         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5082         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5083     }
5084     zExp = aExp + bExp - 0x3FFE;
5085     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5086     if ( 0 < (int64_t) zSig0 ) {
5087         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5088         --zExp;
5089     }
5090     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5091                                 zSign, zExp, zSig0, zSig1, status);
5092 }
5093
5094 /*----------------------------------------------------------------------------
5095 | Returns the result of dividing the extended double-precision floating-point
5096 | value `a' by the corresponding value `b'.  The operation is performed
5097 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5098 *----------------------------------------------------------------------------*/
5099
5100 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5101 {
5102     flag aSign, bSign, zSign;
5103     int32_t aExp, bExp, zExp;
5104     uint64_t aSig, bSig, zSig0, zSig1;
5105     uint64_t rem0, rem1, rem2, term0, term1, term2;
5106
5107     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5108         float_raise(float_flag_invalid, status);
5109         return floatx80_default_nan(status);
5110     }
5111     aSig = extractFloatx80Frac( a );
5112     aExp = extractFloatx80Exp( a );
5113     aSign = extractFloatx80Sign( a );
5114     bSig = extractFloatx80Frac( b );
5115     bExp = extractFloatx80Exp( b );
5116     bSign = extractFloatx80Sign( b );
5117     zSign = aSign ^ bSign;
5118     if ( aExp == 0x7FFF ) {
5119         if ((uint64_t)(aSig << 1)) {
5120             return propagateFloatx80NaN(a, b, status);
5121         }
5122         if ( bExp == 0x7FFF ) {
5123             if ((uint64_t)(bSig << 1)) {
5124                 return propagateFloatx80NaN(a, b, status);
5125             }
5126             goto invalid;
5127         }
5128         return packFloatx80(zSign, floatx80_infinity_high,
5129                                    floatx80_infinity_low);
5130     }
5131     if ( bExp == 0x7FFF ) {
5132         if ((uint64_t)(bSig << 1)) {
5133             return propagateFloatx80NaN(a, b, status);
5134         }
5135         return packFloatx80( zSign, 0, 0 );
5136     }
5137     if ( bExp == 0 ) {
5138         if ( bSig == 0 ) {
5139             if ( ( aExp | aSig ) == 0 ) {
5140  invalid:
5141                 float_raise(float_flag_invalid, status);
5142                 return floatx80_default_nan(status);
5143             }
5144             float_raise(float_flag_divbyzero, status);
5145             return packFloatx80(zSign, floatx80_infinity_high,
5146                                        floatx80_infinity_low);
5147         }
5148         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5149     }
5150     if ( aExp == 0 ) {
5151         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5152         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5153     }
5154     zExp = aExp - bExp + 0x3FFE;
5155     rem1 = 0;
5156     if ( bSig <= aSig ) {
5157         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5158         ++zExp;
5159     }
5160     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5161     mul64To128( bSig, zSig0, &term0, &term1 );
5162     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5163     while ( (int64_t) rem0 < 0 ) {
5164         --zSig0;
5165         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5166     }
5167     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5168     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5169         mul64To128( bSig, zSig1, &term1, &term2 );
5170         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5171         while ( (int64_t) rem1 < 0 ) {
5172             --zSig1;
5173             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5174         }
5175         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5176     }
5177     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5178                                 zSign, zExp, zSig0, zSig1, status);
5179 }
5180
5181 /*----------------------------------------------------------------------------
5182 | Returns the remainder of the extended double-precision floating-point value
5183 | `a' with respect to the corresponding value `b'.  The operation is performed
5184 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5185 *----------------------------------------------------------------------------*/
5186
5187 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5188 {
5189     flag aSign, zSign;
5190     int32_t aExp, bExp, expDiff;
5191     uint64_t aSig0, aSig1, bSig;
5192     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5193
5194     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5195         float_raise(float_flag_invalid, status);
5196         return floatx80_default_nan(status);
5197     }
5198     aSig0 = extractFloatx80Frac( a );
5199     aExp = extractFloatx80Exp( a );
5200     aSign = extractFloatx80Sign( a );
5201     bSig = extractFloatx80Frac( b );
5202     bExp = extractFloatx80Exp( b );
5203     if ( aExp == 0x7FFF ) {
5204         if (    (uint64_t) ( aSig0<<1 )
5205              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5206             return propagateFloatx80NaN(a, b, status);
5207         }
5208         goto invalid;
5209     }
5210     if ( bExp == 0x7FFF ) {
5211         if ((uint64_t)(bSig << 1)) {
5212             return propagateFloatx80NaN(a, b, status);
5213         }
5214         return a;
5215     }
5216     if ( bExp == 0 ) {
5217         if ( bSig == 0 ) {
5218  invalid:
5219             float_raise(float_flag_invalid, status);
5220             return floatx80_default_nan(status);
5221         }
5222         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5223     }
5224     if ( aExp == 0 ) {
5225         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5226         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5227     }
5228     bSig |= LIT64( 0x8000000000000000 );
5229     zSign = aSign;
5230     expDiff = aExp - bExp;
5231     aSig1 = 0;
5232     if ( expDiff < 0 ) {
5233         if ( expDiff < -1 ) return a;
5234         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5235         expDiff = 0;
5236     }
5237     q = ( bSig <= aSig0 );
5238     if ( q ) aSig0 -= bSig;
5239     expDiff -= 64;
5240     while ( 0 < expDiff ) {
5241         q = estimateDiv128To64( aSig0, aSig1, bSig );
5242         q = ( 2 < q ) ? q - 2 : 0;
5243         mul64To128( bSig, q, &term0, &term1 );
5244         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5245         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5246         expDiff -= 62;
5247     }
5248     expDiff += 64;
5249     if ( 0 < expDiff ) {
5250         q = estimateDiv128To64( aSig0, aSig1, bSig );
5251         q = ( 2 < q ) ? q - 2 : 0;
5252         q >>= 64 - expDiff;
5253         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5254         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5255         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5256         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5257             ++q;
5258             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5259         }
5260     }
5261     else {
5262         term1 = 0;
5263         term0 = bSig;
5264     }
5265     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5266     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5267          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5268               && ( q & 1 ) )
5269        ) {
5270         aSig0 = alternateASig0;
5271         aSig1 = alternateASig1;
5272         zSign = ! zSign;
5273     }
5274     return
5275         normalizeRoundAndPackFloatx80(
5276             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5277
5278 }
5279
5280 /*----------------------------------------------------------------------------
5281 | Returns the square root of the extended double-precision floating-point
5282 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5283 | for Binary Floating-Point Arithmetic.
5284 *----------------------------------------------------------------------------*/
5285
5286 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5287 {
5288     flag aSign;
5289     int32_t aExp, zExp;
5290     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5291     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5292
5293     if (floatx80_invalid_encoding(a)) {
5294         float_raise(float_flag_invalid, status);
5295         return floatx80_default_nan(status);
5296     }
5297     aSig0 = extractFloatx80Frac( a );
5298     aExp = extractFloatx80Exp( a );
5299     aSign = extractFloatx80Sign( a );
5300     if ( aExp == 0x7FFF ) {
5301         if ((uint64_t)(aSig0 << 1)) {
5302             return propagateFloatx80NaN(a, a, status);
5303         }
5304         if ( ! aSign ) return a;
5305         goto invalid;
5306     }
5307     if ( aSign ) {
5308         if ( ( aExp | aSig0 ) == 0 ) return a;
5309  invalid:
5310         float_raise(float_flag_invalid, status);
5311         return floatx80_default_nan(status);
5312     }
5313     if ( aExp == 0 ) {
5314         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5315         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5316     }
5317     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5318     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5319     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5320     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5321     doubleZSig0 = zSig0<<1;
5322     mul64To128( zSig0, zSig0, &term0, &term1 );
5323     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5324     while ( (int64_t) rem0 < 0 ) {
5325         --zSig0;
5326         doubleZSig0 -= 2;
5327         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5328     }
5329     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5330     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5331         if ( zSig1 == 0 ) zSig1 = 1;
5332         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5333         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5334         mul64To128( zSig1, zSig1, &term2, &term3 );
5335         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5336         while ( (int64_t) rem1 < 0 ) {
5337             --zSig1;
5338             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5339             term3 |= 1;
5340             term2 |= doubleZSig0;
5341             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5342         }
5343         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5344     }
5345     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5346     zSig0 |= doubleZSig0;
5347     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5348                                 0, zExp, zSig0, zSig1, status);
5349 }
5350
5351 /*----------------------------------------------------------------------------
5352 | Returns 1 if the extended double-precision floating-point value `a' is equal
5353 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5354 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5355 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5356 *----------------------------------------------------------------------------*/
5357
5358 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5359 {
5360
5361     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5362         || (extractFloatx80Exp(a) == 0x7FFF
5363             && (uint64_t) (extractFloatx80Frac(a) << 1))
5364         || (extractFloatx80Exp(b) == 0x7FFF
5365             && (uint64_t) (extractFloatx80Frac(b) << 1))
5366        ) {
5367         float_raise(float_flag_invalid, status);
5368         return 0;
5369     }
5370     return
5371            ( a.low == b.low )
5372         && (    ( a.high == b.high )
5373              || (    ( a.low == 0 )
5374                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5375            );
5376
5377 }
5378
5379 /*----------------------------------------------------------------------------
5380 | Returns 1 if the extended double-precision floating-point value `a' is
5381 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5382 | invalid exception is raised if either operand is a NaN.  The comparison is
5383 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5384 | Arithmetic.
5385 *----------------------------------------------------------------------------*/
5386
5387 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5388 {
5389     flag aSign, bSign;
5390
5391     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5392         || (extractFloatx80Exp(a) == 0x7FFF
5393             && (uint64_t) (extractFloatx80Frac(a) << 1))
5394         || (extractFloatx80Exp(b) == 0x7FFF
5395             && (uint64_t) (extractFloatx80Frac(b) << 1))
5396        ) {
5397         float_raise(float_flag_invalid, status);
5398         return 0;
5399     }
5400     aSign = extractFloatx80Sign( a );
5401     bSign = extractFloatx80Sign( b );
5402     if ( aSign != bSign ) {
5403         return
5404                aSign
5405             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5406                  == 0 );
5407     }
5408     return
5409           aSign ? le128( b.high, b.low, a.high, a.low )
5410         : le128( a.high, a.low, b.high, b.low );
5411
5412 }
5413
5414 /*----------------------------------------------------------------------------
5415 | Returns 1 if the extended double-precision floating-point value `a' is
5416 | less than the corresponding value `b', and 0 otherwise.  The invalid
5417 | exception is raised if either operand is a NaN.  The comparison is performed
5418 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5419 *----------------------------------------------------------------------------*/
5420
5421 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5422 {
5423     flag aSign, bSign;
5424
5425     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5426         || (extractFloatx80Exp(a) == 0x7FFF
5427             && (uint64_t) (extractFloatx80Frac(a) << 1))
5428         || (extractFloatx80Exp(b) == 0x7FFF
5429             && (uint64_t) (extractFloatx80Frac(b) << 1))
5430        ) {
5431         float_raise(float_flag_invalid, status);
5432         return 0;
5433     }
5434     aSign = extractFloatx80Sign( a );
5435     bSign = extractFloatx80Sign( b );
5436     if ( aSign != bSign ) {
5437         return
5438                aSign
5439             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5440                  != 0 );
5441     }
5442     return
5443           aSign ? lt128( b.high, b.low, a.high, a.low )
5444         : lt128( a.high, a.low, b.high, b.low );
5445
5446 }
5447
5448 /*----------------------------------------------------------------------------
5449 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5450 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5451 | either operand is a NaN.   The comparison is performed according to the
5452 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5453 *----------------------------------------------------------------------------*/
5454 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5455 {
5456     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5457         || (extractFloatx80Exp(a) == 0x7FFF
5458             && (uint64_t) (extractFloatx80Frac(a) << 1))
5459         || (extractFloatx80Exp(b) == 0x7FFF
5460             && (uint64_t) (extractFloatx80Frac(b) << 1))
5461        ) {
5462         float_raise(float_flag_invalid, status);
5463         return 1;
5464     }
5465     return 0;
5466 }
5467
5468 /*----------------------------------------------------------------------------
5469 | Returns 1 if the extended double-precision floating-point value `a' is
5470 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5471 | cause an exception.  The comparison is performed according to the IEC/IEEE
5472 | Standard for Binary Floating-Point Arithmetic.
5473 *----------------------------------------------------------------------------*/
5474
5475 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5476 {
5477
5478     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5479         float_raise(float_flag_invalid, status);
5480         return 0;
5481     }
5482     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5483               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5484          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5485               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5486        ) {
5487         if (floatx80_is_signaling_nan(a, status)
5488          || floatx80_is_signaling_nan(b, status)) {
5489             float_raise(float_flag_invalid, status);
5490         }
5491         return 0;
5492     }
5493     return
5494            ( a.low == b.low )
5495         && (    ( a.high == b.high )
5496              || (    ( a.low == 0 )
5497                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5498            );
5499
5500 }
5501
5502 /*----------------------------------------------------------------------------
5503 | Returns 1 if the extended double-precision floating-point value `a' is less
5504 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5505 | do not cause an exception.  Otherwise, the comparison is performed according
5506 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5507 *----------------------------------------------------------------------------*/
5508
5509 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5510 {
5511     flag aSign, bSign;
5512
5513     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5514         float_raise(float_flag_invalid, status);
5515         return 0;
5516     }
5517     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5518               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5519          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5520               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5521        ) {
5522         if (floatx80_is_signaling_nan(a, status)
5523          || floatx80_is_signaling_nan(b, status)) {
5524             float_raise(float_flag_invalid, status);
5525         }
5526         return 0;
5527     }
5528     aSign = extractFloatx80Sign( a );
5529     bSign = extractFloatx80Sign( b );
5530     if ( aSign != bSign ) {
5531         return
5532                aSign
5533             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5534                  == 0 );
5535     }
5536     return
5537           aSign ? le128( b.high, b.low, a.high, a.low )
5538         : le128( a.high, a.low, b.high, b.low );
5539
5540 }
5541
5542 /*----------------------------------------------------------------------------
5543 | Returns 1 if the extended double-precision floating-point value `a' is less
5544 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5545 | an exception.  Otherwise, the comparison is performed according to the
5546 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5547 *----------------------------------------------------------------------------*/
5548
5549 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5550 {
5551     flag aSign, bSign;
5552
5553     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5554         float_raise(float_flag_invalid, status);
5555         return 0;
5556     }
5557     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5558               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5559          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5560               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5561        ) {
5562         if (floatx80_is_signaling_nan(a, status)
5563          || floatx80_is_signaling_nan(b, status)) {
5564             float_raise(float_flag_invalid, status);
5565         }
5566         return 0;
5567     }
5568     aSign = extractFloatx80Sign( a );
5569     bSign = extractFloatx80Sign( b );
5570     if ( aSign != bSign ) {
5571         return
5572                aSign
5573             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5574                  != 0 );
5575     }
5576     return
5577           aSign ? lt128( b.high, b.low, a.high, a.low )
5578         : lt128( a.high, a.low, b.high, b.low );
5579
5580 }
5581
5582 /*----------------------------------------------------------------------------
5583 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5584 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5585 | The comparison is performed according to the IEC/IEEE Standard for Binary
5586 | Floating-Point Arithmetic.
5587 *----------------------------------------------------------------------------*/
5588 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5589 {
5590     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5591         float_raise(float_flag_invalid, status);
5592         return 1;
5593     }
5594     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5595               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5596          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5597               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5598        ) {
5599         if (floatx80_is_signaling_nan(a, status)
5600          || floatx80_is_signaling_nan(b, status)) {
5601             float_raise(float_flag_invalid, status);
5602         }
5603         return 1;
5604     }
5605     return 0;
5606 }
5607
5608 /*----------------------------------------------------------------------------
5609 | Returns the result of converting the quadruple-precision floating-point
5610 | value `a' to the 32-bit two's complement integer format.  The conversion
5611 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5612 | Arithmetic---which means in particular that the conversion is rounded
5613 | according to the current rounding mode.  If `a' is a NaN, the largest
5614 | positive integer is returned.  Otherwise, if the conversion overflows, the
5615 | largest integer with the same sign as `a' is returned.
5616 *----------------------------------------------------------------------------*/
5617
5618 int32_t float128_to_int32(float128 a, float_status *status)
5619 {
5620     flag aSign;
5621     int32_t aExp, shiftCount;
5622     uint64_t aSig0, aSig1;
5623
5624     aSig1 = extractFloat128Frac1( a );
5625     aSig0 = extractFloat128Frac0( a );
5626     aExp = extractFloat128Exp( a );
5627     aSign = extractFloat128Sign( a );
5628     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5629     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5630     aSig0 |= ( aSig1 != 0 );
5631     shiftCount = 0x4028 - aExp;
5632     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5633     return roundAndPackInt32(aSign, aSig0, status);
5634
5635 }
5636
5637 /*----------------------------------------------------------------------------
5638 | Returns the result of converting the quadruple-precision floating-point
5639 | value `a' to the 32-bit two's complement integer format.  The conversion
5640 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5641 | Arithmetic, except that the conversion is always rounded toward zero.  If
5642 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5643 | conversion overflows, the largest integer with the same sign as `a' is
5644 | returned.
5645 *----------------------------------------------------------------------------*/
5646
5647 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5648 {
5649     flag aSign;
5650     int32_t aExp, shiftCount;
5651     uint64_t aSig0, aSig1, savedASig;
5652     int32_t z;
5653
5654     aSig1 = extractFloat128Frac1( a );
5655     aSig0 = extractFloat128Frac0( a );
5656     aExp = extractFloat128Exp( a );
5657     aSign = extractFloat128Sign( a );
5658     aSig0 |= ( aSig1 != 0 );
5659     if ( 0x401E < aExp ) {
5660         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5661         goto invalid;
5662     }
5663     else if ( aExp < 0x3FFF ) {
5664         if (aExp || aSig0) {
5665             status->float_exception_flags |= float_flag_inexact;
5666         }
5667         return 0;
5668     }
5669     aSig0 |= LIT64( 0x0001000000000000 );
5670     shiftCount = 0x402F - aExp;
5671     savedASig = aSig0;
5672     aSig0 >>= shiftCount;
5673     z = aSig0;
5674     if ( aSign ) z = - z;
5675     if ( ( z < 0 ) ^ aSign ) {
5676  invalid:
5677         float_raise(float_flag_invalid, status);
5678         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5679     }
5680     if ( ( aSig0<<shiftCount ) != savedASig ) {
5681         status->float_exception_flags |= float_flag_inexact;
5682     }
5683     return z;
5684
5685 }
5686
5687 /*----------------------------------------------------------------------------
5688 | Returns the result of converting the quadruple-precision floating-point
5689 | value `a' to the 64-bit two's complement integer format.  The conversion
5690 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5691 | Arithmetic---which means in particular that the conversion is rounded
5692 | according to the current rounding mode.  If `a' is a NaN, the largest
5693 | positive integer is returned.  Otherwise, if the conversion overflows, the
5694 | largest integer with the same sign as `a' is returned.
5695 *----------------------------------------------------------------------------*/
5696
5697 int64_t float128_to_int64(float128 a, float_status *status)
5698 {
5699     flag aSign;
5700     int32_t aExp, shiftCount;
5701     uint64_t aSig0, aSig1;
5702
5703     aSig1 = extractFloat128Frac1( a );
5704     aSig0 = extractFloat128Frac0( a );
5705     aExp = extractFloat128Exp( a );
5706     aSign = extractFloat128Sign( a );
5707     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5708     shiftCount = 0x402F - aExp;
5709     if ( shiftCount <= 0 ) {
5710         if ( 0x403E < aExp ) {
5711             float_raise(float_flag_invalid, status);
5712             if (    ! aSign
5713                  || (    ( aExp == 0x7FFF )
5714                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5715                     )
5716                ) {
5717                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5718             }
5719             return (int64_t) LIT64( 0x8000000000000000 );
5720         }
5721         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5722     }
5723     else {
5724         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5725     }
5726     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5727
5728 }
5729
5730 /*----------------------------------------------------------------------------
5731 | Returns the result of converting the quadruple-precision floating-point
5732 | value `a' to the 64-bit two's complement integer format.  The conversion
5733 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5734 | Arithmetic, except that the conversion is always rounded toward zero.
5735 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5736 | the conversion overflows, the largest integer with the same sign as `a' is
5737 | returned.
5738 *----------------------------------------------------------------------------*/
5739
5740 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5741 {
5742     flag aSign;
5743     int32_t aExp, shiftCount;
5744     uint64_t aSig0, aSig1;
5745     int64_t z;
5746
5747     aSig1 = extractFloat128Frac1( a );
5748     aSig0 = extractFloat128Frac0( a );
5749     aExp = extractFloat128Exp( a );
5750     aSign = extractFloat128Sign( a );
5751     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5752     shiftCount = aExp - 0x402F;
5753     if ( 0 < shiftCount ) {
5754         if ( 0x403E <= aExp ) {
5755             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5756             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5757                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5758                 if (aSig1) {
5759                     status->float_exception_flags |= float_flag_inexact;
5760                 }
5761             }
5762             else {
5763                 float_raise(float_flag_invalid, status);
5764                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5765                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5766                 }
5767             }
5768             return (int64_t) LIT64( 0x8000000000000000 );
5769         }
5770         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5771         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5772             status->float_exception_flags |= float_flag_inexact;
5773         }
5774     }
5775     else {
5776         if ( aExp < 0x3FFF ) {
5777             if ( aExp | aSig0 | aSig1 ) {
5778                 status->float_exception_flags |= float_flag_inexact;
5779             }
5780             return 0;
5781         }
5782         z = aSig0>>( - shiftCount );
5783         if (    aSig1
5784              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5785             status->float_exception_flags |= float_flag_inexact;
5786         }
5787     }
5788     if ( aSign ) z = - z;
5789     return z;
5790
5791 }
5792
5793 /*----------------------------------------------------------------------------
5794 | Returns the result of converting the quadruple-precision floating-point value
5795 | `a' to the 64-bit unsigned integer format.  The conversion is
5796 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5797 | Arithmetic---which means in particular that the conversion is rounded
5798 | according to the current rounding mode.  If `a' is a NaN, the largest
5799 | positive integer is returned.  If the conversion overflows, the
5800 | largest unsigned integer is returned.  If 'a' is negative, the value is
5801 | rounded and zero is returned; negative values that do not round to zero
5802 | will raise the inexact exception.
5803 *----------------------------------------------------------------------------*/
5804
5805 uint64_t float128_to_uint64(float128 a, float_status *status)
5806 {
5807     flag aSign;
5808     int aExp;
5809     int shiftCount;
5810     uint64_t aSig0, aSig1;
5811
5812     aSig0 = extractFloat128Frac0(a);
5813     aSig1 = extractFloat128Frac1(a);
5814     aExp = extractFloat128Exp(a);
5815     aSign = extractFloat128Sign(a);
5816     if (aSign && (aExp > 0x3FFE)) {
5817         float_raise(float_flag_invalid, status);
5818         if (float128_is_any_nan(a)) {
5819             return LIT64(0xFFFFFFFFFFFFFFFF);
5820         } else {
5821             return 0;
5822         }
5823     }
5824     if (aExp) {
5825         aSig0 |= LIT64(0x0001000000000000);
5826     }
5827     shiftCount = 0x402F - aExp;
5828     if (shiftCount <= 0) {
5829         if (0x403E < aExp) {
5830             float_raise(float_flag_invalid, status);
5831             return LIT64(0xFFFFFFFFFFFFFFFF);
5832         }
5833         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5834     } else {
5835         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5836     }
5837     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5838 }
5839
5840 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5841 {
5842     uint64_t v;
5843     signed char current_rounding_mode = status->float_rounding_mode;
5844
5845     set_float_rounding_mode(float_round_to_zero, status);
5846     v = float128_to_uint64(a, status);
5847     set_float_rounding_mode(current_rounding_mode, status);
5848
5849     return v;
5850 }
5851
5852 /*----------------------------------------------------------------------------
5853 | Returns the result of converting the quadruple-precision floating-point
5854 | value `a' to the 32-bit unsigned integer format.  The conversion
5855 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5856 | Arithmetic except that the conversion is always rounded toward zero.
5857 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5858 | if the conversion overflows, the largest unsigned integer is returned.
5859 | If 'a' is negative, the value is rounded and zero is returned; negative
5860 | values that do not round to zero will raise the inexact exception.
5861 *----------------------------------------------------------------------------*/
5862
5863 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5864 {
5865     uint64_t v;
5866     uint32_t res;
5867     int old_exc_flags = get_float_exception_flags(status);
5868
5869     v = float128_to_uint64_round_to_zero(a, status);
5870     if (v > 0xffffffff) {
5871         res = 0xffffffff;
5872     } else {
5873         return v;
5874     }
5875     set_float_exception_flags(old_exc_flags, status);
5876     float_raise(float_flag_invalid, status);
5877     return res;
5878 }
5879
5880 /*----------------------------------------------------------------------------
5881 | Returns the result of converting the quadruple-precision floating-point
5882 | value `a' to the single-precision floating-point format.  The conversion
5883 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5884 | Arithmetic.
5885 *----------------------------------------------------------------------------*/
5886
5887 float32 float128_to_float32(float128 a, float_status *status)
5888 {
5889     flag aSign;
5890     int32_t aExp;
5891     uint64_t aSig0, aSig1;
5892     uint32_t zSig;
5893
5894     aSig1 = extractFloat128Frac1( a );
5895     aSig0 = extractFloat128Frac0( a );
5896     aExp = extractFloat128Exp( a );
5897     aSign = extractFloat128Sign( a );
5898     if ( aExp == 0x7FFF ) {
5899         if ( aSig0 | aSig1 ) {
5900             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5901         }
5902         return packFloat32( aSign, 0xFF, 0 );
5903     }
5904     aSig0 |= ( aSig1 != 0 );
5905     shift64RightJamming( aSig0, 18, &aSig0 );
5906     zSig = aSig0;
5907     if ( aExp || zSig ) {
5908         zSig |= 0x40000000;
5909         aExp -= 0x3F81;
5910     }
5911     return roundAndPackFloat32(aSign, aExp, zSig, status);
5912
5913 }
5914
5915 /*----------------------------------------------------------------------------
5916 | Returns the result of converting the quadruple-precision floating-point
5917 | value `a' to the double-precision floating-point format.  The conversion
5918 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5919 | Arithmetic.
5920 *----------------------------------------------------------------------------*/
5921
5922 float64 float128_to_float64(float128 a, float_status *status)
5923 {
5924     flag aSign;
5925     int32_t aExp;
5926     uint64_t aSig0, aSig1;
5927
5928     aSig1 = extractFloat128Frac1( a );
5929     aSig0 = extractFloat128Frac0( a );
5930     aExp = extractFloat128Exp( a );
5931     aSign = extractFloat128Sign( a );
5932     if ( aExp == 0x7FFF ) {
5933         if ( aSig0 | aSig1 ) {
5934             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5935         }
5936         return packFloat64( aSign, 0x7FF, 0 );
5937     }
5938     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5939     aSig0 |= ( aSig1 != 0 );
5940     if ( aExp || aSig0 ) {
5941         aSig0 |= LIT64( 0x4000000000000000 );
5942         aExp -= 0x3C01;
5943     }
5944     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5945
5946 }
5947
5948 /*----------------------------------------------------------------------------
5949 | Returns the result of converting the quadruple-precision floating-point
5950 | value `a' to the extended double-precision floating-point format.  The
5951 | conversion is performed according to the IEC/IEEE Standard for Binary
5952 | Floating-Point Arithmetic.
5953 *----------------------------------------------------------------------------*/
5954
5955 floatx80 float128_to_floatx80(float128 a, float_status *status)
5956 {
5957     flag aSign;
5958     int32_t aExp;
5959     uint64_t aSig0, aSig1;
5960
5961     aSig1 = extractFloat128Frac1( a );
5962     aSig0 = extractFloat128Frac0( a );
5963     aExp = extractFloat128Exp( a );
5964     aSign = extractFloat128Sign( a );
5965     if ( aExp == 0x7FFF ) {
5966         if ( aSig0 | aSig1 ) {
5967             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5968         }
5969         return packFloatx80(aSign, floatx80_infinity_high,
5970                                    floatx80_infinity_low);
5971     }
5972     if ( aExp == 0 ) {
5973         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5974         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5975     }
5976     else {
5977         aSig0 |= LIT64( 0x0001000000000000 );
5978     }
5979     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5980     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5981
5982 }
5983
5984 /*----------------------------------------------------------------------------
5985 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5986 | returns the result as a quadruple-precision floating-point value.  The
5987 | operation is performed according to the IEC/IEEE Standard for Binary
5988 | Floating-Point Arithmetic.
5989 *----------------------------------------------------------------------------*/
5990
5991 float128 float128_round_to_int(float128 a, float_status *status)
5992 {
5993     flag aSign;
5994     int32_t aExp;
5995     uint64_t lastBitMask, roundBitsMask;
5996     float128 z;
5997
5998     aExp = extractFloat128Exp( a );
5999     if ( 0x402F <= aExp ) {
6000         if ( 0x406F <= aExp ) {
6001             if (    ( aExp == 0x7FFF )
6002                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6003                ) {
6004                 return propagateFloat128NaN(a, a, status);
6005             }
6006             return a;
6007         }
6008         lastBitMask = 1;
6009         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6010         roundBitsMask = lastBitMask - 1;
6011         z = a;
6012         switch (status->float_rounding_mode) {
6013         case float_round_nearest_even:
6014             if ( lastBitMask ) {
6015                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6016                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6017             }
6018             else {
6019                 if ( (int64_t) z.low < 0 ) {
6020                     ++z.high;
6021                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6022                 }
6023             }
6024             break;
6025         case float_round_ties_away:
6026             if (lastBitMask) {
6027                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6028             } else {
6029                 if ((int64_t) z.low < 0) {
6030                     ++z.high;
6031                 }
6032             }
6033             break;
6034         case float_round_to_zero:
6035             break;
6036         case float_round_up:
6037             if (!extractFloat128Sign(z)) {
6038                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6039             }
6040             break;
6041         case float_round_down:
6042             if (extractFloat128Sign(z)) {
6043                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6044             }
6045             break;
6046         default:
6047             abort();
6048         }
6049         z.low &= ~ roundBitsMask;
6050     }
6051     else {
6052         if ( aExp < 0x3FFF ) {
6053             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6054             status->float_exception_flags |= float_flag_inexact;
6055             aSign = extractFloat128Sign( a );
6056             switch (status->float_rounding_mode) {
6057              case float_round_nearest_even:
6058                 if (    ( aExp == 0x3FFE )
6059                      && (   extractFloat128Frac0( a )
6060                           | extractFloat128Frac1( a ) )
6061                    ) {
6062                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6063                 }
6064                 break;
6065             case float_round_ties_away:
6066                 if (aExp == 0x3FFE) {
6067                     return packFloat128(aSign, 0x3FFF, 0, 0);
6068                 }
6069                 break;
6070              case float_round_down:
6071                 return
6072                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6073                     : packFloat128( 0, 0, 0, 0 );
6074              case float_round_up:
6075                 return
6076                       aSign ? packFloat128( 1, 0, 0, 0 )
6077                     : packFloat128( 0, 0x3FFF, 0, 0 );
6078             }
6079             return packFloat128( aSign, 0, 0, 0 );
6080         }
6081         lastBitMask = 1;
6082         lastBitMask <<= 0x402F - aExp;
6083         roundBitsMask = lastBitMask - 1;
6084         z.low = 0;
6085         z.high = a.high;
6086         switch (status->float_rounding_mode) {
6087         case float_round_nearest_even:
6088             z.high += lastBitMask>>1;
6089             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6090                 z.high &= ~ lastBitMask;
6091             }
6092             break;
6093         case float_round_ties_away:
6094             z.high += lastBitMask>>1;
6095             break;
6096         case float_round_to_zero:
6097             break;
6098         case float_round_up:
6099             if (!extractFloat128Sign(z)) {
6100                 z.high |= ( a.low != 0 );
6101                 z.high += roundBitsMask;
6102             }
6103             break;
6104         case float_round_down:
6105             if (extractFloat128Sign(z)) {
6106                 z.high |= (a.low != 0);
6107                 z.high += roundBitsMask;
6108             }
6109             break;
6110         default:
6111             abort();
6112         }
6113         z.high &= ~ roundBitsMask;
6114     }
6115     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6116         status->float_exception_flags |= float_flag_inexact;
6117     }
6118     return z;
6119
6120 }
6121
6122 /*----------------------------------------------------------------------------
6123 | Returns the result of adding the absolute values of the quadruple-precision
6124 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6125 | before being returned.  `zSign' is ignored if the result is a NaN.
6126 | The addition is performed according to the IEC/IEEE Standard for Binary
6127 | Floating-Point Arithmetic.
6128 *----------------------------------------------------------------------------*/
6129
6130 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6131                                 float_status *status)
6132 {
6133     int32_t aExp, bExp, zExp;
6134     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6135     int32_t expDiff;
6136
6137     aSig1 = extractFloat128Frac1( a );
6138     aSig0 = extractFloat128Frac0( a );
6139     aExp = extractFloat128Exp( a );
6140     bSig1 = extractFloat128Frac1( b );
6141     bSig0 = extractFloat128Frac0( b );
6142     bExp = extractFloat128Exp( b );
6143     expDiff = aExp - bExp;
6144     if ( 0 < expDiff ) {
6145         if ( aExp == 0x7FFF ) {
6146             if (aSig0 | aSig1) {
6147                 return propagateFloat128NaN(a, b, status);
6148             }
6149             return a;
6150         }
6151         if ( bExp == 0 ) {
6152             --expDiff;
6153         }
6154         else {
6155             bSig0 |= LIT64( 0x0001000000000000 );
6156         }
6157         shift128ExtraRightJamming(
6158             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6159         zExp = aExp;
6160     }
6161     else if ( expDiff < 0 ) {
6162         if ( bExp == 0x7FFF ) {
6163             if (bSig0 | bSig1) {
6164                 return propagateFloat128NaN(a, b, status);
6165             }
6166             return packFloat128( zSign, 0x7FFF, 0, 0 );
6167         }
6168         if ( aExp == 0 ) {
6169             ++expDiff;
6170         }
6171         else {
6172             aSig0 |= LIT64( 0x0001000000000000 );
6173         }
6174         shift128ExtraRightJamming(
6175             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6176         zExp = bExp;
6177     }
6178     else {
6179         if ( aExp == 0x7FFF ) {
6180             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6181                 return propagateFloat128NaN(a, b, status);
6182             }
6183             return a;
6184         }
6185         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6186         if ( aExp == 0 ) {
6187             if (status->flush_to_zero) {
6188                 if (zSig0 | zSig1) {
6189                     float_raise(float_flag_output_denormal, status);
6190                 }
6191                 return packFloat128(zSign, 0, 0, 0);
6192             }
6193             return packFloat128( zSign, 0, zSig0, zSig1 );
6194         }
6195         zSig2 = 0;
6196         zSig0 |= LIT64( 0x0002000000000000 );
6197         zExp = aExp;
6198         goto shiftRight1;
6199     }
6200     aSig0 |= LIT64( 0x0001000000000000 );
6201     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6202     --zExp;
6203     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6204     ++zExp;
6205  shiftRight1:
6206     shift128ExtraRightJamming(
6207         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6208  roundAndPack:
6209     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6210
6211 }
6212
6213 /*----------------------------------------------------------------------------
6214 | Returns the result of subtracting the absolute values of the quadruple-
6215 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6216 | difference is negated before being returned.  `zSign' is ignored if the
6217 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6218 | Standard for Binary Floating-Point Arithmetic.
6219 *----------------------------------------------------------------------------*/
6220
6221 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6222                                 float_status *status)
6223 {
6224     int32_t aExp, bExp, zExp;
6225     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6226     int32_t expDiff;
6227
6228     aSig1 = extractFloat128Frac1( a );
6229     aSig0 = extractFloat128Frac0( a );
6230     aExp = extractFloat128Exp( a );
6231     bSig1 = extractFloat128Frac1( b );
6232     bSig0 = extractFloat128Frac0( b );
6233     bExp = extractFloat128Exp( b );
6234     expDiff = aExp - bExp;
6235     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6236     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6237     if ( 0 < expDiff ) goto aExpBigger;
6238     if ( expDiff < 0 ) goto bExpBigger;
6239     if ( aExp == 0x7FFF ) {
6240         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6241             return propagateFloat128NaN(a, b, status);
6242         }
6243         float_raise(float_flag_invalid, status);
6244         return float128_default_nan(status);
6245     }
6246     if ( aExp == 0 ) {
6247         aExp = 1;
6248         bExp = 1;
6249     }
6250     if ( bSig0 < aSig0 ) goto aBigger;
6251     if ( aSig0 < bSig0 ) goto bBigger;
6252     if ( bSig1 < aSig1 ) goto aBigger;
6253     if ( aSig1 < bSig1 ) goto bBigger;
6254     return packFloat128(status->float_rounding_mode == float_round_down,
6255                         0, 0, 0);
6256  bExpBigger:
6257     if ( bExp == 0x7FFF ) {
6258         if (bSig0 | bSig1) {
6259             return propagateFloat128NaN(a, b, status);
6260         }
6261         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6262     }
6263     if ( aExp == 0 ) {
6264         ++expDiff;
6265     }
6266     else {
6267         aSig0 |= LIT64( 0x4000000000000000 );
6268     }
6269     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6270     bSig0 |= LIT64( 0x4000000000000000 );
6271  bBigger:
6272     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6273     zExp = bExp;
6274     zSign ^= 1;
6275     goto normalizeRoundAndPack;
6276  aExpBigger:
6277     if ( aExp == 0x7FFF ) {
6278         if (aSig0 | aSig1) {
6279             return propagateFloat128NaN(a, b, status);
6280         }
6281         return a;
6282     }
6283     if ( bExp == 0 ) {
6284         --expDiff;
6285     }
6286     else {
6287         bSig0 |= LIT64( 0x4000000000000000 );
6288     }
6289     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6290     aSig0 |= LIT64( 0x4000000000000000 );
6291  aBigger:
6292     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6293     zExp = aExp;
6294  normalizeRoundAndPack:
6295     --zExp;
6296     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6297                                          status);
6298
6299 }
6300
6301 /*----------------------------------------------------------------------------
6302 | Returns the result of adding the quadruple-precision floating-point values
6303 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6304 | for Binary Floating-Point Arithmetic.
6305 *----------------------------------------------------------------------------*/
6306
6307 float128 float128_add(float128 a, float128 b, float_status *status)
6308 {
6309     flag aSign, bSign;
6310
6311     aSign = extractFloat128Sign( a );
6312     bSign = extractFloat128Sign( b );
6313     if ( aSign == bSign ) {
6314         return addFloat128Sigs(a, b, aSign, status);
6315     }
6316     else {
6317         return subFloat128Sigs(a, b, aSign, status);
6318     }
6319
6320 }
6321
6322 /*----------------------------------------------------------------------------
6323 | Returns the result of subtracting the quadruple-precision floating-point
6324 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6325 | Standard for Binary Floating-Point Arithmetic.
6326 *----------------------------------------------------------------------------*/
6327
6328 float128 float128_sub(float128 a, float128 b, float_status *status)
6329 {
6330     flag aSign, bSign;
6331
6332     aSign = extractFloat128Sign( a );
6333     bSign = extractFloat128Sign( b );
6334     if ( aSign == bSign ) {
6335         return subFloat128Sigs(a, b, aSign, status);
6336     }
6337     else {
6338         return addFloat128Sigs(a, b, aSign, status);
6339     }
6340
6341 }
6342
6343 /*----------------------------------------------------------------------------
6344 | Returns the result of multiplying the quadruple-precision floating-point
6345 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6346 | Standard for Binary Floating-Point Arithmetic.
6347 *----------------------------------------------------------------------------*/
6348
6349 float128 float128_mul(float128 a, float128 b, float_status *status)
6350 {
6351     flag aSign, bSign, zSign;
6352     int32_t aExp, bExp, zExp;
6353     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6354
6355     aSig1 = extractFloat128Frac1( a );
6356     aSig0 = extractFloat128Frac0( a );
6357     aExp = extractFloat128Exp( a );
6358     aSign = extractFloat128Sign( a );
6359     bSig1 = extractFloat128Frac1( b );
6360     bSig0 = extractFloat128Frac0( b );
6361     bExp = extractFloat128Exp( b );
6362     bSign = extractFloat128Sign( b );
6363     zSign = aSign ^ bSign;
6364     if ( aExp == 0x7FFF ) {
6365         if (    ( aSig0 | aSig1 )
6366              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6367             return propagateFloat128NaN(a, b, status);
6368         }
6369         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6370         return packFloat128( zSign, 0x7FFF, 0, 0 );
6371     }
6372     if ( bExp == 0x7FFF ) {
6373         if (bSig0 | bSig1) {
6374             return propagateFloat128NaN(a, b, status);
6375         }
6376         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6377  invalid:
6378             float_raise(float_flag_invalid, status);
6379             return float128_default_nan(status);
6380         }
6381         return packFloat128( zSign, 0x7FFF, 0, 0 );
6382     }
6383     if ( aExp == 0 ) {
6384         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6385         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6386     }
6387     if ( bExp == 0 ) {
6388         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6389         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6390     }
6391     zExp = aExp + bExp - 0x4000;
6392     aSig0 |= LIT64( 0x0001000000000000 );
6393     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6394     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6395     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6396     zSig2 |= ( zSig3 != 0 );
6397     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6398         shift128ExtraRightJamming(
6399             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6400         ++zExp;
6401     }
6402     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6403
6404 }
6405
6406 /*----------------------------------------------------------------------------
6407 | Returns the result of dividing the quadruple-precision floating-point value
6408 | `a' by the corresponding value `b'.  The operation is performed according to
6409 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6410 *----------------------------------------------------------------------------*/
6411
6412 float128 float128_div(float128 a, float128 b, float_status *status)
6413 {
6414     flag aSign, bSign, zSign;
6415     int32_t aExp, bExp, zExp;
6416     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6417     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6418
6419     aSig1 = extractFloat128Frac1( a );
6420     aSig0 = extractFloat128Frac0( a );
6421     aExp = extractFloat128Exp( a );
6422     aSign = extractFloat128Sign( a );
6423     bSig1 = extractFloat128Frac1( b );
6424     bSig0 = extractFloat128Frac0( b );
6425     bExp = extractFloat128Exp( b );
6426     bSign = extractFloat128Sign( b );
6427     zSign = aSign ^ bSign;
6428     if ( aExp == 0x7FFF ) {
6429         if (aSig0 | aSig1) {
6430             return propagateFloat128NaN(a, b, status);
6431         }
6432         if ( bExp == 0x7FFF ) {
6433             if (bSig0 | bSig1) {
6434                 return propagateFloat128NaN(a, b, status);
6435             }
6436             goto invalid;
6437         }
6438         return packFloat128( zSign, 0x7FFF, 0, 0 );
6439     }
6440     if ( bExp == 0x7FFF ) {
6441         if (bSig0 | bSig1) {
6442             return propagateFloat128NaN(a, b, status);
6443         }
6444         return packFloat128( zSign, 0, 0, 0 );
6445     }
6446     if ( bExp == 0 ) {
6447         if ( ( bSig0 | bSig1 ) == 0 ) {
6448             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6449  invalid:
6450                 float_raise(float_flag_invalid, status);
6451                 return float128_default_nan(status);
6452             }
6453             float_raise(float_flag_divbyzero, status);
6454             return packFloat128( zSign, 0x7FFF, 0, 0 );
6455         }
6456         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6457     }
6458     if ( aExp == 0 ) {
6459         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6460         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6461     }
6462     zExp = aExp - bExp + 0x3FFD;
6463     shortShift128Left(
6464         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6465     shortShift128Left(
6466         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6467     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6468         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6469         ++zExp;
6470     }
6471     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6472     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6473     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6474     while ( (int64_t) rem0 < 0 ) {
6475         --zSig0;
6476         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6477     }
6478     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6479     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6480         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6481         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6482         while ( (int64_t) rem1 < 0 ) {
6483             --zSig1;
6484             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6485         }
6486         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6487     }
6488     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6489     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6490
6491 }
6492
6493 /*----------------------------------------------------------------------------
6494 | Returns the remainder of the quadruple-precision floating-point value `a'
6495 | with respect to the corresponding value `b'.  The operation is performed
6496 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6497 *----------------------------------------------------------------------------*/
6498
6499 float128 float128_rem(float128 a, float128 b, float_status *status)
6500 {
6501     flag aSign, zSign;
6502     int32_t aExp, bExp, expDiff;
6503     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6504     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6505     int64_t sigMean0;
6506
6507     aSig1 = extractFloat128Frac1( a );
6508     aSig0 = extractFloat128Frac0( a );
6509     aExp = extractFloat128Exp( a );
6510     aSign = extractFloat128Sign( a );
6511     bSig1 = extractFloat128Frac1( b );
6512     bSig0 = extractFloat128Frac0( b );
6513     bExp = extractFloat128Exp( b );
6514     if ( aExp == 0x7FFF ) {
6515         if (    ( aSig0 | aSig1 )
6516              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6517             return propagateFloat128NaN(a, b, status);
6518         }
6519         goto invalid;
6520     }
6521     if ( bExp == 0x7FFF ) {
6522         if (bSig0 | bSig1) {
6523             return propagateFloat128NaN(a, b, status);
6524         }
6525         return a;
6526     }
6527     if ( bExp == 0 ) {
6528         if ( ( bSig0 | bSig1 ) == 0 ) {
6529  invalid:
6530             float_raise(float_flag_invalid, status);
6531             return float128_default_nan(status);
6532         }
6533         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6534     }
6535     if ( aExp == 0 ) {
6536         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6537         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6538     }
6539     expDiff = aExp - bExp;
6540     if ( expDiff < -1 ) return a;
6541     shortShift128Left(
6542         aSig0 | LIT64( 0x0001000000000000 ),
6543         aSig1,
6544         15 - ( expDiff < 0 ),
6545         &aSig0,
6546         &aSig1
6547     );
6548     shortShift128Left(
6549         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6550     q = le128( bSig0, bSig1, aSig0, aSig1 );
6551     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6552     expDiff -= 64;
6553     while ( 0 < expDiff ) {
6554         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6555         q = ( 4 < q ) ? q - 4 : 0;
6556         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6557         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6558         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6559         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6560         expDiff -= 61;
6561     }
6562     if ( -64 < expDiff ) {
6563         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6564         q = ( 4 < q ) ? q - 4 : 0;
6565         q >>= - expDiff;
6566         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6567         expDiff += 52;
6568         if ( expDiff < 0 ) {
6569             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6570         }
6571         else {
6572             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6573         }
6574         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6575         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6576     }
6577     else {
6578         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6579         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6580     }
6581     do {
6582         alternateASig0 = aSig0;
6583         alternateASig1 = aSig1;
6584         ++q;
6585         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6586     } while ( 0 <= (int64_t) aSig0 );
6587     add128(
6588         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6589     if (    ( sigMean0 < 0 )
6590          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6591         aSig0 = alternateASig0;
6592         aSig1 = alternateASig1;
6593     }
6594     zSign = ( (int64_t) aSig0 < 0 );
6595     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6596     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6597                                          status);
6598 }
6599
6600 /*----------------------------------------------------------------------------
6601 | Returns the square root of the quadruple-precision floating-point value `a'.
6602 | The operation is performed according to the IEC/IEEE Standard for Binary
6603 | Floating-Point Arithmetic.
6604 *----------------------------------------------------------------------------*/
6605
6606 float128 float128_sqrt(float128 a, float_status *status)
6607 {
6608     flag aSign;
6609     int32_t aExp, zExp;
6610     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6611     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6612
6613     aSig1 = extractFloat128Frac1( a );
6614     aSig0 = extractFloat128Frac0( a );
6615     aExp = extractFloat128Exp( a );
6616     aSign = extractFloat128Sign( a );
6617     if ( aExp == 0x7FFF ) {
6618         if (aSig0 | aSig1) {
6619             return propagateFloat128NaN(a, a, status);
6620         }
6621         if ( ! aSign ) return a;
6622         goto invalid;
6623     }
6624     if ( aSign ) {
6625         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6626  invalid:
6627         float_raise(float_flag_invalid, status);
6628         return float128_default_nan(status);
6629     }
6630     if ( aExp == 0 ) {
6631         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6632         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6633     }
6634     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6635     aSig0 |= LIT64( 0x0001000000000000 );
6636     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6637     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6638     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6639     doubleZSig0 = zSig0<<1;
6640     mul64To128( zSig0, zSig0, &term0, &term1 );
6641     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6642     while ( (int64_t) rem0 < 0 ) {
6643         --zSig0;
6644         doubleZSig0 -= 2;
6645         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6646     }
6647     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6648     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6649         if ( zSig1 == 0 ) zSig1 = 1;
6650         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6651         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6652         mul64To128( zSig1, zSig1, &term2, &term3 );
6653         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6654         while ( (int64_t) rem1 < 0 ) {
6655             --zSig1;
6656             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6657             term3 |= 1;
6658             term2 |= doubleZSig0;
6659             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6660         }
6661         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6662     }
6663     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6664     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6665
6666 }
6667
6668 /*----------------------------------------------------------------------------
6669 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6670 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6671 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6672 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6673 *----------------------------------------------------------------------------*/
6674
6675 int float128_eq(float128 a, float128 b, float_status *status)
6676 {
6677
6678     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6679               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6680          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6681               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6682        ) {
6683         float_raise(float_flag_invalid, status);
6684         return 0;
6685     }
6686     return
6687            ( a.low == b.low )
6688         && (    ( a.high == b.high )
6689              || (    ( a.low == 0 )
6690                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6691            );
6692
6693 }
6694
6695 /*----------------------------------------------------------------------------
6696 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6697 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6698 | exception is raised if either operand is a NaN.  The comparison is performed
6699 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6700 *----------------------------------------------------------------------------*/
6701
6702 int float128_le(float128 a, float128 b, float_status *status)
6703 {
6704     flag aSign, bSign;
6705
6706     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6707               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6708          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6709               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6710        ) {
6711         float_raise(float_flag_invalid, status);
6712         return 0;
6713     }
6714     aSign = extractFloat128Sign( a );
6715     bSign = extractFloat128Sign( b );
6716     if ( aSign != bSign ) {
6717         return
6718                aSign
6719             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6720                  == 0 );
6721     }
6722     return
6723           aSign ? le128( b.high, b.low, a.high, a.low )
6724         : le128( a.high, a.low, b.high, b.low );
6725
6726 }
6727
6728 /*----------------------------------------------------------------------------
6729 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6730 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6731 | raised if either operand is a NaN.  The comparison is performed according
6732 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6733 *----------------------------------------------------------------------------*/
6734
6735 int float128_lt(float128 a, float128 b, float_status *status)
6736 {
6737     flag aSign, bSign;
6738
6739     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6740               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6741          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6742               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6743        ) {
6744         float_raise(float_flag_invalid, status);
6745         return 0;
6746     }
6747     aSign = extractFloat128Sign( a );
6748     bSign = extractFloat128Sign( b );
6749     if ( aSign != bSign ) {
6750         return
6751                aSign
6752             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6753                  != 0 );
6754     }
6755     return
6756           aSign ? lt128( b.high, b.low, a.high, a.low )
6757         : lt128( a.high, a.low, b.high, b.low );
6758
6759 }
6760
6761 /*----------------------------------------------------------------------------
6762 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6763 | be compared, and 0 otherwise.  The invalid exception is raised if either
6764 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6765 | Standard for Binary Floating-Point Arithmetic.
6766 *----------------------------------------------------------------------------*/
6767
6768 int float128_unordered(float128 a, float128 b, float_status *status)
6769 {
6770     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6771               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6772          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6773               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6774        ) {
6775         float_raise(float_flag_invalid, status);
6776         return 1;
6777     }
6778     return 0;
6779 }
6780
6781 /*----------------------------------------------------------------------------
6782 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6783 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6784 | exception.  The comparison is performed according to the IEC/IEEE Standard
6785 | for Binary Floating-Point Arithmetic.
6786 *----------------------------------------------------------------------------*/
6787
6788 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6789 {
6790
6791     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6792               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6793          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6794               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6795        ) {
6796         if (float128_is_signaling_nan(a, status)
6797          || float128_is_signaling_nan(b, status)) {
6798             float_raise(float_flag_invalid, status);
6799         }
6800         return 0;
6801     }
6802     return
6803            ( a.low == b.low )
6804         && (    ( a.high == b.high )
6805              || (    ( a.low == 0 )
6806                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6807            );
6808
6809 }
6810
6811 /*----------------------------------------------------------------------------
6812 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6813 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6814 | cause an exception.  Otherwise, the comparison is performed according to the
6815 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6816 *----------------------------------------------------------------------------*/
6817
6818 int float128_le_quiet(float128 a, float128 b, float_status *status)
6819 {
6820     flag aSign, bSign;
6821
6822     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6823               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6824          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6825               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6826        ) {
6827         if (float128_is_signaling_nan(a, status)
6828          || float128_is_signaling_nan(b, status)) {
6829             float_raise(float_flag_invalid, status);
6830         }
6831         return 0;
6832     }
6833     aSign = extractFloat128Sign( a );
6834     bSign = extractFloat128Sign( b );
6835     if ( aSign != bSign ) {
6836         return
6837                aSign
6838             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6839                  == 0 );
6840     }
6841     return
6842           aSign ? le128( b.high, b.low, a.high, a.low )
6843         : le128( a.high, a.low, b.high, b.low );
6844
6845 }
6846
6847 /*----------------------------------------------------------------------------
6848 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6849 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6850 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6851 | Standard for Binary Floating-Point Arithmetic.
6852 *----------------------------------------------------------------------------*/
6853
6854 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6855 {
6856     flag aSign, bSign;
6857
6858     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6859               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6860          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6861               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6862        ) {
6863         if (float128_is_signaling_nan(a, status)
6864          || float128_is_signaling_nan(b, status)) {
6865             float_raise(float_flag_invalid, status);
6866         }
6867         return 0;
6868     }
6869     aSign = extractFloat128Sign( a );
6870     bSign = extractFloat128Sign( b );
6871     if ( aSign != bSign ) {
6872         return
6873                aSign
6874             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6875                  != 0 );
6876     }
6877     return
6878           aSign ? lt128( b.high, b.low, a.high, a.low )
6879         : lt128( a.high, a.low, b.high, b.low );
6880
6881 }
6882
6883 /*----------------------------------------------------------------------------
6884 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6885 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6886 | comparison is performed according to the IEC/IEEE Standard for Binary
6887 | Floating-Point Arithmetic.
6888 *----------------------------------------------------------------------------*/
6889
6890 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6891 {
6892     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6893               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6894          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6895               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6896        ) {
6897         if (float128_is_signaling_nan(a, status)
6898          || float128_is_signaling_nan(b, status)) {
6899             float_raise(float_flag_invalid, status);
6900         }
6901         return 1;
6902     }
6903     return 0;
6904 }
6905
6906 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6907                                             int is_quiet, float_status *status)
6908 {
6909     flag aSign, bSign;
6910
6911     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6912         float_raise(float_flag_invalid, status);
6913         return float_relation_unordered;
6914     }
6915     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6916           ( extractFloatx80Frac( a )<<1 ) ) ||
6917         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6918           ( extractFloatx80Frac( b )<<1 ) )) {
6919         if (!is_quiet ||
6920             floatx80_is_signaling_nan(a, status) ||
6921             floatx80_is_signaling_nan(b, status)) {
6922             float_raise(float_flag_invalid, status);
6923         }
6924         return float_relation_unordered;
6925     }
6926     aSign = extractFloatx80Sign( a );
6927     bSign = extractFloatx80Sign( b );
6928     if ( aSign != bSign ) {
6929
6930         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6931              ( ( a.low | b.low ) == 0 ) ) {
6932             /* zero case */
6933             return float_relation_equal;
6934         } else {
6935             return 1 - (2 * aSign);
6936         }
6937     } else {
6938         if (a.low == b.low && a.high == b.high) {
6939             return float_relation_equal;
6940         } else {
6941             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6942         }
6943     }
6944 }
6945
6946 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6947 {
6948     return floatx80_compare_internal(a, b, 0, status);
6949 }
6950
6951 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6952 {
6953     return floatx80_compare_internal(a, b, 1, status);
6954 }
6955
6956 static inline int float128_compare_internal(float128 a, float128 b,
6957                                             int is_quiet, float_status *status)
6958 {
6959     flag aSign, bSign;
6960
6961     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6962           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6963         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6964           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6965         if (!is_quiet ||
6966             float128_is_signaling_nan(a, status) ||
6967             float128_is_signaling_nan(b, status)) {
6968             float_raise(float_flag_invalid, status);
6969         }
6970         return float_relation_unordered;
6971     }
6972     aSign = extractFloat128Sign( a );
6973     bSign = extractFloat128Sign( b );
6974     if ( aSign != bSign ) {
6975         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6976             /* zero case */
6977             return float_relation_equal;
6978         } else {
6979             return 1 - (2 * aSign);
6980         }
6981     } else {
6982         if (a.low == b.low && a.high == b.high) {
6983             return float_relation_equal;
6984         } else {
6985             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6986         }
6987     }
6988 }
6989
6990 int float128_compare(float128 a, float128 b, float_status *status)
6991 {
6992     return float128_compare_internal(a, b, 0, status);
6993 }
6994
6995 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6996 {
6997     return float128_compare_internal(a, b, 1, status);
6998 }
6999
7000 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7001 {
7002     flag aSign;
7003     int32_t aExp;
7004     uint64_t aSig;
7005
7006     if (floatx80_invalid_encoding(a)) {
7007         float_raise(float_flag_invalid, status);
7008         return floatx80_default_nan(status);
7009     }
7010     aSig = extractFloatx80Frac( a );
7011     aExp = extractFloatx80Exp( a );
7012     aSign = extractFloatx80Sign( a );
7013
7014     if ( aExp == 0x7FFF ) {
7015         if ( aSig<<1 ) {
7016             return propagateFloatx80NaN(a, a, status);
7017         }
7018         return a;
7019     }
7020
7021     if (aExp == 0) {
7022         if (aSig == 0) {
7023             return a;
7024         }
7025         aExp++;
7026     }
7027
7028     if (n > 0x10000) {
7029         n = 0x10000;
7030     } else if (n < -0x10000) {
7031         n = -0x10000;
7032     }
7033
7034     aExp += n;
7035     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7036                                          aSign, aExp, aSig, 0, status);
7037 }
7038
7039 float128 float128_scalbn(float128 a, int n, float_status *status)
7040 {
7041     flag aSign;
7042     int32_t aExp;
7043     uint64_t aSig0, aSig1;
7044
7045     aSig1 = extractFloat128Frac1( a );
7046     aSig0 = extractFloat128Frac0( a );
7047     aExp = extractFloat128Exp( a );
7048     aSign = extractFloat128Sign( a );
7049     if ( aExp == 0x7FFF ) {
7050         if ( aSig0 | aSig1 ) {
7051             return propagateFloat128NaN(a, a, status);
7052         }
7053         return a;
7054     }
7055     if (aExp != 0) {
7056         aSig0 |= LIT64( 0x0001000000000000 );
7057     } else if (aSig0 == 0 && aSig1 == 0) {
7058         return a;
7059     } else {
7060         aExp++;
7061     }
7062
7063     if (n > 0x10000) {
7064         n = 0x10000;
7065     } else if (n < -0x10000) {
7066         n = -0x10000;
7067     }
7068
7069     aExp += n - 1;
7070     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7071                                          , status);
7072
7073 }