fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include "qemu/bitops.h"
  87 #include "fpu/softfloat.h"
  88
  89 /* We only need stdlib for abort() */
  90
  91 /*----------------------------------------------------------------------------
  92 | Primitive arithmetic functions, including multi-word arithmetic, and
  93 | division and square root approximations.  (Can be specialized to target if
  94 | desired.)
  95 *----------------------------------------------------------------------------*/
  96 #include "fpu/softfloat-macros.h"
  97
  98 /*----------------------------------------------------------------------------
  99 | Functions and definitions to determine:  (1) whether tininess for underflow
 100 | is detected before or after rounding by default, (2) what (if anything)
 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 103 | are propagated from function inputs to output.  These details are target-
 104 | specific.
 105 *----------------------------------------------------------------------------*/
 106 #include "softfloat-specialize.h"
 107
 108 /*----------------------------------------------------------------------------
 109 | Returns the fraction bits of the half-precision floating-point value `a'.
 110 *----------------------------------------------------------------------------*/
 111
 112 static inline uint32_t extractFloat16Frac(float16 a)
 113 {
 114     return float16_val(a) & 0x3ff;
 115 }
 116
 117 /*----------------------------------------------------------------------------
 118 | Returns the exponent bits of the half-precision floating-point value `a'.
 119 *----------------------------------------------------------------------------*/
 120
 121 static inline int extractFloat16Exp(float16 a)
 122 {
 123     return (float16_val(a) >> 10) & 0x1f;
 124 }
 125
 126 /*----------------------------------------------------------------------------
 127 | Returns the sign bit of the single-precision floating-point value `a'.
 128 *----------------------------------------------------------------------------*/
 129
 130 static inline flag extractFloat16Sign(float16 a)
 131 {
 132     return float16_val(a)>>15;
 133 }
 134
 135 /*----------------------------------------------------------------------------
 136 | Returns the fraction bits of the single-precision floating-point value `a'.
 137 *----------------------------------------------------------------------------*/
 138
 139 static inline uint32_t extractFloat32Frac(float32 a)
 140 {
 141     return float32_val(a) & 0x007FFFFF;
 142 }
 143
 144 /*----------------------------------------------------------------------------
 145 | Returns the exponent bits of the single-precision floating-point value `a'.
 146 *----------------------------------------------------------------------------*/
 147
 148 static inline int extractFloat32Exp(float32 a)
 149 {
 150     return (float32_val(a) >> 23) & 0xFF;
 151 }
 152
 153 /*----------------------------------------------------------------------------
 154 | Returns the sign bit of the single-precision floating-point value `a'.
 155 *----------------------------------------------------------------------------*/
 156
 157 static inline flag extractFloat32Sign(float32 a)
 158 {
 159     return float32_val(a) >> 31;
 160 }
 161
 162 /*----------------------------------------------------------------------------
 163 | Returns the fraction bits of the double-precision floating-point value `a'.
 164 *----------------------------------------------------------------------------*/
 165
 166 static inline uint64_t extractFloat64Frac(float64 a)
 167 {
 168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 169 }
 170
 171 /*----------------------------------------------------------------------------
 172 | Returns the exponent bits of the double-precision floating-point value `a'.
 173 *----------------------------------------------------------------------------*/
 174
 175 static inline int extractFloat64Exp(float64 a)
 176 {
 177     return (float64_val(a) >> 52) & 0x7FF;
 178 }
 179
 180 /*----------------------------------------------------------------------------
 181 | Returns the sign bit of the double-precision floating-point value `a'.
 182 *----------------------------------------------------------------------------*/
 183
 184 static inline flag extractFloat64Sign(float64 a)
 185 {
 186     return float64_val(a) >> 63;
 187 }
 188
 189 /*
 190  * Classify a floating point number. Everything above float_class_qnan
 191  * is a NaN so cls >= float_class_qnan is any NaN.
 192  */
 193
 194 typedef enum __attribute__ ((__packed__)) {
 195     float_class_unclassified,
 196     float_class_zero,
 197     float_class_normal,
 198     float_class_inf,
 199     float_class_qnan,  /* all NaNs from here */
 200     float_class_snan,
 201     float_class_dnan,
 202     float_class_msnan, /* maybe silenced */
 203 } FloatClass;
 204
 205 /*
 206  * Structure holding all of the decomposed parts of a float. The
 207  * exponent is unbiased and the fraction is normalized. All
 208  * calculations are done with a 64 bit fraction and then rounded as
 209  * appropriate for the final format.
 210  *
 211  * Thanks to the packed FloatClass a decent compiler should be able to
 212  * fit the whole structure into registers and avoid using the stack
 213  * for parameter passing.
 214  */
 215
 216 typedef struct {
 217     uint64_t frac;
 218     int32_t  exp;
 219     FloatClass cls;
 220     bool sign;
 221 } FloatParts;
 222
 223 #define DECOMPOSED_BINARY_POINT    (64 - 2)
 224 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 225 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 226
 227 /* Structure holding all of the relevant parameters for a format.
 228  *   exp_size: the size of the exponent field
 229  *   exp_bias: the offset applied to the exponent field
 230  *   exp_max: the maximum normalised exponent
 231  *   frac_size: the size of the fraction field
 232  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 233  * The following are computed based the size of fraction
 234  *   frac_lsb: least significant bit of fraction
 235  *   fram_lsbm1: the bit bellow the least significant bit (for rounding)
 236  *   round_mask/roundeven_mask: masks used for rounding
 237  */
 238 typedef struct {
 239     int exp_size;
 240     int exp_bias;
 241     int exp_max;
 242     int frac_size;
 243     int frac_shift;
 244     uint64_t frac_lsb;
 245     uint64_t frac_lsbm1;
 246     uint64_t round_mask;
 247     uint64_t roundeven_mask;
 248 } FloatFmt;
 249
 250 /* Expand fields based on the size of exponent and fraction */
 251 #define FLOAT_PARAMS(E, F)                                           \
 252     .exp_size       = E,                                             \
 253     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 254     .exp_max        = (1 << E) - 1,                                  \
 255     .frac_size      = F,                                             \
 256     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 257     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 258     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 259     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 260     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 261
 262 static const FloatFmt float16_params = {
 263     FLOAT_PARAMS(5, 10)
 264 };
 265
 266 static const FloatFmt float32_params = {
 267     FLOAT_PARAMS(8, 23)
 268 };
 269
 270 static const FloatFmt float64_params = {
 271     FLOAT_PARAMS(11, 52)
 272 };
 273
 274 /* Unpack a float to parts, but do not canonicalize.  */
 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 276 {
 277     const int sign_pos = fmt.frac_size + fmt.exp_size;
 278
 279     return (FloatParts) {
 280         .cls = float_class_unclassified,
 281         .sign = extract64(raw, sign_pos, 1),
 282         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 283         .frac = extract64(raw, 0, fmt.frac_size),
 284     };
 285 }
 286
 287 static inline FloatParts float16_unpack_raw(float16 f)
 288 {
 289     return unpack_raw(float16_params, f);
 290 }
 291
 292 static inline FloatParts float32_unpack_raw(float32 f)
 293 {
 294     return unpack_raw(float32_params, f);
 295 }
 296
 297 static inline FloatParts float64_unpack_raw(float64 f)
 298 {
 299     return unpack_raw(float64_params, f);
 300 }
 301
 302 /* Pack a float from parts, but do not canonicalize.  */
 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 304 {
 305     const int sign_pos = fmt.frac_size + fmt.exp_size;
 306     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 307     return deposit64(ret, sign_pos, 1, p.sign);
 308 }
 309
 310 static inline float16 float16_pack_raw(FloatParts p)
 311 {
 312     return make_float16(pack_raw(float16_params, p));
 313 }
 314
 315 static inline float32 float32_pack_raw(FloatParts p)
 316 {
 317     return make_float32(pack_raw(float32_params, p));
 318 }
 319
 320 static inline float64 float64_pack_raw(FloatParts p)
 321 {
 322     return make_float64(pack_raw(float64_params, p));
 323 }
 324
 325 /* Canonicalize EXP and FRAC, setting CLS.  */
 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
 327                                float_status *status)
 328 {
 329     if (part.exp == parm->exp_max) {
 330         if (part.frac == 0) {
 331             part.cls = float_class_inf;
 332         } else {
 333 #ifdef NO_SIGNALING_NANS
 334             part.cls = float_class_qnan;
 335 #else
 336             int64_t msb = part.frac << (parm->frac_shift + 2);
 337             if ((msb < 0) == status->snan_bit_is_one) {
 338                 part.cls = float_class_snan;
 339             } else {
 340                 part.cls = float_class_qnan;
 341             }
 342 #endif
 343         }
 344     } else if (part.exp == 0) {
 345         if (likely(part.frac == 0)) {
 346             part.cls = float_class_zero;
 347         } else if (status->flush_inputs_to_zero) {
 348             float_raise(float_flag_input_denormal, status);
 349             part.cls = float_class_zero;
 350             part.frac = 0;
 351         } else {
 352             int shift = clz64(part.frac) - 1;
 353             part.cls = float_class_normal;
 354             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 355             part.frac <<= shift;
 356         }
 357     } else {
 358         part.cls = float_class_normal;
 359         part.exp -= parm->exp_bias;
 360         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 361     }
 362     return part;
 363 }
 364
 365 /* Round and uncanonicalize a floating-point number by parts. There
 366  * are FRAC_SHIFT bits that may require rounding at the bottom of the
 367  * fraction; these bits will be removed. The exponent will be biased
 368  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 369  */
 370
 371 static FloatParts round_canonical(FloatParts p, float_status *s,
 372                                   const FloatFmt *parm)
 373 {
 374     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 375     const uint64_t round_mask = parm->round_mask;
 376     const uint64_t roundeven_mask = parm->roundeven_mask;
 377     const int exp_max = parm->exp_max;
 378     const int frac_shift = parm->frac_shift;
 379     uint64_t frac, inc;
 380     int exp, flags = 0;
 381     bool overflow_norm;
 382
 383     frac = p.frac;
 384     exp = p.exp;
 385
 386     switch (p.cls) {
 387     case float_class_normal:
 388         switch (s->float_rounding_mode) {
 389         case float_round_nearest_even:
 390             overflow_norm = false;
 391             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 392             break;
 393         case float_round_ties_away:
 394             overflow_norm = false;
 395             inc = frac_lsbm1;
 396             break;
 397         case float_round_to_zero:
 398             overflow_norm = true;
 399             inc = 0;
 400             break;
 401         case float_round_up:
 402             inc = p.sign ? 0 : round_mask;
 403             overflow_norm = p.sign;
 404             break;
 405         case float_round_down:
 406             inc = p.sign ? round_mask : 0;
 407             overflow_norm = !p.sign;
 408             break;
 409         default:
 410             g_assert_not_reached();
 411         }
 412
 413         exp += parm->exp_bias;
 414         if (likely(exp > 0)) {
 415             if (frac & round_mask) {
 416                 flags |= float_flag_inexact;
 417                 frac += inc;
 418                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
 419                     frac >>= 1;
 420                     exp++;
 421                 }
 422             }
 423             frac >>= frac_shift;
 424
 425             if (unlikely(exp >= exp_max)) {
 426                 flags |= float_flag_overflow | float_flag_inexact;
 427                 if (overflow_norm) {
 428                     exp = exp_max - 1;
 429                     frac = -1;
 430                 } else {
 431                     p.cls = float_class_inf;
 432                     goto do_inf;
 433                 }
 434             }
 435         } else if (s->flush_to_zero) {
 436             flags |= float_flag_output_denormal;
 437             p.cls = float_class_zero;
 438             goto do_zero;
 439         } else {
 440             bool is_tiny = (s->float_detect_tininess
 441                             == float_tininess_before_rounding)
 442                         || (exp < 0)
 443                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 444
 445             shift64RightJamming(frac, 1 - exp, &frac);
 446             if (frac & round_mask) {
 447                 /* Need to recompute round-to-even.  */
 448                 if (s->float_rounding_mode == float_round_nearest_even) {
 449                     inc = ((frac & roundeven_mask) != frac_lsbm1
 450                            ? frac_lsbm1 : 0);
 451                 }
 452                 flags |= float_flag_inexact;
 453                 frac += inc;
 454             }
 455
 456             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 457             frac >>= frac_shift;
 458
 459             if (is_tiny && (flags & float_flag_inexact)) {
 460                 flags |= float_flag_underflow;
 461             }
 462             if (exp == 0 && frac == 0) {
 463                 p.cls = float_class_zero;
 464             }
 465         }
 466         break;
 467
 468     case float_class_zero:
 469     do_zero:
 470         exp = 0;
 471         frac = 0;
 472         break;
 473
 474     case float_class_inf:
 475     do_inf:
 476         exp = exp_max;
 477         frac = 0;
 478         break;
 479
 480     case float_class_qnan:
 481     case float_class_snan:
 482         exp = exp_max;
 483         break;
 484
 485     default:
 486         g_assert_not_reached();
 487     }
 488
 489     float_raise(flags, s);
 490     p.exp = exp;
 491     p.frac = frac;
 492     return p;
 493 }
 494
 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 496 {
 497     return canonicalize(float16_unpack_raw(f), &float16_params, s);
 498 }
 499
 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 501 {
 502     switch (p.cls) {
 503     case float_class_dnan:
 504         return float16_default_nan(s);
 505     case float_class_msnan:
 506         return float16_maybe_silence_nan(float16_pack_raw(p), s);
 507     default:
 508         p = round_canonical(p, s, &float16_params);
 509         return float16_pack_raw(p);
 510     }
 511 }
 512
 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 514 {
 515     return canonicalize(float32_unpack_raw(f), &float32_params, s);
 516 }
 517
 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 519 {
 520     switch (p.cls) {
 521     case float_class_dnan:
 522         return float32_default_nan(s);
 523     case float_class_msnan:
 524         return float32_maybe_silence_nan(float32_pack_raw(p), s);
 525     default:
 526         p = round_canonical(p, s, &float32_params);
 527         return float32_pack_raw(p);
 528     }
 529 }
 530
 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 532 {
 533     return canonicalize(float64_unpack_raw(f), &float64_params, s);
 534 }
 535
 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 537 {
 538     switch (p.cls) {
 539     case float_class_dnan:
 540         return float64_default_nan(s);
 541     case float_class_msnan:
 542         return float64_maybe_silence_nan(float64_pack_raw(p), s);
 543     default:
 544         p = round_canonical(p, s, &float64_params);
 545         return float64_pack_raw(p);
 546     }
 547 }
 548
 549 /* Simple helpers for checking if what NaN we have */
 550 static bool is_nan(FloatClass c)
 551 {
 552     return unlikely(c >= float_class_qnan);
 553 }
 554 static bool is_snan(FloatClass c)
 555 {
 556     return c == float_class_snan;
 557 }
 558 static bool is_qnan(FloatClass c)
 559 {
 560     return c == float_class_qnan;
 561 }
 562
 563 static FloatParts return_nan(FloatParts a, float_status *s)
 564 {
 565     switch (a.cls) {
 566     case float_class_snan:
 567         s->float_exception_flags |= float_flag_invalid;
 568         a.cls = float_class_msnan;
 569         /* fall through */
 570     case float_class_qnan:
 571         if (s->default_nan_mode) {
 572             a.cls = float_class_dnan;
 573         }
 574         break;
 575
 576     default:
 577         g_assert_not_reached();
 578     }
 579     return a;
 580 }
 581
 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 583 {
 584     if (is_snan(a.cls) || is_snan(b.cls)) {
 585         s->float_exception_flags |= float_flag_invalid;
 586     }
 587
 588     if (s->default_nan_mode) {
 589         a.cls = float_class_dnan;
 590     } else {
 591         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
 592                     is_qnan(b.cls), is_snan(b.cls),
 593                     a.frac > b.frac ||
 594                     (a.frac == b.frac && a.sign < b.sign))) {
 595             a = b;
 596         }
 597         a.cls = float_class_msnan;
 598     }
 599     return a;
 600 }
 601
 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 603                                   bool inf_zero, float_status *s)
 604 {
 605     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 606         s->float_exception_flags |= float_flag_invalid;
 607     }
 608
 609     if (s->default_nan_mode) {
 610         a.cls = float_class_dnan;
 611     } else {
 612         switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
 613                               is_qnan(b.cls), is_snan(b.cls),
 614                               is_qnan(c.cls), is_snan(c.cls),
 615                               inf_zero, s)) {
 616         case 0:
 617             break;
 618         case 1:
 619             a = b;
 620             break;
 621         case 2:
 622             a = c;
 623             break;
 624         case 3:
 625             a.cls = float_class_dnan;
 626             return a;
 627         default:
 628             g_assert_not_reached();
 629         }
 630
 631         a.cls = float_class_msnan;
 632     }
 633     return a;
 634 }
 635
 636 /*
 637  * Returns the result of adding or subtracting the values of the
 638  * floating-point values `a' and `b'. The operation is performed
 639  * according to the IEC/IEEE Standard for Binary Floating-Point
 640  * Arithmetic.
 641  */
 642
 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 644                                 float_status *s)
 645 {
 646     bool a_sign = a.sign;
 647     bool b_sign = b.sign ^ subtract;
 648
 649     if (a_sign != b_sign) {
 650         /* Subtraction */
 651
 652         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 653             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 654                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 655                 a.frac = a.frac - b.frac;
 656             } else {
 657                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 658                 a.frac = b.frac - a.frac;
 659                 a.exp = b.exp;
 660                 a_sign ^= 1;
 661             }
 662
 663             if (a.frac == 0) {
 664                 a.cls = float_class_zero;
 665                 a.sign = s->float_rounding_mode == float_round_down;
 666             } else {
 667                 int shift = clz64(a.frac) - 1;
 668                 a.frac = a.frac << shift;
 669                 a.exp = a.exp - shift;
 670                 a.sign = a_sign;
 671             }
 672             return a;
 673         }
 674         if (is_nan(a.cls) || is_nan(b.cls)) {
 675             return pick_nan(a, b, s);
 676         }
 677         if (a.cls == float_class_inf) {
 678             if (b.cls == float_class_inf) {
 679                 float_raise(float_flag_invalid, s);
 680                 a.cls = float_class_dnan;
 681             }
 682             return a;
 683         }
 684         if (a.cls == float_class_zero && b.cls == float_class_zero) {
 685             a.sign = s->float_rounding_mode == float_round_down;
 686             return a;
 687         }
 688         if (a.cls == float_class_zero || b.cls == float_class_inf) {
 689             b.sign = a_sign ^ 1;
 690             return b;
 691         }
 692         if (b.cls == float_class_zero) {
 693             return a;
 694         }
 695     } else {
 696         /* Addition */
 697         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 698             if (a.exp > b.exp) {
 699                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 700             } else if (a.exp < b.exp) {
 701                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 702                 a.exp = b.exp;
 703             }
 704             a.frac += b.frac;
 705             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
 706                 a.frac >>= 1;
 707                 a.exp += 1;
 708             }
 709             return a;
 710         }
 711         if (is_nan(a.cls) || is_nan(b.cls)) {
 712             return pick_nan(a, b, s);
 713         }
 714         if (a.cls == float_class_inf || b.cls == float_class_zero) {
 715             return a;
 716         }
 717         if (b.cls == float_class_inf || a.cls == float_class_zero) {
 718             b.sign = b_sign;
 719             return b;
 720         }
 721     }
 722     g_assert_not_reached();
 723 }
 724
 725 /*
 726  * Returns the result of adding or subtracting the floating-point
 727  * values `a' and `b'. The operation is performed according to the
 728  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 729  */
 730
 731 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
 732                                               float_status *status)
 733 {
 734     FloatParts pa = float16_unpack_canonical(a, status);
 735     FloatParts pb = float16_unpack_canonical(b, status);
 736     FloatParts pr = addsub_floats(pa, pb, false, status);
 737
 738     return float16_round_pack_canonical(pr, status);
 739 }
 740
 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
 742                                              float_status *status)
 743 {
 744     FloatParts pa = float32_unpack_canonical(a, status);
 745     FloatParts pb = float32_unpack_canonical(b, status);
 746     FloatParts pr = addsub_floats(pa, pb, false, status);
 747
 748     return float32_round_pack_canonical(pr, status);
 749 }
 750
 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
 752                                              float_status *status)
 753 {
 754     FloatParts pa = float64_unpack_canonical(a, status);
 755     FloatParts pb = float64_unpack_canonical(b, status);
 756     FloatParts pr = addsub_floats(pa, pb, false, status);
 757
 758     return float64_round_pack_canonical(pr, status);
 759 }
 760
 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
 762                                              float_status *status)
 763 {
 764     FloatParts pa = float16_unpack_canonical(a, status);
 765     FloatParts pb = float16_unpack_canonical(b, status);
 766     FloatParts pr = addsub_floats(pa, pb, true, status);
 767
 768     return float16_round_pack_canonical(pr, status);
 769 }
 770
 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
 772                                              float_status *status)
 773 {
 774     FloatParts pa = float32_unpack_canonical(a, status);
 775     FloatParts pb = float32_unpack_canonical(b, status);
 776     FloatParts pr = addsub_floats(pa, pb, true, status);
 777
 778     return float32_round_pack_canonical(pr, status);
 779 }
 780
 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
 782                                              float_status *status)
 783 {
 784     FloatParts pa = float64_unpack_canonical(a, status);
 785     FloatParts pb = float64_unpack_canonical(b, status);
 786     FloatParts pr = addsub_floats(pa, pb, true, status);
 787
 788     return float64_round_pack_canonical(pr, status);
 789 }
 790
 791 /*
 792  * Returns the result of multiplying the floating-point values `a' and
 793  * `b'. The operation is performed according to the IEC/IEEE Standard
 794  * for Binary Floating-Point Arithmetic.
 795  */
 796
 797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
 798 {
 799     bool sign = a.sign ^ b.sign;
 800
 801     if (a.cls == float_class_normal && b.cls == float_class_normal) {
 802         uint64_t hi, lo;
 803         int exp = a.exp + b.exp;
 804
 805         mul64To128(a.frac, b.frac, &hi, &lo);
 806         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 807         if (lo & DECOMPOSED_OVERFLOW_BIT) {
 808             shift64RightJamming(lo, 1, &lo);
 809             exp += 1;
 810         }
 811
 812         /* Re-use a */
 813         a.exp = exp;
 814         a.sign = sign;
 815         a.frac = lo;
 816         return a;
 817     }
 818     /* handle all the NaN cases */
 819     if (is_nan(a.cls) || is_nan(b.cls)) {
 820         return pick_nan(a, b, s);
 821     }
 822     /* Inf * Zero == NaN */
 823     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
 824         (a.cls == float_class_zero && b.cls == float_class_inf)) {
 825         s->float_exception_flags |= float_flag_invalid;
 826         a.cls = float_class_dnan;
 827         a.sign = sign;
 828         return a;
 829     }
 830     /* Multiply by 0 or Inf */
 831     if (a.cls == float_class_inf || a.cls == float_class_zero) {
 832         a.sign = sign;
 833         return a;
 834     }
 835     if (b.cls == float_class_inf || b.cls == float_class_zero) {
 836         b.sign = sign;
 837         return b;
 838     }
 839     g_assert_not_reached();
 840 }
 841
 842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
 843                                              float_status *status)
 844 {
 845     FloatParts pa = float16_unpack_canonical(a, status);
 846     FloatParts pb = float16_unpack_canonical(b, status);
 847     FloatParts pr = mul_floats(pa, pb, status);
 848
 849     return float16_round_pack_canonical(pr, status);
 850 }
 851
 852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
 853                                              float_status *status)
 854 {
 855     FloatParts pa = float32_unpack_canonical(a, status);
 856     FloatParts pb = float32_unpack_canonical(b, status);
 857     FloatParts pr = mul_floats(pa, pb, status);
 858
 859     return float32_round_pack_canonical(pr, status);
 860 }
 861
 862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
 863                                              float_status *status)
 864 {
 865     FloatParts pa = float64_unpack_canonical(a, status);
 866     FloatParts pb = float64_unpack_canonical(b, status);
 867     FloatParts pr = mul_floats(pa, pb, status);
 868
 869     return float64_round_pack_canonical(pr, status);
 870 }
 871
 872 /*
 873  * Returns the result of multiplying the floating-point values `a' and
 874  * `b' then adding 'c', with no intermediate rounding step after the
 875  * multiplication. The operation is performed according to the
 876  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 877  * The flags argument allows the caller to select negation of the
 878  * addend, the intermediate product, or the final result. (The
 879  * difference between this and having the caller do a separate
 880  * negation is that negating externally will flip the sign bit on
 881  * NaNs.)
 882  */
 883
 884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
 885                                 int flags, float_status *s)
 886 {
 887     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
 888                     ((1 << float_class_inf) | (1 << float_class_zero));
 889     bool p_sign;
 890     bool sign_flip = flags & float_muladd_negate_result;
 891     FloatClass p_class;
 892     uint64_t hi, lo;
 893     int p_exp;
 894
 895     /* It is implementation-defined whether the cases of (0,inf,qnan)
 896      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 897      * they return if they do), so we have to hand this information
 898      * off to the target-specific pick-a-NaN routine.
 899      */
 900     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
 901         return pick_nan_muladd(a, b, c, inf_zero, s);
 902     }
 903
 904     if (inf_zero) {
 905         s->float_exception_flags |= float_flag_invalid;
 906         a.cls = float_class_dnan;
 907         return a;
 908     }
 909
 910     if (flags & float_muladd_negate_c) {
 911         c.sign ^= 1;
 912     }
 913
 914     p_sign = a.sign ^ b.sign;
 915
 916     if (flags & float_muladd_negate_product) {
 917         p_sign ^= 1;
 918     }
 919
 920     if (a.cls == float_class_inf || b.cls == float_class_inf) {
 921         p_class = float_class_inf;
 922     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
 923         p_class = float_class_zero;
 924     } else {
 925         p_class = float_class_normal;
 926     }
 927
 928     if (c.cls == float_class_inf) {
 929         if (p_class == float_class_inf && p_sign != c.sign) {
 930             s->float_exception_flags |= float_flag_invalid;
 931             a.cls = float_class_dnan;
 932         } else {
 933             a.cls = float_class_inf;
 934             a.sign = c.sign ^ sign_flip;
 935         }
 936         return a;
 937     }
 938
 939     if (p_class == float_class_inf) {
 940         a.cls = float_class_inf;
 941         a.sign = p_sign ^ sign_flip;
 942         return a;
 943     }
 944
 945     if (p_class == float_class_zero) {
 946         if (c.cls == float_class_zero) {
 947             if (p_sign != c.sign) {
 948                 p_sign = s->float_rounding_mode == float_round_down;
 949             }
 950             c.sign = p_sign;
 951         } else if (flags & float_muladd_halve_result) {
 952             c.exp -= 1;
 953         }
 954         c.sign ^= sign_flip;
 955         return c;
 956     }
 957
 958     /* a & b should be normals now... */
 959     assert(a.cls == float_class_normal &&
 960            b.cls == float_class_normal);
 961
 962     p_exp = a.exp + b.exp;
 963
 964     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
 965      * result.
 966      */
 967     mul64To128(a.frac, b.frac, &hi, &lo);
 968     /* binary point now at bit 124 */
 969
 970     /* check for overflow */
 971     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
 972         shift128RightJamming(hi, lo, 1, &hi, &lo);
 973         p_exp += 1;
 974     }
 975
 976     /* + add/sub */
 977     if (c.cls == float_class_zero) {
 978         /* move binary point back to 62 */
 979         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 980     } else {
 981         int exp_diff = p_exp - c.exp;
 982         if (p_sign == c.sign) {
 983             /* Addition */
 984             if (exp_diff <= 0) {
 985                 shift128RightJamming(hi, lo,
 986                                      DECOMPOSED_BINARY_POINT - exp_diff,
 987                                      &hi, &lo);
 988                 lo += c.frac;
 989                 p_exp = c.exp;
 990             } else {
 991                 uint64_t c_hi, c_lo;
 992                 /* shift c to the same binary point as the product (124) */
 993                 c_hi = c.frac >> 2;
 994                 c_lo = 0;
 995                 shift128RightJamming(c_hi, c_lo,
 996                                      exp_diff,
 997                                      &c_hi, &c_lo);
 998                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
 999                 /* move binary point back to 62 */
1000                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1001             }
1002
1003             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004                 shift64RightJamming(lo, 1, &lo);
1005                 p_exp += 1;
1006             }
1007
1008         } else {
1009             /* Subtraction */
1010             uint64_t c_hi, c_lo;
1011             /* make C binary point match product at bit 124 */
1012             c_hi = c.frac >> 2;
1013             c_lo = 0;
1014
1015             if (exp_diff <= 0) {
1016                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017                 if (exp_diff == 0
1018                     &&
1019                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021                 } else {
1022                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023                     p_sign ^= 1;
1024                     p_exp = c.exp;
1025                 }
1026             } else {
1027                 shift128RightJamming(c_hi, c_lo,
1028                                      exp_diff,
1029                                      &c_hi, &c_lo);
1030                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1031             }
1032
1033             if (hi == 0 && lo == 0) {
1034                 a.cls = float_class_zero;
1035                 a.sign = s->float_rounding_mode == float_round_down;
1036                 a.sign ^= sign_flip;
1037                 return a;
1038             } else {
1039                 int shift;
1040                 if (hi != 0) {
1041                     shift = clz64(hi);
1042                 } else {
1043                     shift = clz64(lo) + 64;
1044                 }
1045                 /* Normalizing to a binary point of 124 is the
1046                    correct adjust for the exponent.  However since we're
1047                    shifting, we might as well put the binary point back
1048                    at 62 where we really want it.  Therefore shift as
1049                    if we're leaving 1 bit at the top of the word, but
1050                    adjust the exponent as if we're leaving 3 bits.  */
1051                 shift -= 1;
1052                 if (shift >= 64) {
1053                     lo = lo << (shift - 64);
1054                 } else {
1055                     hi = (hi << shift) | (lo >> (64 - shift));
1056                     lo = hi | ((lo << shift) != 0);
1057                 }
1058                 p_exp -= shift - 2;
1059             }
1060         }
1061     }
1062
1063     if (flags & float_muladd_halve_result) {
1064         p_exp -= 1;
1065     }
1066
1067     /* finally prepare our result */
1068     a.cls = float_class_normal;
1069     a.sign = p_sign ^ sign_flip;
1070     a.exp = p_exp;
1071     a.frac = lo;
1072
1073     return a;
1074 }
1075
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077                                                 int flags, float_status *status)
1078 {
1079     FloatParts pa = float16_unpack_canonical(a, status);
1080     FloatParts pb = float16_unpack_canonical(b, status);
1081     FloatParts pc = float16_unpack_canonical(c, status);
1082     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1083
1084     return float16_round_pack_canonical(pr, status);
1085 }
1086
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088                                                 int flags, float_status *status)
1089 {
1090     FloatParts pa = float32_unpack_canonical(a, status);
1091     FloatParts pb = float32_unpack_canonical(b, status);
1092     FloatParts pc = float32_unpack_canonical(c, status);
1093     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1094
1095     return float32_round_pack_canonical(pr, status);
1096 }
1097
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099                                                 int flags, float_status *status)
1100 {
1101     FloatParts pa = float64_unpack_canonical(a, status);
1102     FloatParts pb = float64_unpack_canonical(b, status);
1103     FloatParts pc = float64_unpack_canonical(c, status);
1104     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1105
1106     return float64_round_pack_canonical(pr, status);
1107 }
1108
1109 /*
1110  * Returns the result of dividing the floating-point value `a' by the
1111  * corresponding value `b'. The operation is performed according to
1112  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113  */
1114
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1116 {
1117     bool sign = a.sign ^ b.sign;
1118
1119     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120         uint64_t temp_lo, temp_hi;
1121         int exp = a.exp - b.exp;
1122         if (a.frac < b.frac) {
1123             exp -= 1;
1124             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125                               &temp_hi, &temp_lo);
1126         } else {
1127             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128                               &temp_hi, &temp_lo);
1129         }
1130         /* LSB of quot is set if inexact which roundandpack will use
1131          * to set flags. Yet again we re-use a for the result */
1132         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133         a.sign = sign;
1134         a.exp = exp;
1135         return a;
1136     }
1137     /* handle all the NaN cases */
1138     if (is_nan(a.cls) || is_nan(b.cls)) {
1139         return pick_nan(a, b, s);
1140     }
1141     /* 0/0 or Inf/Inf */
1142     if (a.cls == b.cls
1143         &&
1144         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145         s->float_exception_flags |= float_flag_invalid;
1146         a.cls = float_class_dnan;
1147         return a;
1148     }
1149     /* Div 0 => Inf */
1150     if (b.cls == float_class_zero) {
1151         s->float_exception_flags |= float_flag_divbyzero;
1152         a.cls = float_class_inf;
1153         a.sign = sign;
1154         return a;
1155     }
1156     /* Inf / x or 0 / x */
1157     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158         a.sign = sign;
1159         return a;
1160     }
1161     /* Div by Inf */
1162     if (b.cls == float_class_inf) {
1163         a.cls = float_class_zero;
1164         a.sign = sign;
1165         return a;
1166     }
1167     g_assert_not_reached();
1168 }
1169
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1171 {
1172     FloatParts pa = float16_unpack_canonical(a, status);
1173     FloatParts pb = float16_unpack_canonical(b, status);
1174     FloatParts pr = div_floats(pa, pb, status);
1175
1176     return float16_round_pack_canonical(pr, status);
1177 }
1178
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1180 {
1181     FloatParts pa = float32_unpack_canonical(a, status);
1182     FloatParts pb = float32_unpack_canonical(b, status);
1183     FloatParts pr = div_floats(pa, pb, status);
1184
1185     return float32_round_pack_canonical(pr, status);
1186 }
1187
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1189 {
1190     FloatParts pa = float64_unpack_canonical(a, status);
1191     FloatParts pb = float64_unpack_canonical(b, status);
1192     FloatParts pr = div_floats(pa, pb, status);
1193
1194     return float64_round_pack_canonical(pr, status);
1195 }
1196
1197 /*
1198  * Rounds the floating-point value `a' to an integer, and returns the
1199  * result as a floating-point value. The operation is performed
1200  * according to the IEC/IEEE Standard for Binary Floating-Point
1201  * Arithmetic.
1202  */
1203
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1205 {
1206     if (is_nan(a.cls)) {
1207         return return_nan(a, s);
1208     }
1209
1210     switch (a.cls) {
1211     case float_class_zero:
1212     case float_class_inf:
1213     case float_class_qnan:
1214         /* already "integral" */
1215         break;
1216     case float_class_normal:
1217         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218             /* already integral */
1219             break;
1220         }
1221         if (a.exp < 0) {
1222             bool one;
1223             /* all fractional */
1224             s->float_exception_flags |= float_flag_inexact;
1225             switch (rounding_mode) {
1226             case float_round_nearest_even:
1227                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228                 break;
1229             case float_round_ties_away:
1230                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231                 break;
1232             case float_round_to_zero:
1233                 one = false;
1234                 break;
1235             case float_round_up:
1236                 one = !a.sign;
1237                 break;
1238             case float_round_down:
1239                 one = a.sign;
1240                 break;
1241             default:
1242                 g_assert_not_reached();
1243             }
1244
1245             if (one) {
1246                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247                 a.exp = 0;
1248             } else {
1249                 a.cls = float_class_zero;
1250             }
1251         } else {
1252             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253             uint64_t frac_lsbm1 = frac_lsb >> 1;
1254             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255             uint64_t rnd_mask = rnd_even_mask >> 1;
1256             uint64_t inc;
1257
1258             switch (rounding_mode) {
1259             case float_round_nearest_even:
1260                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261                 break;
1262             case float_round_ties_away:
1263                 inc = frac_lsbm1;
1264                 break;
1265             case float_round_to_zero:
1266                 inc = 0;
1267                 break;
1268             case float_round_up:
1269                 inc = a.sign ? 0 : rnd_mask;
1270                 break;
1271             case float_round_down:
1272                 inc = a.sign ? rnd_mask : 0;
1273                 break;
1274             default:
1275                 g_assert_not_reached();
1276             }
1277
1278             if (a.frac & rnd_mask) {
1279                 s->float_exception_flags |= float_flag_inexact;
1280                 a.frac += inc;
1281                 a.frac &= ~rnd_mask;
1282                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283                     a.frac >>= 1;
1284                     a.exp++;
1285                 }
1286             }
1287         }
1288         break;
1289     default:
1290         g_assert_not_reached();
1291     }
1292     return a;
1293 }
1294
1295 float16 float16_round_to_int(float16 a, float_status *s)
1296 {
1297     FloatParts pa = float16_unpack_canonical(a, s);
1298     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299     return float16_round_pack_canonical(pr, s);
1300 }
1301
1302 float32 float32_round_to_int(float32 a, float_status *s)
1303 {
1304     FloatParts pa = float32_unpack_canonical(a, s);
1305     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306     return float32_round_pack_canonical(pr, s);
1307 }
1308
1309 float64 float64_round_to_int(float64 a, float_status *s)
1310 {
1311     FloatParts pa = float64_unpack_canonical(a, s);
1312     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313     return float64_round_pack_canonical(pr, s);
1314 }
1315
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1317 {
1318     FloatParts pa = float64_unpack_canonical(a, s);
1319     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320     return float64_round_pack_canonical(pr, s);
1321 }
1322
1323 /*
1324  * Returns the result of converting the floating-point value `a' to
1325  * the two's complement integer format. The conversion is performed
1326  * according to the IEC/IEEE Standard for Binary Floating-Point
1327  * Arithmetic---which means in particular that the conversion is
1328  * rounded according to the current rounding mode. If `a' is a NaN,
1329  * the largest positive integer is returned. Otherwise, if the
1330  * conversion overflows, the largest integer with the same sign as `a'
1331  * is returned.
1332 */
1333
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335                                      int64_t min, int64_t max,
1336                                      float_status *s)
1337 {
1338     uint64_t r;
1339     int orig_flags = get_float_exception_flags(s);
1340     FloatParts p = round_to_int(in, rmode, s);
1341
1342     switch (p.cls) {
1343     case float_class_snan:
1344     case float_class_qnan:
1345     case float_class_dnan:
1346     case float_class_msnan:
1347         s->float_exception_flags = orig_flags | float_flag_invalid;
1348         return max;
1349     case float_class_inf:
1350         s->float_exception_flags = orig_flags | float_flag_invalid;
1351         return p.sign ? min : max;
1352     case float_class_zero:
1353         return 0;
1354     case float_class_normal:
1355         if (p.exp < DECOMPOSED_BINARY_POINT) {
1356             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1357         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1358             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1359         } else {
1360             r = UINT64_MAX;
1361         }
1362         if (p.sign) {
1363             if (r < -(uint64_t) min) {
1364                 return -r;
1365             } else {
1366                 s->float_exception_flags = orig_flags | float_flag_invalid;
1367                 return min;
1368             }
1369         } else {
1370             if (r < max) {
1371                 return r;
1372             } else {
1373                 s->float_exception_flags = orig_flags | float_flag_invalid;
1374                 return max;
1375             }
1376         }
1377     default:
1378         g_assert_not_reached();
1379     }
1380 }
1381
1382 #define FLOAT_TO_INT(fsz, isz)                                          \
1383 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1384                                                 float_status *s)        \
1385 {                                                                       \
1386     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1387     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1388                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1389                                  s);                                    \
1390 }                                                                       \
1391                                                                         \
1392 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1393  (float ## fsz a, float_status *s)                                      \
1394 {                                                                       \
1395     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1396     return round_to_int_and_pack(p, float_round_to_zero,                \
1397                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1398                                  s);                                    \
1399 }
1400
1401 FLOAT_TO_INT(16, 16)
1402 FLOAT_TO_INT(16, 32)
1403 FLOAT_TO_INT(16, 64)
1404
1405 FLOAT_TO_INT(32, 16)
1406 FLOAT_TO_INT(32, 32)
1407 FLOAT_TO_INT(32, 64)
1408
1409 FLOAT_TO_INT(64, 16)
1410 FLOAT_TO_INT(64, 32)
1411 FLOAT_TO_INT(64, 64)
1412
1413 #undef FLOAT_TO_INT
1414
1415 /*
1416  *  Returns the result of converting the floating-point value `a' to
1417  *  the unsigned integer format. The conversion is performed according
1418  *  to the IEC/IEEE Standard for Binary Floating-Point
1419  *  Arithmetic---which means in particular that the conversion is
1420  *  rounded according to the current rounding mode. If `a' is a NaN,
1421  *  the largest unsigned integer is returned. Otherwise, if the
1422  *  conversion overflows, the largest unsigned integer is returned. If
1423  *  the 'a' is negative, the result is rounded and zero is returned;
1424  *  values that do not round to zero will raise the inexact exception
1425  *  flag.
1426  */
1427
1428 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1429                                        float_status *s)
1430 {
1431     int orig_flags = get_float_exception_flags(s);
1432     FloatParts p = round_to_int(in, rmode, s);
1433
1434     switch (p.cls) {
1435     case float_class_snan:
1436     case float_class_qnan:
1437     case float_class_dnan:
1438     case float_class_msnan:
1439         s->float_exception_flags = orig_flags | float_flag_invalid;
1440         return max;
1441     case float_class_inf:
1442         s->float_exception_flags = orig_flags | float_flag_invalid;
1443         return p.sign ? 0 : max;
1444     case float_class_zero:
1445         return 0;
1446     case float_class_normal:
1447     {
1448         uint64_t r;
1449         if (p.sign) {
1450             s->float_exception_flags = orig_flags | float_flag_invalid;
1451             return 0;
1452         }
1453
1454         if (p.exp < DECOMPOSED_BINARY_POINT) {
1455             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1456         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1457             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1458         } else {
1459             s->float_exception_flags = orig_flags | float_flag_invalid;
1460             return max;
1461         }
1462
1463         /* For uint64 this will never trip, but if p.exp is too large
1464          * to shift a decomposed fraction we shall have exited via the
1465          * 3rd leg above.
1466          */
1467         if (r > max) {
1468             s->float_exception_flags = orig_flags | float_flag_invalid;
1469             return max;
1470         } else {
1471             return r;
1472         }
1473     }
1474     default:
1475         g_assert_not_reached();
1476     }
1477 }
1478
1479 #define FLOAT_TO_UINT(fsz, isz) \
1480 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1481                                                   float_status *s)      \
1482 {                                                                       \
1483     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1484     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1485                                  UINT ## isz ## _MAX, s);               \
1486 }                                                                       \
1487                                                                         \
1488 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1489  (float ## fsz a, float_status *s)                                      \
1490 {                                                                       \
1491     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1492     return round_to_uint_and_pack(p, float_round_to_zero,               \
1493                                   UINT ## isz ## _MAX, s);              \
1494 }
1495
1496 FLOAT_TO_UINT(16, 16)
1497 FLOAT_TO_UINT(16, 32)
1498 FLOAT_TO_UINT(16, 64)
1499
1500 FLOAT_TO_UINT(32, 16)
1501 FLOAT_TO_UINT(32, 32)
1502 FLOAT_TO_UINT(32, 64)
1503
1504 FLOAT_TO_UINT(64, 16)
1505 FLOAT_TO_UINT(64, 32)
1506 FLOAT_TO_UINT(64, 64)
1507
1508 #undef FLOAT_TO_UINT
1509
1510 /*
1511  * Integer to float conversions
1512  *
1513  * Returns the result of converting the two's complement integer `a'
1514  * to the floating-point format. The conversion is performed according
1515  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1516  */
1517
1518 static FloatParts int_to_float(int64_t a, float_status *status)
1519 {
1520     FloatParts r;
1521     if (a == 0) {
1522         r.cls = float_class_zero;
1523         r.sign = false;
1524     } else if (a == (1ULL << 63)) {
1525         r.cls = float_class_normal;
1526         r.sign = true;
1527         r.frac = DECOMPOSED_IMPLICIT_BIT;
1528         r.exp = 63;
1529     } else {
1530         uint64_t f;
1531         if (a < 0) {
1532             f = -a;
1533             r.sign = true;
1534         } else {
1535             f = a;
1536             r.sign = false;
1537         }
1538         int shift = clz64(f) - 1;
1539         r.cls = float_class_normal;
1540         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1541         r.frac = f << shift;
1542     }
1543
1544     return r;
1545 }
1546
1547 float16 int64_to_float16(int64_t a, float_status *status)
1548 {
1549     FloatParts pa = int_to_float(a, status);
1550     return float16_round_pack_canonical(pa, status);
1551 }
1552
1553 float16 int32_to_float16(int32_t a, float_status *status)
1554 {
1555     return int64_to_float16(a, status);
1556 }
1557
1558 float16 int16_to_float16(int16_t a, float_status *status)
1559 {
1560     return int64_to_float16(a, status);
1561 }
1562
1563 float32 int64_to_float32(int64_t a, float_status *status)
1564 {
1565     FloatParts pa = int_to_float(a, status);
1566     return float32_round_pack_canonical(pa, status);
1567 }
1568
1569 float32 int32_to_float32(int32_t a, float_status *status)
1570 {
1571     return int64_to_float32(a, status);
1572 }
1573
1574 float32 int16_to_float32(int16_t a, float_status *status)
1575 {
1576     return int64_to_float32(a, status);
1577 }
1578
1579 float64 int64_to_float64(int64_t a, float_status *status)
1580 {
1581     FloatParts pa = int_to_float(a, status);
1582     return float64_round_pack_canonical(pa, status);
1583 }
1584
1585 float64 int32_to_float64(int32_t a, float_status *status)
1586 {
1587     return int64_to_float64(a, status);
1588 }
1589
1590 float64 int16_to_float64(int16_t a, float_status *status)
1591 {
1592     return int64_to_float64(a, status);
1593 }
1594
1595
1596 /*
1597  * Unsigned Integer to float conversions
1598  *
1599  * Returns the result of converting the unsigned integer `a' to the
1600  * floating-point format. The conversion is performed according to the
1601  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1602  */
1603
1604 static FloatParts uint_to_float(uint64_t a, float_status *status)
1605 {
1606     FloatParts r = { .sign = false};
1607
1608     if (a == 0) {
1609         r.cls = float_class_zero;
1610     } else {
1611         int spare_bits = clz64(a) - 1;
1612         r.cls = float_class_normal;
1613         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1614         if (spare_bits < 0) {
1615             shift64RightJamming(a, -spare_bits, &a);
1616             r.frac = a;
1617         } else {
1618             r.frac = a << spare_bits;
1619         }
1620     }
1621
1622     return r;
1623 }
1624
1625 float16 uint64_to_float16(uint64_t a, float_status *status)
1626 {
1627     FloatParts pa = uint_to_float(a, status);
1628     return float16_round_pack_canonical(pa, status);
1629 }
1630
1631 float16 uint32_to_float16(uint32_t a, float_status *status)
1632 {
1633     return uint64_to_float16(a, status);
1634 }
1635
1636 float16 uint16_to_float16(uint16_t a, float_status *status)
1637 {
1638     return uint64_to_float16(a, status);
1639 }
1640
1641 float32 uint64_to_float32(uint64_t a, float_status *status)
1642 {
1643     FloatParts pa = uint_to_float(a, status);
1644     return float32_round_pack_canonical(pa, status);
1645 }
1646
1647 float32 uint32_to_float32(uint32_t a, float_status *status)
1648 {
1649     return uint64_to_float32(a, status);
1650 }
1651
1652 float32 uint16_to_float32(uint16_t a, float_status *status)
1653 {
1654     return uint64_to_float32(a, status);
1655 }
1656
1657 float64 uint64_to_float64(uint64_t a, float_status *status)
1658 {
1659     FloatParts pa = uint_to_float(a, status);
1660     return float64_round_pack_canonical(pa, status);
1661 }
1662
1663 float64 uint32_to_float64(uint32_t a, float_status *status)
1664 {
1665     return uint64_to_float64(a, status);
1666 }
1667
1668 float64 uint16_to_float64(uint16_t a, float_status *status)
1669 {
1670     return uint64_to_float64(a, status);
1671 }
1672
1673 /* Float Min/Max */
1674 /* min() and max() functions. These can't be implemented as
1675  * 'compare and pick one input' because that would mishandle
1676  * NaNs and +0 vs -0.
1677  *
1678  * minnum() and maxnum() functions. These are similar to the min()
1679  * and max() functions but if one of the arguments is a QNaN and
1680  * the other is numerical then the numerical argument is returned.
1681  * SNaNs will get quietened before being returned.
1682  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1683  * and maxNum() operations. min() and max() are the typical min/max
1684  * semantics provided by many CPUs which predate that specification.
1685  *
1686  * minnummag() and maxnummag() functions correspond to minNumMag()
1687  * and minNumMag() from the IEEE-754 2008.
1688  */
1689 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1690                                 bool ieee, bool ismag, float_status *s)
1691 {
1692     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1693         if (ieee) {
1694             /* Takes two floating-point values `a' and `b', one of
1695              * which is a NaN, and returns the appropriate NaN
1696              * result. If either `a' or `b' is a signaling NaN,
1697              * the invalid exception is raised.
1698              */
1699             if (is_snan(a.cls) || is_snan(b.cls)) {
1700                 return pick_nan(a, b, s);
1701             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1702                 return b;
1703             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1704                 return a;
1705             }
1706         }
1707         return pick_nan(a, b, s);
1708     } else {
1709         int a_exp, b_exp;
1710
1711         switch (a.cls) {
1712         case float_class_normal:
1713             a_exp = a.exp;
1714             break;
1715         case float_class_inf:
1716             a_exp = INT_MAX;
1717             break;
1718         case float_class_zero:
1719             a_exp = INT_MIN;
1720             break;
1721         default:
1722             g_assert_not_reached();
1723             break;
1724         }
1725         switch (b.cls) {
1726         case float_class_normal:
1727             b_exp = b.exp;
1728             break;
1729         case float_class_inf:
1730             b_exp = INT_MAX;
1731             break;
1732         case float_class_zero:
1733             b_exp = INT_MIN;
1734             break;
1735         default:
1736             g_assert_not_reached();
1737             break;
1738         }
1739
1740         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1741             bool a_less = a_exp < b_exp;
1742             if (a_exp == b_exp) {
1743                 a_less = a.frac < b.frac;
1744             }
1745             return a_less ^ ismin ? b : a;
1746         }
1747
1748         if (a.sign == b.sign) {
1749             bool a_less = a_exp < b_exp;
1750             if (a_exp == b_exp) {
1751                 a_less = a.frac < b.frac;
1752             }
1753             return a.sign ^ a_less ^ ismin ? b : a;
1754         } else {
1755             return a.sign ^ ismin ? b : a;
1756         }
1757     }
1758 }
1759
1760 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
1761 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1762                                      float_status *s)                   \
1763 {                                                                       \
1764     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1765     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1766     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1767                                                                         \
1768     return float ## sz ## _round_pack_canonical(pr, s);                 \
1769 }
1770
1771 MINMAX(16, min, true, false, false)
1772 MINMAX(16, minnum, true, true, false)
1773 MINMAX(16, minnummag, true, true, true)
1774 MINMAX(16, max, false, false, false)
1775 MINMAX(16, maxnum, false, true, false)
1776 MINMAX(16, maxnummag, false, true, true)
1777
1778 MINMAX(32, min, true, false, false)
1779 MINMAX(32, minnum, true, true, false)
1780 MINMAX(32, minnummag, true, true, true)
1781 MINMAX(32, max, false, false, false)
1782 MINMAX(32, maxnum, false, true, false)
1783 MINMAX(32, maxnummag, false, true, true)
1784
1785 MINMAX(64, min, true, false, false)
1786 MINMAX(64, minnum, true, true, false)
1787 MINMAX(64, minnummag, true, true, true)
1788 MINMAX(64, max, false, false, false)
1789 MINMAX(64, maxnum, false, true, false)
1790 MINMAX(64, maxnummag, false, true, true)
1791
1792 #undef MINMAX
1793
1794 /* Floating point compare */
1795 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1796                           float_status *s)
1797 {
1798     if (is_nan(a.cls) || is_nan(b.cls)) {
1799         if (!is_quiet ||
1800             a.cls == float_class_snan ||
1801             b.cls == float_class_snan) {
1802             s->float_exception_flags |= float_flag_invalid;
1803         }
1804         return float_relation_unordered;
1805     }
1806
1807     if (a.cls == float_class_zero) {
1808         if (b.cls == float_class_zero) {
1809             return float_relation_equal;
1810         }
1811         return b.sign ? float_relation_greater : float_relation_less;
1812     } else if (b.cls == float_class_zero) {
1813         return a.sign ? float_relation_less : float_relation_greater;
1814     }
1815
1816     /* The only really important thing about infinity is its sign. If
1817      * both are infinities the sign marks the smallest of the two.
1818      */
1819     if (a.cls == float_class_inf) {
1820         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1821             return float_relation_equal;
1822         }
1823         return a.sign ? float_relation_less : float_relation_greater;
1824     } else if (b.cls == float_class_inf) {
1825         return b.sign ? float_relation_greater : float_relation_less;
1826     }
1827
1828     if (a.sign != b.sign) {
1829         return a.sign ? float_relation_less : float_relation_greater;
1830     }
1831
1832     if (a.exp == b.exp) {
1833         if (a.frac == b.frac) {
1834             return float_relation_equal;
1835         }
1836         if (a.sign) {
1837             return a.frac > b.frac ?
1838                 float_relation_less : float_relation_greater;
1839         } else {
1840             return a.frac > b.frac ?
1841                 float_relation_greater : float_relation_less;
1842         }
1843     } else {
1844         if (a.sign) {
1845             return a.exp > b.exp ? float_relation_less : float_relation_greater;
1846         } else {
1847             return a.exp > b.exp ? float_relation_greater : float_relation_less;
1848         }
1849     }
1850 }
1851
1852 #define COMPARE(sz)                                                     \
1853 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
1854                             float_status *s)                            \
1855 {                                                                       \
1856     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1857     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1858     return compare_floats(pa, pb, false, s);                            \
1859 }                                                                       \
1860 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
1861                                   float_status *s)                      \
1862 {                                                                       \
1863     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1864     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1865     return compare_floats(pa, pb, true, s);                             \
1866 }
1867
1868 COMPARE(16)
1869 COMPARE(32)
1870 COMPARE(64)
1871
1872 #undef COMPARE
1873
1874 /* Multiply A by 2 raised to the power N.  */
1875 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1876 {
1877     if (unlikely(is_nan(a.cls))) {
1878         return return_nan(a, s);
1879     }
1880     if (a.cls == float_class_normal) {
1881         a.exp += n;
1882     }
1883     return a;
1884 }
1885
1886 float16 float16_scalbn(float16 a, int n, float_status *status)
1887 {
1888     FloatParts pa = float16_unpack_canonical(a, status);
1889     FloatParts pr = scalbn_decomposed(pa, n, status);
1890     return float16_round_pack_canonical(pr, status);
1891 }
1892
1893 float32 float32_scalbn(float32 a, int n, float_status *status)
1894 {
1895     FloatParts pa = float32_unpack_canonical(a, status);
1896     FloatParts pr = scalbn_decomposed(pa, n, status);
1897     return float32_round_pack_canonical(pr, status);
1898 }
1899
1900 float64 float64_scalbn(float64 a, int n, float_status *status)
1901 {
1902     FloatParts pa = float64_unpack_canonical(a, status);
1903     FloatParts pr = scalbn_decomposed(pa, n, status);
1904     return float64_round_pack_canonical(pr, status);
1905 }
1906
1907 /*
1908  * Square Root
1909  *
1910  * The old softfloat code did an approximation step before zeroing in
1911  * on the final result. However for simpleness we just compute the
1912  * square root by iterating down from the implicit bit to enough extra
1913  * bits to ensure we get a correctly rounded result.
1914  *
1915  * This does mean however the calculation is slower than before,
1916  * especially for 64 bit floats.
1917  */
1918
1919 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1920 {
1921     uint64_t a_frac, r_frac, s_frac;
1922     int bit, last_bit;
1923
1924     if (is_nan(a.cls)) {
1925         return return_nan(a, s);
1926     }
1927     if (a.cls == float_class_zero) {
1928         return a;  /* sqrt(+-0) = +-0 */
1929     }
1930     if (a.sign) {
1931         s->float_exception_flags |= float_flag_invalid;
1932         a.cls = float_class_dnan;
1933         return a;
1934     }
1935     if (a.cls == float_class_inf) {
1936         return a;  /* sqrt(+inf) = +inf */
1937     }
1938
1939     assert(a.cls == float_class_normal);
1940
1941     /* We need two overflow bits at the top. Adding room for that is a
1942      * right shift. If the exponent is odd, we can discard the low bit
1943      * by multiplying the fraction by 2; that's a left shift. Combine
1944      * those and we shift right if the exponent is even.
1945      */
1946     a_frac = a.frac;
1947     if (!(a.exp & 1)) {
1948         a_frac >>= 1;
1949     }
1950     a.exp >>= 1;
1951
1952     /* Bit-by-bit computation of sqrt.  */
1953     r_frac = 0;
1954     s_frac = 0;
1955
1956     /* Iterate from implicit bit down to the 3 extra bits to compute a
1957      * properly rounded result. Remember we've inserted one more bit
1958      * at the top, so these positions are one less.
1959      */
1960     bit = DECOMPOSED_BINARY_POINT - 1;
1961     last_bit = MAX(p->frac_shift - 4, 0);
1962     do {
1963         uint64_t q = 1ULL << bit;
1964         uint64_t t_frac = s_frac + q;
1965         if (t_frac <= a_frac) {
1966             s_frac = t_frac + q;
1967             a_frac -= t_frac;
1968             r_frac += q;
1969         }
1970         a_frac <<= 1;
1971     } while (--bit >= last_bit);
1972
1973     /* Undo the right shift done above. If there is any remaining
1974      * fraction, the result is inexact. Set the sticky bit.
1975      */
1976     a.frac = (r_frac << 1) + (a_frac != 0);
1977
1978     return a;
1979 }
1980
1981 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1982 {
1983     FloatParts pa = float16_unpack_canonical(a, status);
1984     FloatParts pr = sqrt_float(pa, status, &float16_params);
1985     return float16_round_pack_canonical(pr, status);
1986 }
1987
1988 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1989 {
1990     FloatParts pa = float32_unpack_canonical(a, status);
1991     FloatParts pr = sqrt_float(pa, status, &float32_params);
1992     return float32_round_pack_canonical(pr, status);
1993 }
1994
1995 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
1996 {
1997     FloatParts pa = float64_unpack_canonical(a, status);
1998     FloatParts pr = sqrt_float(pa, status, &float64_params);
1999     return float64_round_pack_canonical(pr, status);
2000 }
2001
2002
2003 /*----------------------------------------------------------------------------
2004 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2005 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2006 | input.  If `zSign' is 1, the input is negated before being converted to an
2007 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2008 | is simply rounded to an integer, with the inexact exception raised if the
2009 | input cannot be represented exactly as an integer.  However, if the fixed-
2010 | point input is too large, the invalid exception is raised and the largest
2011 | positive or negative integer is returned.
2012 *----------------------------------------------------------------------------*/
2013
2014 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2015 {
2016     int8_t roundingMode;
2017     flag roundNearestEven;
2018     int8_t roundIncrement, roundBits;
2019     int32_t z;
2020
2021     roundingMode = status->float_rounding_mode;
2022     roundNearestEven = ( roundingMode == float_round_nearest_even );
2023     switch (roundingMode) {
2024     case float_round_nearest_even:
2025     case float_round_ties_away:
2026         roundIncrement = 0x40;
2027         break;
2028     case float_round_to_zero:
2029         roundIncrement = 0;
2030         break;
2031     case float_round_up:
2032         roundIncrement = zSign ? 0 : 0x7f;
2033         break;
2034     case float_round_down:
2035         roundIncrement = zSign ? 0x7f : 0;
2036         break;
2037     default:
2038         abort();
2039     }
2040     roundBits = absZ & 0x7F;
2041     absZ = ( absZ + roundIncrement )>>7;
2042     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2043     z = absZ;
2044     if ( zSign ) z = - z;
2045     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2046         float_raise(float_flag_invalid, status);
2047         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2048     }
2049     if (roundBits) {
2050         status->float_exception_flags |= float_flag_inexact;
2051     }
2052     return z;
2053
2054 }
2055
2056 /*----------------------------------------------------------------------------
2057 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2058 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2059 | and returns the properly rounded 64-bit integer corresponding to the input.
2060 | If `zSign' is 1, the input is negated before being converted to an integer.
2061 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2062 | the inexact exception raised if the input cannot be represented exactly as
2063 | an integer.  However, if the fixed-point input is too large, the invalid
2064 | exception is raised and the largest positive or negative integer is
2065 | returned.
2066 *----------------------------------------------------------------------------*/
2067
2068 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2069                                float_status *status)
2070 {
2071     int8_t roundingMode;
2072     flag roundNearestEven, increment;
2073     int64_t z;
2074
2075     roundingMode = status->float_rounding_mode;
2076     roundNearestEven = ( roundingMode == float_round_nearest_even );
2077     switch (roundingMode) {
2078     case float_round_nearest_even:
2079     case float_round_ties_away:
2080         increment = ((int64_t) absZ1 < 0);
2081         break;
2082     case float_round_to_zero:
2083         increment = 0;
2084         break;
2085     case float_round_up:
2086         increment = !zSign && absZ1;
2087         break;
2088     case float_round_down:
2089         increment = zSign && absZ1;
2090         break;
2091     default:
2092         abort();
2093     }
2094     if ( increment ) {
2095         ++absZ0;
2096         if ( absZ0 == 0 ) goto overflow;
2097         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2098     }
2099     z = absZ0;
2100     if ( zSign ) z = - z;
2101     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2102  overflow:
2103         float_raise(float_flag_invalid, status);
2104         return
2105               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2106             : LIT64( 0x7FFFFFFFFFFFFFFF );
2107     }
2108     if (absZ1) {
2109         status->float_exception_flags |= float_flag_inexact;
2110     }
2111     return z;
2112
2113 }
2114
2115 /*----------------------------------------------------------------------------
2116 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2117 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2118 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2119 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2120 | with the inexact exception raised if the input cannot be represented exactly
2121 | as an integer.  However, if the fixed-point input is too large, the invalid
2122 | exception is raised and the largest unsigned integer is returned.
2123 *----------------------------------------------------------------------------*/
2124
2125 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2126                                 uint64_t absZ1, float_status *status)
2127 {
2128     int8_t roundingMode;
2129     flag roundNearestEven, increment;
2130
2131     roundingMode = status->float_rounding_mode;
2132     roundNearestEven = (roundingMode == float_round_nearest_even);
2133     switch (roundingMode) {
2134     case float_round_nearest_even:
2135     case float_round_ties_away:
2136         increment = ((int64_t)absZ1 < 0);
2137         break;
2138     case float_round_to_zero:
2139         increment = 0;
2140         break;
2141     case float_round_up:
2142         increment = !zSign && absZ1;
2143         break;
2144     case float_round_down:
2145         increment = zSign && absZ1;
2146         break;
2147     default:
2148         abort();
2149     }
2150     if (increment) {
2151         ++absZ0;
2152         if (absZ0 == 0) {
2153             float_raise(float_flag_invalid, status);
2154             return LIT64(0xFFFFFFFFFFFFFFFF);
2155         }
2156         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2157     }
2158
2159     if (zSign && absZ0) {
2160         float_raise(float_flag_invalid, status);
2161         return 0;
2162     }
2163
2164     if (absZ1) {
2165         status->float_exception_flags |= float_flag_inexact;
2166     }
2167     return absZ0;
2168 }
2169
2170 /*----------------------------------------------------------------------------
2171 | If `a' is denormal and we are in flush-to-zero mode then set the
2172 | input-denormal exception and return zero. Otherwise just return the value.
2173 *----------------------------------------------------------------------------*/
2174 float32 float32_squash_input_denormal(float32 a, float_status *status)
2175 {
2176     if (status->flush_inputs_to_zero) {
2177         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2178             float_raise(float_flag_input_denormal, status);
2179             return make_float32(float32_val(a) & 0x80000000);
2180         }
2181     }
2182     return a;
2183 }
2184
2185 /*----------------------------------------------------------------------------
2186 | Normalizes the subnormal single-precision floating-point value represented
2187 | by the denormalized significand `aSig'.  The normalized exponent and
2188 | significand are stored at the locations pointed to by `zExpPtr' and
2189 | `zSigPtr', respectively.
2190 *----------------------------------------------------------------------------*/
2191
2192 static void
2193  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2194 {
2195     int8_t shiftCount;
2196
2197     shiftCount = countLeadingZeros32( aSig ) - 8;
2198     *zSigPtr = aSig<<shiftCount;
2199     *zExpPtr = 1 - shiftCount;
2200
2201 }
2202
2203 /*----------------------------------------------------------------------------
2204 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2205 | and significand `zSig', and returns the proper single-precision floating-
2206 | point value corresponding to the abstract input.  Ordinarily, the abstract
2207 | value is simply rounded and packed into the single-precision format, with
2208 | the inexact exception raised if the abstract input cannot be represented
2209 | exactly.  However, if the abstract value is too large, the overflow and
2210 | inexact exceptions are raised and an infinity or maximal finite value is
2211 | returned.  If the abstract value is too small, the input value is rounded to
2212 | a subnormal number, and the underflow and inexact exceptions are raised if
2213 | the abstract input cannot be represented exactly as a subnormal single-
2214 | precision floating-point number.
2215 |     The input significand `zSig' has its binary point between bits 30
2216 | and 29, which is 7 bits to the left of the usual location.  This shifted
2217 | significand must be normalized or smaller.  If `zSig' is not normalized,
2218 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2219 | and it must not require rounding.  In the usual case that `zSig' is
2220 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2221 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2222 | Binary Floating-Point Arithmetic.
2223 *----------------------------------------------------------------------------*/
2224
2225 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2226                                    float_status *status)
2227 {
2228     int8_t roundingMode;
2229     flag roundNearestEven;
2230     int8_t roundIncrement, roundBits;
2231     flag isTiny;
2232
2233     roundingMode = status->float_rounding_mode;
2234     roundNearestEven = ( roundingMode == float_round_nearest_even );
2235     switch (roundingMode) {
2236     case float_round_nearest_even:
2237     case float_round_ties_away:
2238         roundIncrement = 0x40;
2239         break;
2240     case float_round_to_zero:
2241         roundIncrement = 0;
2242         break;
2243     case float_round_up:
2244         roundIncrement = zSign ? 0 : 0x7f;
2245         break;
2246     case float_round_down:
2247         roundIncrement = zSign ? 0x7f : 0;
2248         break;
2249     default:
2250         abort();
2251         break;
2252     }
2253     roundBits = zSig & 0x7F;
2254     if ( 0xFD <= (uint16_t) zExp ) {
2255         if (    ( 0xFD < zExp )
2256              || (    ( zExp == 0xFD )
2257                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2258            ) {
2259             float_raise(float_flag_overflow | float_flag_inexact, status);
2260             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2261         }
2262         if ( zExp < 0 ) {
2263             if (status->flush_to_zero) {
2264                 float_raise(float_flag_output_denormal, status);
2265                 return packFloat32(zSign, 0, 0);
2266             }
2267             isTiny =
2268                 (status->float_detect_tininess
2269                  == float_tininess_before_rounding)
2270                 || ( zExp < -1 )
2271                 || ( zSig + roundIncrement < 0x80000000 );
2272             shift32RightJamming( zSig, - zExp, &zSig );
2273             zExp = 0;
2274             roundBits = zSig & 0x7F;
2275             if (isTiny && roundBits) {
2276                 float_raise(float_flag_underflow, status);
2277             }
2278         }
2279     }
2280     if (roundBits) {
2281         status->float_exception_flags |= float_flag_inexact;
2282     }
2283     zSig = ( zSig + roundIncrement )>>7;
2284     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2285     if ( zSig == 0 ) zExp = 0;
2286     return packFloat32( zSign, zExp, zSig );
2287
2288 }
2289
2290 /*----------------------------------------------------------------------------
2291 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2292 | and significand `zSig', and returns the proper single-precision floating-
2293 | point value corresponding to the abstract input.  This routine is just like
2294 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2295 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2296 | floating-point exponent.
2297 *----------------------------------------------------------------------------*/
2298
2299 static float32
2300  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2301                               float_status *status)
2302 {
2303     int8_t shiftCount;
2304
2305     shiftCount = countLeadingZeros32( zSig ) - 1;
2306     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2307                                status);
2308
2309 }
2310
2311 /*----------------------------------------------------------------------------
2312 | If `a' is denormal and we are in flush-to-zero mode then set the
2313 | input-denormal exception and return zero. Otherwise just return the value.
2314 *----------------------------------------------------------------------------*/
2315 float64 float64_squash_input_denormal(float64 a, float_status *status)
2316 {
2317     if (status->flush_inputs_to_zero) {
2318         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2319             float_raise(float_flag_input_denormal, status);
2320             return make_float64(float64_val(a) & (1ULL << 63));
2321         }
2322     }
2323     return a;
2324 }
2325
2326 /*----------------------------------------------------------------------------
2327 | Normalizes the subnormal double-precision floating-point value represented
2328 | by the denormalized significand `aSig'.  The normalized exponent and
2329 | significand are stored at the locations pointed to by `zExpPtr' and
2330 | `zSigPtr', respectively.
2331 *----------------------------------------------------------------------------*/
2332
2333 static void
2334  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2335 {
2336     int8_t shiftCount;
2337
2338     shiftCount = countLeadingZeros64( aSig ) - 11;
2339     *zSigPtr = aSig<<shiftCount;
2340     *zExpPtr = 1 - shiftCount;
2341
2342 }
2343
2344 /*----------------------------------------------------------------------------
2345 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2346 | double-precision floating-point value, returning the result.  After being
2347 | shifted into the proper positions, the three fields are simply added
2348 | together to form the result.  This means that any integer portion of `zSig'
2349 | will be added into the exponent.  Since a properly normalized significand
2350 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2351 | than the desired result exponent whenever `zSig' is a complete, normalized
2352 | significand.
2353 *----------------------------------------------------------------------------*/
2354
2355 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2356 {
2357
2358     return make_float64(
2359         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2360
2361 }
2362
2363 /*----------------------------------------------------------------------------
2364 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2365 | and significand `zSig', and returns the proper double-precision floating-
2366 | point value corresponding to the abstract input.  Ordinarily, the abstract
2367 | value is simply rounded and packed into the double-precision format, with
2368 | the inexact exception raised if the abstract input cannot be represented
2369 | exactly.  However, if the abstract value is too large, the overflow and
2370 | inexact exceptions are raised and an infinity or maximal finite value is
2371 | returned.  If the abstract value is too small, the input value is rounded to
2372 | a subnormal number, and the underflow and inexact exceptions are raised if
2373 | the abstract input cannot be represented exactly as a subnormal double-
2374 | precision floating-point number.
2375 |     The input significand `zSig' has its binary point between bits 62
2376 | and 61, which is 10 bits to the left of the usual location.  This shifted
2377 | significand must be normalized or smaller.  If `zSig' is not normalized,
2378 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2379 | and it must not require rounding.  In the usual case that `zSig' is
2380 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2381 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2382 | Binary Floating-Point Arithmetic.
2383 *----------------------------------------------------------------------------*/
2384
2385 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2386                                    float_status *status)
2387 {
2388     int8_t roundingMode;
2389     flag roundNearestEven;
2390     int roundIncrement, roundBits;
2391     flag isTiny;
2392
2393     roundingMode = status->float_rounding_mode;
2394     roundNearestEven = ( roundingMode == float_round_nearest_even );
2395     switch (roundingMode) {
2396     case float_round_nearest_even:
2397     case float_round_ties_away:
2398         roundIncrement = 0x200;
2399         break;
2400     case float_round_to_zero:
2401         roundIncrement = 0;
2402         break;
2403     case float_round_up:
2404         roundIncrement = zSign ? 0 : 0x3ff;
2405         break;
2406     case float_round_down:
2407         roundIncrement = zSign ? 0x3ff : 0;
2408         break;
2409     case float_round_to_odd:
2410         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2411         break;
2412     default:
2413         abort();
2414     }
2415     roundBits = zSig & 0x3FF;
2416     if ( 0x7FD <= (uint16_t) zExp ) {
2417         if (    ( 0x7FD < zExp )
2418              || (    ( zExp == 0x7FD )
2419                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2420            ) {
2421             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2422                                    roundIncrement != 0;
2423             float_raise(float_flag_overflow | float_flag_inexact, status);
2424             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2425         }
2426         if ( zExp < 0 ) {
2427             if (status->flush_to_zero) {
2428                 float_raise(float_flag_output_denormal, status);
2429                 return packFloat64(zSign, 0, 0);
2430             }
2431             isTiny =
2432                    (status->float_detect_tininess
2433                     == float_tininess_before_rounding)
2434                 || ( zExp < -1 )
2435                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2436             shift64RightJamming( zSig, - zExp, &zSig );
2437             zExp = 0;
2438             roundBits = zSig & 0x3FF;
2439             if (isTiny && roundBits) {
2440                 float_raise(float_flag_underflow, status);
2441             }
2442             if (roundingMode == float_round_to_odd) {
2443                 /*
2444                  * For round-to-odd case, the roundIncrement depends on
2445                  * zSig which just changed.
2446                  */
2447                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2448             }
2449         }
2450     }
2451     if (roundBits) {
2452         status->float_exception_flags |= float_flag_inexact;
2453     }
2454     zSig = ( zSig + roundIncrement )>>10;
2455     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2456     if ( zSig == 0 ) zExp = 0;
2457     return packFloat64( zSign, zExp, zSig );
2458
2459 }
2460
2461 /*----------------------------------------------------------------------------
2462 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2463 | and significand `zSig', and returns the proper double-precision floating-
2464 | point value corresponding to the abstract input.  This routine is just like
2465 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2466 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2467 | floating-point exponent.
2468 *----------------------------------------------------------------------------*/
2469
2470 static float64
2471  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2472                               float_status *status)
2473 {
2474     int8_t shiftCount;
2475
2476     shiftCount = countLeadingZeros64( zSig ) - 1;
2477     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2478                                status);
2479
2480 }
2481
2482 /*----------------------------------------------------------------------------
2483 | Normalizes the subnormal extended double-precision floating-point value
2484 | represented by the denormalized significand `aSig'.  The normalized exponent
2485 | and significand are stored at the locations pointed to by `zExpPtr' and
2486 | `zSigPtr', respectively.
2487 *----------------------------------------------------------------------------*/
2488
2489 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2490                                 uint64_t *zSigPtr)
2491 {
2492     int8_t shiftCount;
2493
2494     shiftCount = countLeadingZeros64( aSig );
2495     *zSigPtr = aSig<<shiftCount;
2496     *zExpPtr = 1 - shiftCount;
2497 }
2498
2499 /*----------------------------------------------------------------------------
2500 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2501 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2502 | and returns the proper extended double-precision floating-point value
2503 | corresponding to the abstract input.  Ordinarily, the abstract value is
2504 | rounded and packed into the extended double-precision format, with the
2505 | inexact exception raised if the abstract input cannot be represented
2506 | exactly.  However, if the abstract value is too large, the overflow and
2507 | inexact exceptions are raised and an infinity or maximal finite value is
2508 | returned.  If the abstract value is too small, the input value is rounded to
2509 | a subnormal number, and the underflow and inexact exceptions are raised if
2510 | the abstract input cannot be represented exactly as a subnormal extended
2511 | double-precision floating-point number.
2512 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2513 | number of bits as single or double precision, respectively.  Otherwise, the
2514 | result is rounded to the full precision of the extended double-precision
2515 | format.
2516 |     The input significand must be normalized or smaller.  If the input
2517 | significand is not normalized, `zExp' must be 0; in that case, the result
2518 | returned is a subnormal number, and it must not require rounding.  The
2519 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2520 | Floating-Point Arithmetic.
2521 *----------------------------------------------------------------------------*/
2522
2523 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2524                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2525                               float_status *status)
2526 {
2527     int8_t roundingMode;
2528     flag roundNearestEven, increment, isTiny;
2529     int64_t roundIncrement, roundMask, roundBits;
2530
2531     roundingMode = status->float_rounding_mode;
2532     roundNearestEven = ( roundingMode == float_round_nearest_even );
2533     if ( roundingPrecision == 80 ) goto precision80;
2534     if ( roundingPrecision == 64 ) {
2535         roundIncrement = LIT64( 0x0000000000000400 );
2536         roundMask = LIT64( 0x00000000000007FF );
2537     }
2538     else if ( roundingPrecision == 32 ) {
2539         roundIncrement = LIT64( 0x0000008000000000 );
2540         roundMask = LIT64( 0x000000FFFFFFFFFF );
2541     }
2542     else {
2543         goto precision80;
2544     }
2545     zSig0 |= ( zSig1 != 0 );
2546     switch (roundingMode) {
2547     case float_round_nearest_even:
2548     case float_round_ties_away:
2549         break;
2550     case float_round_to_zero:
2551         roundIncrement = 0;
2552         break;
2553     case float_round_up:
2554         roundIncrement = zSign ? 0 : roundMask;
2555         break;
2556     case float_round_down:
2557         roundIncrement = zSign ? roundMask : 0;
2558         break;
2559     default:
2560         abort();
2561     }
2562     roundBits = zSig0 & roundMask;
2563     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2564         if (    ( 0x7FFE < zExp )
2565              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2566            ) {
2567             goto overflow;
2568         }
2569         if ( zExp <= 0 ) {
2570             if (status->flush_to_zero) {
2571                 float_raise(float_flag_output_denormal, status);
2572                 return packFloatx80(zSign, 0, 0);
2573             }
2574             isTiny =
2575                    (status->float_detect_tininess
2576                     == float_tininess_before_rounding)
2577                 || ( zExp < 0 )
2578                 || ( zSig0 <= zSig0 + roundIncrement );
2579             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2580             zExp = 0;
2581             roundBits = zSig0 & roundMask;
2582             if (isTiny && roundBits) {
2583                 float_raise(float_flag_underflow, status);
2584             }
2585             if (roundBits) {
2586                 status->float_exception_flags |= float_flag_inexact;
2587             }
2588             zSig0 += roundIncrement;
2589             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2590             roundIncrement = roundMask + 1;
2591             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2592                 roundMask |= roundIncrement;
2593             }
2594             zSig0 &= ~ roundMask;
2595             return packFloatx80( zSign, zExp, zSig0 );
2596         }
2597     }
2598     if (roundBits) {
2599         status->float_exception_flags |= float_flag_inexact;
2600     }
2601     zSig0 += roundIncrement;
2602     if ( zSig0 < roundIncrement ) {
2603         ++zExp;
2604         zSig0 = LIT64( 0x8000000000000000 );
2605     }
2606     roundIncrement = roundMask + 1;
2607     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2608         roundMask |= roundIncrement;
2609     }
2610     zSig0 &= ~ roundMask;
2611     if ( zSig0 == 0 ) zExp = 0;
2612     return packFloatx80( zSign, zExp, zSig0 );
2613  precision80:
2614     switch (roundingMode) {
2615     case float_round_nearest_even:
2616     case float_round_ties_away:
2617         increment = ((int64_t)zSig1 < 0);
2618         break;
2619     case float_round_to_zero:
2620         increment = 0;
2621         break;
2622     case float_round_up:
2623         increment = !zSign && zSig1;
2624         break;
2625     case float_round_down:
2626         increment = zSign && zSig1;
2627         break;
2628     default:
2629         abort();
2630     }
2631     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2632         if (    ( 0x7FFE < zExp )
2633              || (    ( zExp == 0x7FFE )
2634                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2635                   && increment
2636                 )
2637            ) {
2638             roundMask = 0;
2639  overflow:
2640             float_raise(float_flag_overflow | float_flag_inexact, status);
2641             if (    ( roundingMode == float_round_to_zero )
2642                  || ( zSign && ( roundingMode == float_round_up ) )
2643                  || ( ! zSign && ( roundingMode == float_round_down ) )
2644                ) {
2645                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2646             }
2647             return packFloatx80(zSign,
2648                                 floatx80_infinity_high,
2649                                 floatx80_infinity_low);
2650         }
2651         if ( zExp <= 0 ) {
2652             isTiny =
2653                    (status->float_detect_tininess
2654                     == float_tininess_before_rounding)
2655                 || ( zExp < 0 )
2656                 || ! increment
2657                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2658             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2659             zExp = 0;
2660             if (isTiny && zSig1) {
2661                 float_raise(float_flag_underflow, status);
2662             }
2663             if (zSig1) {
2664                 status->float_exception_flags |= float_flag_inexact;
2665             }
2666             switch (roundingMode) {
2667             case float_round_nearest_even:
2668             case float_round_ties_away:
2669                 increment = ((int64_t)zSig1 < 0);
2670                 break;
2671             case float_round_to_zero:
2672                 increment = 0;
2673                 break;
2674             case float_round_up:
2675                 increment = !zSign && zSig1;
2676                 break;
2677             case float_round_down:
2678                 increment = zSign && zSig1;
2679                 break;
2680             default:
2681                 abort();
2682             }
2683             if ( increment ) {
2684                 ++zSig0;
2685                 zSig0 &=
2686                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2687                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2688             }
2689             return packFloatx80( zSign, zExp, zSig0 );
2690         }
2691     }
2692     if (zSig1) {
2693         status->float_exception_flags |= float_flag_inexact;
2694     }
2695     if ( increment ) {
2696         ++zSig0;
2697         if ( zSig0 == 0 ) {
2698             ++zExp;
2699             zSig0 = LIT64( 0x8000000000000000 );
2700         }
2701         else {
2702             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2703         }
2704     }
2705     else {
2706         if ( zSig0 == 0 ) zExp = 0;
2707     }
2708     return packFloatx80( zSign, zExp, zSig0 );
2709
2710 }
2711
2712 /*----------------------------------------------------------------------------
2713 | Takes an abstract floating-point value having sign `zSign', exponent
2714 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2715 | and returns the proper extended double-precision floating-point value
2716 | corresponding to the abstract input.  This routine is just like
2717 | `roundAndPackFloatx80' except that the input significand does not have to be
2718 | normalized.
2719 *----------------------------------------------------------------------------*/
2720
2721 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2722                                        flag zSign, int32_t zExp,
2723                                        uint64_t zSig0, uint64_t zSig1,
2724                                        float_status *status)
2725 {
2726     int8_t shiftCount;
2727
2728     if ( zSig0 == 0 ) {
2729         zSig0 = zSig1;
2730         zSig1 = 0;
2731         zExp -= 64;
2732     }
2733     shiftCount = countLeadingZeros64( zSig0 );
2734     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2735     zExp -= shiftCount;
2736     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2737                                 zSig0, zSig1, status);
2738
2739 }
2740
2741 /*----------------------------------------------------------------------------
2742 | Returns the least-significant 64 fraction bits of the quadruple-precision
2743 | floating-point value `a'.
2744 *----------------------------------------------------------------------------*/
2745
2746 static inline uint64_t extractFloat128Frac1( float128 a )
2747 {
2748
2749     return a.low;
2750
2751 }
2752
2753 /*----------------------------------------------------------------------------
2754 | Returns the most-significant 48 fraction bits of the quadruple-precision
2755 | floating-point value `a'.
2756 *----------------------------------------------------------------------------*/
2757
2758 static inline uint64_t extractFloat128Frac0( float128 a )
2759 {
2760
2761     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2762
2763 }
2764
2765 /*----------------------------------------------------------------------------
2766 | Returns the exponent bits of the quadruple-precision floating-point value
2767 | `a'.
2768 *----------------------------------------------------------------------------*/
2769
2770 static inline int32_t extractFloat128Exp( float128 a )
2771 {
2772
2773     return ( a.high>>48 ) & 0x7FFF;
2774
2775 }
2776
2777 /*----------------------------------------------------------------------------
2778 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2779 *----------------------------------------------------------------------------*/
2780
2781 static inline flag extractFloat128Sign( float128 a )
2782 {
2783
2784     return a.high>>63;
2785
2786 }
2787
2788 /*----------------------------------------------------------------------------
2789 | Normalizes the subnormal quadruple-precision floating-point value
2790 | represented by the denormalized significand formed by the concatenation of
2791 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2792 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2793 | significand are stored at the location pointed to by `zSig0Ptr', and the
2794 | least significant 64 bits of the normalized significand are stored at the
2795 | location pointed to by `zSig1Ptr'.
2796 *----------------------------------------------------------------------------*/
2797
2798 static void
2799  normalizeFloat128Subnormal(
2800      uint64_t aSig0,
2801      uint64_t aSig1,
2802      int32_t *zExpPtr,
2803      uint64_t *zSig0Ptr,
2804      uint64_t *zSig1Ptr
2805  )
2806 {
2807     int8_t shiftCount;
2808
2809     if ( aSig0 == 0 ) {
2810         shiftCount = countLeadingZeros64( aSig1 ) - 15;
2811         if ( shiftCount < 0 ) {
2812             *zSig0Ptr = aSig1>>( - shiftCount );
2813             *zSig1Ptr = aSig1<<( shiftCount & 63 );
2814         }
2815         else {
2816             *zSig0Ptr = aSig1<<shiftCount;
2817             *zSig1Ptr = 0;
2818         }
2819         *zExpPtr = - shiftCount - 63;
2820     }
2821     else {
2822         shiftCount = countLeadingZeros64( aSig0 ) - 15;
2823         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2824         *zExpPtr = 1 - shiftCount;
2825     }
2826
2827 }
2828
2829 /*----------------------------------------------------------------------------
2830 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2831 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2832 | floating-point value, returning the result.  After being shifted into the
2833 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2834 | added together to form the most significant 32 bits of the result.  This
2835 | means that any integer portion of `zSig0' will be added into the exponent.
2836 | Since a properly normalized significand will have an integer portion equal
2837 | to 1, the `zExp' input should be 1 less than the desired result exponent
2838 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2839 | significand.
2840 *----------------------------------------------------------------------------*/
2841
2842 static inline float128
2843  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2844 {
2845     float128 z;
2846
2847     z.low = zSig1;
2848     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2849     return z;
2850
2851 }
2852
2853 /*----------------------------------------------------------------------------
2854 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2855 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2856 | and `zSig2', and returns the proper quadruple-precision floating-point value
2857 | corresponding to the abstract input.  Ordinarily, the abstract value is
2858 | simply rounded and packed into the quadruple-precision format, with the
2859 | inexact exception raised if the abstract input cannot be represented
2860 | exactly.  However, if the abstract value is too large, the overflow and
2861 | inexact exceptions are raised and an infinity or maximal finite value is
2862 | returned.  If the abstract value is too small, the input value is rounded to
2863 | a subnormal number, and the underflow and inexact exceptions are raised if
2864 | the abstract input cannot be represented exactly as a subnormal quadruple-
2865 | precision floating-point number.
2866 |     The input significand must be normalized or smaller.  If the input
2867 | significand is not normalized, `zExp' must be 0; in that case, the result
2868 | returned is a subnormal number, and it must not require rounding.  In the
2869 | usual case that the input significand is normalized, `zExp' must be 1 less
2870 | than the ``true'' floating-point exponent.  The handling of underflow and
2871 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2872 *----------------------------------------------------------------------------*/
2873
2874 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2875                                      uint64_t zSig0, uint64_t zSig1,
2876                                      uint64_t zSig2, float_status *status)
2877 {
2878     int8_t roundingMode;
2879     flag roundNearestEven, increment, isTiny;
2880
2881     roundingMode = status->float_rounding_mode;
2882     roundNearestEven = ( roundingMode == float_round_nearest_even );
2883     switch (roundingMode) {
2884     case float_round_nearest_even:
2885     case float_round_ties_away:
2886         increment = ((int64_t)zSig2 < 0);
2887         break;
2888     case float_round_to_zero:
2889         increment = 0;
2890         break;
2891     case float_round_up:
2892         increment = !zSign && zSig2;
2893         break;
2894     case float_round_down:
2895         increment = zSign && zSig2;
2896         break;
2897     case float_round_to_odd:
2898         increment = !(zSig1 & 0x1) && zSig2;
2899         break;
2900     default:
2901         abort();
2902     }
2903     if ( 0x7FFD <= (uint32_t) zExp ) {
2904         if (    ( 0x7FFD < zExp )
2905              || (    ( zExp == 0x7FFD )
2906                   && eq128(
2907                          LIT64( 0x0001FFFFFFFFFFFF ),
2908                          LIT64( 0xFFFFFFFFFFFFFFFF ),
2909                          zSig0,
2910                          zSig1
2911                      )
2912                   && increment
2913                 )
2914            ) {
2915             float_raise(float_flag_overflow | float_flag_inexact, status);
2916             if (    ( roundingMode == float_round_to_zero )
2917                  || ( zSign && ( roundingMode == float_round_up ) )
2918                  || ( ! zSign && ( roundingMode == float_round_down ) )
2919                  || (roundingMode == float_round_to_odd)
2920                ) {
2921                 return
2922                     packFloat128(
2923                         zSign,
2924                         0x7FFE,
2925                         LIT64( 0x0000FFFFFFFFFFFF ),
2926                         LIT64( 0xFFFFFFFFFFFFFFFF )
2927                     );
2928             }
2929             return packFloat128( zSign, 0x7FFF, 0, 0 );
2930         }
2931         if ( zExp < 0 ) {
2932             if (status->flush_to_zero) {
2933                 float_raise(float_flag_output_denormal, status);
2934                 return packFloat128(zSign, 0, 0, 0);
2935             }
2936             isTiny =
2937                    (status->float_detect_tininess
2938                     == float_tininess_before_rounding)
2939                 || ( zExp < -1 )
2940                 || ! increment
2941                 || lt128(
2942                        zSig0,
2943                        zSig1,
2944                        LIT64( 0x0001FFFFFFFFFFFF ),
2945                        LIT64( 0xFFFFFFFFFFFFFFFF )
2946                    );
2947             shift128ExtraRightJamming(
2948                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2949             zExp = 0;
2950             if (isTiny && zSig2) {
2951                 float_raise(float_flag_underflow, status);
2952             }
2953             switch (roundingMode) {
2954             case float_round_nearest_even:
2955             case float_round_ties_away:
2956                 increment = ((int64_t)zSig2 < 0);
2957                 break;
2958             case float_round_to_zero:
2959                 increment = 0;
2960                 break;
2961             case float_round_up:
2962                 increment = !zSign && zSig2;
2963                 break;
2964             case float_round_down:
2965                 increment = zSign && zSig2;
2966                 break;
2967             case float_round_to_odd:
2968                 increment = !(zSig1 & 0x1) && zSig2;
2969                 break;
2970             default:
2971                 abort();
2972             }
2973         }
2974     }
2975     if (zSig2) {
2976         status->float_exception_flags |= float_flag_inexact;
2977     }
2978     if ( increment ) {
2979         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2980         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2981     }
2982     else {
2983         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2984     }
2985     return packFloat128( zSign, zExp, zSig0, zSig1 );
2986
2987 }
2988
2989 /*----------------------------------------------------------------------------
2990 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2991 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2992 | returns the proper quadruple-precision floating-point value corresponding
2993 | to the abstract input.  This routine is just like `roundAndPackFloat128'
2994 | except that the input significand has fewer bits and does not have to be
2995 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
2996 | point exponent.
2997 *----------------------------------------------------------------------------*/
2998
2999 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3000                                               uint64_t zSig0, uint64_t zSig1,
3001                                               float_status *status)
3002 {
3003     int8_t shiftCount;
3004     uint64_t zSig2;
3005
3006     if ( zSig0 == 0 ) {
3007         zSig0 = zSig1;
3008         zSig1 = 0;
3009         zExp -= 64;
3010     }
3011     shiftCount = countLeadingZeros64( zSig0 ) - 15;
3012     if ( 0 <= shiftCount ) {
3013         zSig2 = 0;
3014         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3015     }
3016     else {
3017         shift128ExtraRightJamming(
3018             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3019     }
3020     zExp -= shiftCount;
3021     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3022
3023 }
3024
3025
3026 /*----------------------------------------------------------------------------
3027 | Returns the result of converting the 32-bit two's complement integer `a'
3028 | to the extended double-precision floating-point format.  The conversion
3029 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3030 | Arithmetic.
3031 *----------------------------------------------------------------------------*/
3032
3033 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3034 {
3035     flag zSign;
3036     uint32_t absA;
3037     int8_t shiftCount;
3038     uint64_t zSig;
3039
3040     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3041     zSign = ( a < 0 );
3042     absA = zSign ? - a : a;
3043     shiftCount = countLeadingZeros32( absA ) + 32;
3044     zSig = absA;
3045     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3046
3047 }
3048
3049 /*----------------------------------------------------------------------------
3050 | Returns the result of converting the 32-bit two's complement integer `a' to
3051 | the quadruple-precision floating-point format.  The conversion is performed
3052 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3053 *----------------------------------------------------------------------------*/
3054
3055 float128 int32_to_float128(int32_t a, float_status *status)
3056 {
3057     flag zSign;
3058     uint32_t absA;
3059     int8_t shiftCount;
3060     uint64_t zSig0;
3061
3062     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3063     zSign = ( a < 0 );
3064     absA = zSign ? - a : a;
3065     shiftCount = countLeadingZeros32( absA ) + 17;
3066     zSig0 = absA;
3067     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3068
3069 }
3070
3071 /*----------------------------------------------------------------------------
3072 | Returns the result of converting the 64-bit two's complement integer `a'
3073 | to the extended double-precision floating-point format.  The conversion
3074 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3075 | Arithmetic.
3076 *----------------------------------------------------------------------------*/
3077
3078 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3079 {
3080     flag zSign;
3081     uint64_t absA;
3082     int8_t shiftCount;
3083
3084     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3085     zSign = ( a < 0 );
3086     absA = zSign ? - a : a;
3087     shiftCount = countLeadingZeros64( absA );
3088     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3089
3090 }
3091
3092 /*----------------------------------------------------------------------------
3093 | Returns the result of converting the 64-bit two's complement integer `a' to
3094 | the quadruple-precision floating-point format.  The conversion is performed
3095 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3096 *----------------------------------------------------------------------------*/
3097
3098 float128 int64_to_float128(int64_t a, float_status *status)
3099 {
3100     flag zSign;
3101     uint64_t absA;
3102     int8_t shiftCount;
3103     int32_t zExp;
3104     uint64_t zSig0, zSig1;
3105
3106     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3107     zSign = ( a < 0 );
3108     absA = zSign ? - a : a;
3109     shiftCount = countLeadingZeros64( absA ) + 49;
3110     zExp = 0x406E - shiftCount;
3111     if ( 64 <= shiftCount ) {
3112         zSig1 = 0;
3113         zSig0 = absA;
3114         shiftCount -= 64;
3115     }
3116     else {
3117         zSig1 = absA;
3118         zSig0 = 0;
3119     }
3120     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3121     return packFloat128( zSign, zExp, zSig0, zSig1 );
3122
3123 }
3124
3125 /*----------------------------------------------------------------------------
3126 | Returns the result of converting the 64-bit unsigned integer `a'
3127 | to the quadruple-precision floating-point format.  The conversion is performed
3128 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3129 *----------------------------------------------------------------------------*/
3130
3131 float128 uint64_to_float128(uint64_t a, float_status *status)
3132 {
3133     if (a == 0) {
3134         return float128_zero;
3135     }
3136     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3137 }
3138
3139
3140
3141
3142 /*----------------------------------------------------------------------------
3143 | Returns the result of converting the single-precision floating-point value
3144 | `a' to the double-precision floating-point format.  The conversion is
3145 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3146 | Arithmetic.
3147 *----------------------------------------------------------------------------*/
3148
3149 float64 float32_to_float64(float32 a, float_status *status)
3150 {
3151     flag aSign;
3152     int aExp;
3153     uint32_t aSig;
3154     a = float32_squash_input_denormal(a, status);
3155
3156     aSig = extractFloat32Frac( a );
3157     aExp = extractFloat32Exp( a );
3158     aSign = extractFloat32Sign( a );
3159     if ( aExp == 0xFF ) {
3160         if (aSig) {
3161             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3162         }
3163         return packFloat64( aSign, 0x7FF, 0 );
3164     }
3165     if ( aExp == 0 ) {
3166         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3167         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3168         --aExp;
3169     }
3170     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3171
3172 }
3173
3174 /*----------------------------------------------------------------------------
3175 | Returns the result of converting the single-precision floating-point value
3176 | `a' to the extended double-precision floating-point format.  The conversion
3177 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3178 | Arithmetic.
3179 *----------------------------------------------------------------------------*/
3180
3181 floatx80 float32_to_floatx80(float32 a, float_status *status)
3182 {
3183     flag aSign;
3184     int aExp;
3185     uint32_t aSig;
3186
3187     a = float32_squash_input_denormal(a, status);
3188     aSig = extractFloat32Frac( a );
3189     aExp = extractFloat32Exp( a );
3190     aSign = extractFloat32Sign( a );
3191     if ( aExp == 0xFF ) {
3192         if (aSig) {
3193             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3194         }
3195         return packFloatx80(aSign,
3196                             floatx80_infinity_high,
3197                             floatx80_infinity_low);
3198     }
3199     if ( aExp == 0 ) {
3200         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3201         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3202     }
3203     aSig |= 0x00800000;
3204     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3205
3206 }
3207
3208 /*----------------------------------------------------------------------------
3209 | Returns the result of converting the single-precision floating-point value
3210 | `a' to the double-precision floating-point format.  The conversion is
3211 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3212 | Arithmetic.
3213 *----------------------------------------------------------------------------*/
3214
3215 float128 float32_to_float128(float32 a, float_status *status)
3216 {
3217     flag aSign;
3218     int aExp;
3219     uint32_t aSig;
3220
3221     a = float32_squash_input_denormal(a, status);
3222     aSig = extractFloat32Frac( a );
3223     aExp = extractFloat32Exp( a );
3224     aSign = extractFloat32Sign( a );
3225     if ( aExp == 0xFF ) {
3226         if (aSig) {
3227             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3228         }
3229         return packFloat128( aSign, 0x7FFF, 0, 0 );
3230     }
3231     if ( aExp == 0 ) {
3232         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3233         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3234         --aExp;
3235     }
3236     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3237
3238 }
3239
3240 /*----------------------------------------------------------------------------
3241 | Returns the remainder of the single-precision floating-point value `a'
3242 | with respect to the corresponding value `b'.  The operation is performed
3243 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3244 *----------------------------------------------------------------------------*/
3245
3246 float32 float32_rem(float32 a, float32 b, float_status *status)
3247 {
3248     flag aSign, zSign;
3249     int aExp, bExp, expDiff;
3250     uint32_t aSig, bSig;
3251     uint32_t q;
3252     uint64_t aSig64, bSig64, q64;
3253     uint32_t alternateASig;
3254     int32_t sigMean;
3255     a = float32_squash_input_denormal(a, status);
3256     b = float32_squash_input_denormal(b, status);
3257
3258     aSig = extractFloat32Frac( a );
3259     aExp = extractFloat32Exp( a );
3260     aSign = extractFloat32Sign( a );
3261     bSig = extractFloat32Frac( b );
3262     bExp = extractFloat32Exp( b );
3263     if ( aExp == 0xFF ) {
3264         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3265             return propagateFloat32NaN(a, b, status);
3266         }
3267         float_raise(float_flag_invalid, status);
3268         return float32_default_nan(status);
3269     }
3270     if ( bExp == 0xFF ) {
3271         if (bSig) {
3272             return propagateFloat32NaN(a, b, status);
3273         }
3274         return a;
3275     }
3276     if ( bExp == 0 ) {
3277         if ( bSig == 0 ) {
3278             float_raise(float_flag_invalid, status);
3279             return float32_default_nan(status);
3280         }
3281         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3282     }
3283     if ( aExp == 0 ) {
3284         if ( aSig == 0 ) return a;
3285         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3286     }
3287     expDiff = aExp - bExp;
3288     aSig |= 0x00800000;
3289     bSig |= 0x00800000;
3290     if ( expDiff < 32 ) {
3291         aSig <<= 8;
3292         bSig <<= 8;
3293         if ( expDiff < 0 ) {
3294             if ( expDiff < -1 ) return a;
3295             aSig >>= 1;
3296         }
3297         q = ( bSig <= aSig );
3298         if ( q ) aSig -= bSig;
3299         if ( 0 < expDiff ) {
3300             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3301             q >>= 32 - expDiff;
3302             bSig >>= 2;
3303             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3304         }
3305         else {
3306             aSig >>= 2;
3307             bSig >>= 2;
3308         }
3309     }
3310     else {
3311         if ( bSig <= aSig ) aSig -= bSig;
3312         aSig64 = ( (uint64_t) aSig )<<40;
3313         bSig64 = ( (uint64_t) bSig )<<40;
3314         expDiff -= 64;
3315         while ( 0 < expDiff ) {
3316             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3317             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3318             aSig64 = - ( ( bSig * q64 )<<38 );
3319             expDiff -= 62;
3320         }
3321         expDiff += 64;
3322         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3323         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3324         q = q64>>( 64 - expDiff );
3325         bSig <<= 6;
3326         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3327     }
3328     do {
3329         alternateASig = aSig;
3330         ++q;
3331         aSig -= bSig;
3332     } while ( 0 <= (int32_t) aSig );
3333     sigMean = aSig + alternateASig;
3334     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3335         aSig = alternateASig;
3336     }
3337     zSign = ( (int32_t) aSig < 0 );
3338     if ( zSign ) aSig = - aSig;
3339     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3340 }
3341
3342
3343
3344 /*----------------------------------------------------------------------------
3345 | Returns the binary exponential of the single-precision floating-point value
3346 | `a'. The operation is performed according to the IEC/IEEE Standard for
3347 | Binary Floating-Point Arithmetic.
3348 |
3349 | Uses the following identities:
3350 |
3351 | 1. -------------------------------------------------------------------------
3352 |      x    x*ln(2)
3353 |     2  = e
3354 |
3355 | 2. -------------------------------------------------------------------------
3356 |                      2     3     4     5           n
3357 |      x        x     x     x     x     x           x
3358 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3359 |               1!    2!    3!    4!    5!          n!
3360 *----------------------------------------------------------------------------*/
3361
3362 static const float64 float32_exp2_coefficients[15] =
3363 {
3364     const_float64( 0x3ff0000000000000ll ), /*  1 */
3365     const_float64( 0x3fe0000000000000ll ), /*  2 */
3366     const_float64( 0x3fc5555555555555ll ), /*  3 */
3367     const_float64( 0x3fa5555555555555ll ), /*  4 */
3368     const_float64( 0x3f81111111111111ll ), /*  5 */
3369     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3370     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3371     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3372     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3373     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3374     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3375     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3376     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3377     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3378     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3379 };
3380
3381 float32 float32_exp2(float32 a, float_status *status)
3382 {
3383     flag aSign;
3384     int aExp;
3385     uint32_t aSig;
3386     float64 r, x, xn;
3387     int i;
3388     a = float32_squash_input_denormal(a, status);
3389
3390     aSig = extractFloat32Frac( a );
3391     aExp = extractFloat32Exp( a );
3392     aSign = extractFloat32Sign( a );
3393
3394     if ( aExp == 0xFF) {
3395         if (aSig) {
3396             return propagateFloat32NaN(a, float32_zero, status);
3397         }
3398         return (aSign) ? float32_zero : a;
3399     }
3400     if (aExp == 0) {
3401         if (aSig == 0) return float32_one;
3402     }
3403
3404     float_raise(float_flag_inexact, status);
3405
3406     /* ******************************* */
3407     /* using float64 for approximation */
3408     /* ******************************* */
3409     x = float32_to_float64(a, status);
3410     x = float64_mul(x, float64_ln2, status);
3411
3412     xn = x;
3413     r = float64_one;
3414     for (i = 0 ; i < 15 ; i++) {
3415         float64 f;
3416
3417         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3418         r = float64_add(r, f, status);
3419
3420         xn = float64_mul(xn, x, status);
3421     }
3422
3423     return float64_to_float32(r, status);
3424 }
3425
3426 /*----------------------------------------------------------------------------
3427 | Returns the binary log of the single-precision floating-point value `a'.
3428 | The operation is performed according to the IEC/IEEE Standard for Binary
3429 | Floating-Point Arithmetic.
3430 *----------------------------------------------------------------------------*/
3431 float32 float32_log2(float32 a, float_status *status)
3432 {
3433     flag aSign, zSign;
3434     int aExp;
3435     uint32_t aSig, zSig, i;
3436
3437     a = float32_squash_input_denormal(a, status);
3438     aSig = extractFloat32Frac( a );
3439     aExp = extractFloat32Exp( a );
3440     aSign = extractFloat32Sign( a );
3441
3442     if ( aExp == 0 ) {
3443         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3444         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3445     }
3446     if ( aSign ) {
3447         float_raise(float_flag_invalid, status);
3448         return float32_default_nan(status);
3449     }
3450     if ( aExp == 0xFF ) {
3451         if (aSig) {
3452             return propagateFloat32NaN(a, float32_zero, status);
3453         }
3454         return a;
3455     }
3456
3457     aExp -= 0x7F;
3458     aSig |= 0x00800000;
3459     zSign = aExp < 0;
3460     zSig = aExp << 23;
3461
3462     for (i = 1 << 22; i > 0; i >>= 1) {
3463         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3464         if ( aSig & 0x01000000 ) {
3465             aSig >>= 1;
3466             zSig |= i;
3467         }
3468     }
3469
3470     if ( zSign )
3471         zSig = -zSig;
3472
3473     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3474 }
3475
3476 /*----------------------------------------------------------------------------
3477 | Returns 1 if the single-precision floating-point value `a' is equal to
3478 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3479 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3480 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3481 *----------------------------------------------------------------------------*/
3482
3483 int float32_eq(float32 a, float32 b, float_status *status)
3484 {
3485     uint32_t av, bv;
3486     a = float32_squash_input_denormal(a, status);
3487     b = float32_squash_input_denormal(b, status);
3488
3489     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3490          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3491        ) {
3492         float_raise(float_flag_invalid, status);
3493         return 0;
3494     }
3495     av = float32_val(a);
3496     bv = float32_val(b);
3497     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3498 }
3499
3500 /*----------------------------------------------------------------------------
3501 | Returns 1 if the single-precision floating-point value `a' is less than
3502 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3503 | exception is raised if either operand is a NaN.  The comparison is performed
3504 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3505 *----------------------------------------------------------------------------*/
3506
3507 int float32_le(float32 a, float32 b, float_status *status)
3508 {
3509     flag aSign, bSign;
3510     uint32_t av, bv;
3511     a = float32_squash_input_denormal(a, status);
3512     b = float32_squash_input_denormal(b, status);
3513
3514     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3515          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3516        ) {
3517         float_raise(float_flag_invalid, status);
3518         return 0;
3519     }
3520     aSign = extractFloat32Sign( a );
3521     bSign = extractFloat32Sign( b );
3522     av = float32_val(a);
3523     bv = float32_val(b);
3524     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3525     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3526
3527 }
3528
3529 /*----------------------------------------------------------------------------
3530 | Returns 1 if the single-precision floating-point value `a' is less than
3531 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3532 | raised if either operand is a NaN.  The comparison is performed according
3533 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3534 *----------------------------------------------------------------------------*/
3535
3536 int float32_lt(float32 a, float32 b, float_status *status)
3537 {
3538     flag aSign, bSign;
3539     uint32_t av, bv;
3540     a = float32_squash_input_denormal(a, status);
3541     b = float32_squash_input_denormal(b, status);
3542
3543     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3544          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3545        ) {
3546         float_raise(float_flag_invalid, status);
3547         return 0;
3548     }
3549     aSign = extractFloat32Sign( a );
3550     bSign = extractFloat32Sign( b );
3551     av = float32_val(a);
3552     bv = float32_val(b);
3553     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3554     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3555
3556 }
3557
3558 /*----------------------------------------------------------------------------
3559 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3560 | be compared, and 0 otherwise.  The invalid exception is raised if either
3561 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3562 | Standard for Binary Floating-Point Arithmetic.
3563 *----------------------------------------------------------------------------*/
3564
3565 int float32_unordered(float32 a, float32 b, float_status *status)
3566 {
3567     a = float32_squash_input_denormal(a, status);
3568     b = float32_squash_input_denormal(b, status);
3569
3570     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3571          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3572        ) {
3573         float_raise(float_flag_invalid, status);
3574         return 1;
3575     }
3576     return 0;
3577 }
3578
3579 /*----------------------------------------------------------------------------
3580 | Returns 1 if the single-precision floating-point value `a' is equal to
3581 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3582 | exception.  The comparison is performed according to the IEC/IEEE Standard
3583 | for Binary Floating-Point Arithmetic.
3584 *----------------------------------------------------------------------------*/
3585
3586 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3587 {
3588     a = float32_squash_input_denormal(a, status);
3589     b = float32_squash_input_denormal(b, status);
3590
3591     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3592          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3593        ) {
3594         if (float32_is_signaling_nan(a, status)
3595          || float32_is_signaling_nan(b, status)) {
3596             float_raise(float_flag_invalid, status);
3597         }
3598         return 0;
3599     }
3600     return ( float32_val(a) == float32_val(b) ) ||
3601             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3602 }
3603
3604 /*----------------------------------------------------------------------------
3605 | Returns 1 if the single-precision floating-point value `a' is less than or
3606 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3607 | cause an exception.  Otherwise, the comparison is performed according to the
3608 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3609 *----------------------------------------------------------------------------*/
3610
3611 int float32_le_quiet(float32 a, float32 b, float_status *status)
3612 {
3613     flag aSign, bSign;
3614     uint32_t av, bv;
3615     a = float32_squash_input_denormal(a, status);
3616     b = float32_squash_input_denormal(b, status);
3617
3618     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3619          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3620        ) {
3621         if (float32_is_signaling_nan(a, status)
3622          || float32_is_signaling_nan(b, status)) {
3623             float_raise(float_flag_invalid, status);
3624         }
3625         return 0;
3626     }
3627     aSign = extractFloat32Sign( a );
3628     bSign = extractFloat32Sign( b );
3629     av = float32_val(a);
3630     bv = float32_val(b);
3631     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3632     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3633
3634 }
3635
3636 /*----------------------------------------------------------------------------
3637 | Returns 1 if the single-precision floating-point value `a' is less than
3638 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3639 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3640 | Standard for Binary Floating-Point Arithmetic.
3641 *----------------------------------------------------------------------------*/
3642
3643 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3644 {
3645     flag aSign, bSign;
3646     uint32_t av, bv;
3647     a = float32_squash_input_denormal(a, status);
3648     b = float32_squash_input_denormal(b, status);
3649
3650     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3651          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3652        ) {
3653         if (float32_is_signaling_nan(a, status)
3654          || float32_is_signaling_nan(b, status)) {
3655             float_raise(float_flag_invalid, status);
3656         }
3657         return 0;
3658     }
3659     aSign = extractFloat32Sign( a );
3660     bSign = extractFloat32Sign( b );
3661     av = float32_val(a);
3662     bv = float32_val(b);
3663     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3664     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3665
3666 }
3667
3668 /*----------------------------------------------------------------------------
3669 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3670 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3671 | comparison is performed according to the IEC/IEEE Standard for Binary
3672 | Floating-Point Arithmetic.
3673 *----------------------------------------------------------------------------*/
3674
3675 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3676 {
3677     a = float32_squash_input_denormal(a, status);
3678     b = float32_squash_input_denormal(b, status);
3679
3680     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3681          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3682        ) {
3683         if (float32_is_signaling_nan(a, status)
3684          || float32_is_signaling_nan(b, status)) {
3685             float_raise(float_flag_invalid, status);
3686         }
3687         return 1;
3688     }
3689     return 0;
3690 }
3691
3692
3693 /*----------------------------------------------------------------------------
3694 | Returns the result of converting the double-precision floating-point value
3695 | `a' to the single-precision floating-point format.  The conversion is
3696 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3697 | Arithmetic.
3698 *----------------------------------------------------------------------------*/
3699
3700 float32 float64_to_float32(float64 a, float_status *status)
3701 {
3702     flag aSign;
3703     int aExp;
3704     uint64_t aSig;
3705     uint32_t zSig;
3706     a = float64_squash_input_denormal(a, status);
3707
3708     aSig = extractFloat64Frac( a );
3709     aExp = extractFloat64Exp( a );
3710     aSign = extractFloat64Sign( a );
3711     if ( aExp == 0x7FF ) {
3712         if (aSig) {
3713             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3714         }
3715         return packFloat32( aSign, 0xFF, 0 );
3716     }
3717     shift64RightJamming( aSig, 22, &aSig );
3718     zSig = aSig;
3719     if ( aExp || zSig ) {
3720         zSig |= 0x40000000;
3721         aExp -= 0x381;
3722     }
3723     return roundAndPackFloat32(aSign, aExp, zSig, status);
3724
3725 }
3726
3727
3728 /*----------------------------------------------------------------------------
3729 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3730 | half-precision floating-point value, returning the result.  After being
3731 | shifted into the proper positions, the three fields are simply added
3732 | together to form the result.  This means that any integer portion of `zSig'
3733 | will be added into the exponent.  Since a properly normalized significand
3734 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3735 | than the desired result exponent whenever `zSig' is a complete, normalized
3736 | significand.
3737 *----------------------------------------------------------------------------*/
3738 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3739 {
3740     return make_float16(
3741         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3742 }
3743
3744 /*----------------------------------------------------------------------------
3745 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3746 | and significand `zSig', and returns the proper half-precision floating-
3747 | point value corresponding to the abstract input.  Ordinarily, the abstract
3748 | value is simply rounded and packed into the half-precision format, with
3749 | the inexact exception raised if the abstract input cannot be represented
3750 | exactly.  However, if the abstract value is too large, the overflow and
3751 | inexact exceptions are raised and an infinity or maximal finite value is
3752 | returned.  If the abstract value is too small, the input value is rounded to
3753 | a subnormal number, and the underflow and inexact exceptions are raised if
3754 | the abstract input cannot be represented exactly as a subnormal half-
3755 | precision floating-point number.
3756 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3757 | ARM-style "alternative representation", which omits the NaN and Inf
3758 | encodings in order to raise the maximum representable exponent by one.
3759 |     The input significand `zSig' has its binary point between bits 22
3760 | and 23, which is 13 bits to the left of the usual location.  This shifted
3761 | significand must be normalized or smaller.  If `zSig' is not normalized,
3762 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3763 | and it must not require rounding.  In the usual case that `zSig' is
3764 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3765 | Note the slightly odd position of the binary point in zSig compared with the
3766 | other roundAndPackFloat functions. This should probably be fixed if we
3767 | need to implement more float16 routines than just conversion.
3768 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3769 | Binary Floating-Point Arithmetic.
3770 *----------------------------------------------------------------------------*/
3771
3772 static float16 roundAndPackFloat16(flag zSign, int zExp,
3773                                    uint32_t zSig, flag ieee,
3774                                    float_status *status)
3775 {
3776     int maxexp = ieee ? 29 : 30;
3777     uint32_t mask;
3778     uint32_t increment;
3779     bool rounding_bumps_exp;
3780     bool is_tiny = false;
3781
3782     /* Calculate the mask of bits of the mantissa which are not
3783      * representable in half-precision and will be lost.
3784      */
3785     if (zExp < 1) {
3786         /* Will be denormal in halfprec */
3787         mask = 0x00ffffff;
3788         if (zExp >= -11) {
3789             mask >>= 11 + zExp;
3790         }
3791     } else {
3792         /* Normal number in halfprec */
3793         mask = 0x00001fff;
3794     }
3795
3796     switch (status->float_rounding_mode) {
3797     case float_round_nearest_even:
3798         increment = (mask + 1) >> 1;
3799         if ((zSig & mask) == increment) {
3800             increment = zSig & (increment << 1);
3801         }
3802         break;
3803     case float_round_ties_away:
3804         increment = (mask + 1) >> 1;
3805         break;
3806     case float_round_up:
3807         increment = zSign ? 0 : mask;
3808         break;
3809     case float_round_down:
3810         increment = zSign ? mask : 0;
3811         break;
3812     default: /* round_to_zero */
3813         increment = 0;
3814         break;
3815     }
3816
3817     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3818
3819     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3820         if (ieee) {
3821             float_raise(float_flag_overflow | float_flag_inexact, status);
3822             return packFloat16(zSign, 0x1f, 0);
3823         } else {
3824             float_raise(float_flag_invalid, status);
3825             return packFloat16(zSign, 0x1f, 0x3ff);
3826         }
3827     }
3828
3829     if (zExp < 0) {
3830         /* Note that flush-to-zero does not affect half-precision results */
3831         is_tiny =
3832             (status->float_detect_tininess == float_tininess_before_rounding)
3833             || (zExp < -1)
3834             || (!rounding_bumps_exp);
3835     }
3836     if (zSig & mask) {
3837         float_raise(float_flag_inexact, status);
3838         if (is_tiny) {
3839             float_raise(float_flag_underflow, status);
3840         }
3841     }
3842
3843     zSig += increment;
3844     if (rounding_bumps_exp) {
3845         zSig >>= 1;
3846         zExp++;
3847     }
3848
3849     if (zExp < -10) {
3850         return packFloat16(zSign, 0, 0);
3851     }
3852     if (zExp < 0) {
3853         zSig >>= -zExp;
3854         zExp = 0;
3855     }
3856     return packFloat16(zSign, zExp, zSig >> 13);
3857 }
3858
3859 /*----------------------------------------------------------------------------
3860 | If `a' is denormal and we are in flush-to-zero mode then set the
3861 | input-denormal exception and return zero. Otherwise just return the value.
3862 *----------------------------------------------------------------------------*/
3863 float16 float16_squash_input_denormal(float16 a, float_status *status)
3864 {
3865     if (status->flush_inputs_to_zero) {
3866         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3867             float_raise(float_flag_input_denormal, status);
3868             return make_float16(float16_val(a) & 0x8000);
3869         }
3870     }
3871     return a;
3872 }
3873
3874 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3875                                       uint32_t *zSigPtr)
3876 {
3877     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3878     *zSigPtr = aSig << shiftCount;
3879     *zExpPtr = 1 - shiftCount;
3880 }
3881
3882 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3883    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3884
3885 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3886 {
3887     flag aSign;
3888     int aExp;
3889     uint32_t aSig;
3890
3891     aSign = extractFloat16Sign(a);
3892     aExp = extractFloat16Exp(a);
3893     aSig = extractFloat16Frac(a);
3894
3895     if (aExp == 0x1f && ieee) {
3896         if (aSig) {
3897             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3898         }
3899         return packFloat32(aSign, 0xff, 0);
3900     }
3901     if (aExp == 0) {
3902         if (aSig == 0) {
3903             return packFloat32(aSign, 0, 0);
3904         }
3905
3906         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3907         aExp--;
3908     }
3909     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3910 }
3911
3912 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3913 {
3914     flag aSign;
3915     int aExp;
3916     uint32_t aSig;
3917
3918     a = float32_squash_input_denormal(a, status);
3919
3920     aSig = extractFloat32Frac( a );
3921     aExp = extractFloat32Exp( a );
3922     aSign = extractFloat32Sign( a );
3923     if ( aExp == 0xFF ) {
3924         if (aSig) {
3925             /* Input is a NaN */
3926             if (!ieee) {
3927                 float_raise(float_flag_invalid, status);
3928                 return packFloat16(aSign, 0, 0);
3929             }
3930             return commonNaNToFloat16(
3931                 float32ToCommonNaN(a, status), status);
3932         }
3933         /* Infinity */
3934         if (!ieee) {
3935             float_raise(float_flag_invalid, status);
3936             return packFloat16(aSign, 0x1f, 0x3ff);
3937         }
3938         return packFloat16(aSign, 0x1f, 0);
3939     }
3940     if (aExp == 0 && aSig == 0) {
3941         return packFloat16(aSign, 0, 0);
3942     }
3943     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3944      * even if the input is denormal; however this is harmless because
3945      * the largest possible single-precision denormal is still smaller
3946      * than the smallest representable half-precision denormal, and so we
3947      * will end up ignoring aSig and returning via the "always return zero"
3948      * codepath.
3949      */
3950     aSig |= 0x00800000;
3951     aExp -= 0x71;
3952
3953     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3954 }
3955
3956 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3957 {
3958     flag aSign;
3959     int aExp;
3960     uint32_t aSig;
3961
3962     aSign = extractFloat16Sign(a);
3963     aExp = extractFloat16Exp(a);
3964     aSig = extractFloat16Frac(a);
3965
3966     if (aExp == 0x1f && ieee) {
3967         if (aSig) {
3968             return commonNaNToFloat64(
3969                 float16ToCommonNaN(a, status), status);
3970         }
3971         return packFloat64(aSign, 0x7ff, 0);
3972     }
3973     if (aExp == 0) {
3974         if (aSig == 0) {
3975             return packFloat64(aSign, 0, 0);
3976         }
3977
3978         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3979         aExp--;
3980     }
3981     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3982 }
3983
3984 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3985 {
3986     flag aSign;
3987     int aExp;
3988     uint64_t aSig;
3989     uint32_t zSig;
3990
3991     a = float64_squash_input_denormal(a, status);
3992
3993     aSig = extractFloat64Frac(a);
3994     aExp = extractFloat64Exp(a);
3995     aSign = extractFloat64Sign(a);
3996     if (aExp == 0x7FF) {
3997         if (aSig) {
3998             /* Input is a NaN */
3999             if (!ieee) {
4000                 float_raise(float_flag_invalid, status);
4001                 return packFloat16(aSign, 0, 0);
4002             }
4003             return commonNaNToFloat16(
4004                 float64ToCommonNaN(a, status), status);
4005         }
4006         /* Infinity */
4007         if (!ieee) {
4008             float_raise(float_flag_invalid, status);
4009             return packFloat16(aSign, 0x1f, 0x3ff);
4010         }
4011         return packFloat16(aSign, 0x1f, 0);
4012     }
4013     shift64RightJamming(aSig, 29, &aSig);
4014     zSig = aSig;
4015     if (aExp == 0 && zSig == 0) {
4016         return packFloat16(aSign, 0, 0);
4017     }
4018     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4019      * even if the input is denormal; however this is harmless because
4020      * the largest possible single-precision denormal is still smaller
4021      * than the smallest representable half-precision denormal, and so we
4022      * will end up ignoring aSig and returning via the "always return zero"
4023      * codepath.
4024      */
4025     zSig |= 0x00800000;
4026     aExp -= 0x3F1;
4027
4028     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4029 }
4030
4031 /*----------------------------------------------------------------------------
4032 | Returns the result of converting the double-precision floating-point value
4033 | `a' to the extended double-precision floating-point format.  The conversion
4034 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4035 | Arithmetic.
4036 *----------------------------------------------------------------------------*/
4037
4038 floatx80 float64_to_floatx80(float64 a, float_status *status)
4039 {
4040     flag aSign;
4041     int aExp;
4042     uint64_t aSig;
4043
4044     a = float64_squash_input_denormal(a, status);
4045     aSig = extractFloat64Frac( a );
4046     aExp = extractFloat64Exp( a );
4047     aSign = extractFloat64Sign( a );
4048     if ( aExp == 0x7FF ) {
4049         if (aSig) {
4050             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4051         }
4052         return packFloatx80(aSign,
4053                             floatx80_infinity_high,
4054                             floatx80_infinity_low);
4055     }
4056     if ( aExp == 0 ) {
4057         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4058         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4059     }
4060     return
4061         packFloatx80(
4062             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4063
4064 }
4065
4066 /*----------------------------------------------------------------------------
4067 | Returns the result of converting the double-precision floating-point value
4068 | `a' to the quadruple-precision floating-point format.  The conversion is
4069 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4070 | Arithmetic.
4071 *----------------------------------------------------------------------------*/
4072
4073 float128 float64_to_float128(float64 a, float_status *status)
4074 {
4075     flag aSign;
4076     int aExp;
4077     uint64_t aSig, zSig0, zSig1;
4078
4079     a = float64_squash_input_denormal(a, status);
4080     aSig = extractFloat64Frac( a );
4081     aExp = extractFloat64Exp( a );
4082     aSign = extractFloat64Sign( a );
4083     if ( aExp == 0x7FF ) {
4084         if (aSig) {
4085             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4086         }
4087         return packFloat128( aSign, 0x7FFF, 0, 0 );
4088     }
4089     if ( aExp == 0 ) {
4090         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4091         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4092         --aExp;
4093     }
4094     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4095     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4096
4097 }
4098
4099
4100 /*----------------------------------------------------------------------------
4101 | Returns the remainder of the double-precision floating-point value `a'
4102 | with respect to the corresponding value `b'.  The operation is performed
4103 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4104 *----------------------------------------------------------------------------*/
4105
4106 float64 float64_rem(float64 a, float64 b, float_status *status)
4107 {
4108     flag aSign, zSign;
4109     int aExp, bExp, expDiff;
4110     uint64_t aSig, bSig;
4111     uint64_t q, alternateASig;
4112     int64_t sigMean;
4113
4114     a = float64_squash_input_denormal(a, status);
4115     b = float64_squash_input_denormal(b, status);
4116     aSig = extractFloat64Frac( a );
4117     aExp = extractFloat64Exp( a );
4118     aSign = extractFloat64Sign( a );
4119     bSig = extractFloat64Frac( b );
4120     bExp = extractFloat64Exp( b );
4121     if ( aExp == 0x7FF ) {
4122         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4123             return propagateFloat64NaN(a, b, status);
4124         }
4125         float_raise(float_flag_invalid, status);
4126         return float64_default_nan(status);
4127     }
4128     if ( bExp == 0x7FF ) {
4129         if (bSig) {
4130             return propagateFloat64NaN(a, b, status);
4131         }
4132         return a;
4133     }
4134     if ( bExp == 0 ) {
4135         if ( bSig == 0 ) {
4136             float_raise(float_flag_invalid, status);
4137             return float64_default_nan(status);
4138         }
4139         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4140     }
4141     if ( aExp == 0 ) {
4142         if ( aSig == 0 ) return a;
4143         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4144     }
4145     expDiff = aExp - bExp;
4146     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4147     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4148     if ( expDiff < 0 ) {
4149         if ( expDiff < -1 ) return a;
4150         aSig >>= 1;
4151     }
4152     q = ( bSig <= aSig );
4153     if ( q ) aSig -= bSig;
4154     expDiff -= 64;
4155     while ( 0 < expDiff ) {
4156         q = estimateDiv128To64( aSig, 0, bSig );
4157         q = ( 2 < q ) ? q - 2 : 0;
4158         aSig = - ( ( bSig>>2 ) * q );
4159         expDiff -= 62;
4160     }
4161     expDiff += 64;
4162     if ( 0 < expDiff ) {
4163         q = estimateDiv128To64( aSig, 0, bSig );
4164         q = ( 2 < q ) ? q - 2 : 0;
4165         q >>= 64 - expDiff;
4166         bSig >>= 2;
4167         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4168     }
4169     else {
4170         aSig >>= 2;
4171         bSig >>= 2;
4172     }
4173     do {
4174         alternateASig = aSig;
4175         ++q;
4176         aSig -= bSig;
4177     } while ( 0 <= (int64_t) aSig );
4178     sigMean = aSig + alternateASig;
4179     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4180         aSig = alternateASig;
4181     }
4182     zSign = ( (int64_t) aSig < 0 );
4183     if ( zSign ) aSig = - aSig;
4184     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4185
4186 }
4187
4188 /*----------------------------------------------------------------------------
4189 | Returns the binary log of the double-precision floating-point value `a'.
4190 | The operation is performed according to the IEC/IEEE Standard for Binary
4191 | Floating-Point Arithmetic.
4192 *----------------------------------------------------------------------------*/
4193 float64 float64_log2(float64 a, float_status *status)
4194 {
4195     flag aSign, zSign;
4196     int aExp;
4197     uint64_t aSig, aSig0, aSig1, zSig, i;
4198     a = float64_squash_input_denormal(a, status);
4199
4200     aSig = extractFloat64Frac( a );
4201     aExp = extractFloat64Exp( a );
4202     aSign = extractFloat64Sign( a );
4203
4204     if ( aExp == 0 ) {
4205         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4206         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4207     }
4208     if ( aSign ) {
4209         float_raise(float_flag_invalid, status);
4210         return float64_default_nan(status);
4211     }
4212     if ( aExp == 0x7FF ) {
4213         if (aSig) {
4214             return propagateFloat64NaN(a, float64_zero, status);
4215         }
4216         return a;
4217     }
4218
4219     aExp -= 0x3FF;
4220     aSig |= LIT64( 0x0010000000000000 );
4221     zSign = aExp < 0;
4222     zSig = (uint64_t)aExp << 52;
4223     for (i = 1LL << 51; i > 0; i >>= 1) {
4224         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4225         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4226         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4227             aSig >>= 1;
4228             zSig |= i;
4229         }
4230     }
4231
4232     if ( zSign )
4233         zSig = -zSig;
4234     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4235 }
4236
4237 /*----------------------------------------------------------------------------
4238 | Returns 1 if the double-precision floating-point value `a' is equal to the
4239 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4240 | if either operand is a NaN.  Otherwise, the comparison is performed
4241 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4242 *----------------------------------------------------------------------------*/
4243
4244 int float64_eq(float64 a, float64 b, float_status *status)
4245 {
4246     uint64_t av, bv;
4247     a = float64_squash_input_denormal(a, status);
4248     b = float64_squash_input_denormal(b, status);
4249
4250     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4251          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4252        ) {
4253         float_raise(float_flag_invalid, status);
4254         return 0;
4255     }
4256     av = float64_val(a);
4257     bv = float64_val(b);
4258     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4259
4260 }
4261
4262 /*----------------------------------------------------------------------------
4263 | Returns 1 if the double-precision floating-point value `a' is less than or
4264 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4265 | exception is raised if either operand is a NaN.  The comparison is performed
4266 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4267 *----------------------------------------------------------------------------*/
4268
4269 int float64_le(float64 a, float64 b, float_status *status)
4270 {
4271     flag aSign, bSign;
4272     uint64_t av, bv;
4273     a = float64_squash_input_denormal(a, status);
4274     b = float64_squash_input_denormal(b, status);
4275
4276     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4277          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4278        ) {
4279         float_raise(float_flag_invalid, status);
4280         return 0;
4281     }
4282     aSign = extractFloat64Sign( a );
4283     bSign = extractFloat64Sign( b );
4284     av = float64_val(a);
4285     bv = float64_val(b);
4286     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4287     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4288
4289 }
4290
4291 /*----------------------------------------------------------------------------
4292 | Returns 1 if the double-precision floating-point value `a' is less than
4293 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4294 | raised if either operand is a NaN.  The comparison is performed according
4295 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4296 *----------------------------------------------------------------------------*/
4297
4298 int float64_lt(float64 a, float64 b, float_status *status)
4299 {
4300     flag aSign, bSign;
4301     uint64_t av, bv;
4302
4303     a = float64_squash_input_denormal(a, status);
4304     b = float64_squash_input_denormal(b, status);
4305     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4306          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4307        ) {
4308         float_raise(float_flag_invalid, status);
4309         return 0;
4310     }
4311     aSign = extractFloat64Sign( a );
4312     bSign = extractFloat64Sign( b );
4313     av = float64_val(a);
4314     bv = float64_val(b);
4315     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4316     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4317
4318 }
4319
4320 /*----------------------------------------------------------------------------
4321 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4322 | be compared, and 0 otherwise.  The invalid exception is raised if either
4323 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4324 | Standard for Binary Floating-Point Arithmetic.
4325 *----------------------------------------------------------------------------*/
4326
4327 int float64_unordered(float64 a, float64 b, float_status *status)
4328 {
4329     a = float64_squash_input_denormal(a, status);
4330     b = float64_squash_input_denormal(b, status);
4331
4332     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4333          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4334        ) {
4335         float_raise(float_flag_invalid, status);
4336         return 1;
4337     }
4338     return 0;
4339 }
4340
4341 /*----------------------------------------------------------------------------
4342 | Returns 1 if the double-precision floating-point value `a' is equal to the
4343 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4344 | exception.The comparison is performed according to the IEC/IEEE Standard
4345 | for Binary Floating-Point Arithmetic.
4346 *----------------------------------------------------------------------------*/
4347
4348 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4349 {
4350     uint64_t av, bv;
4351     a = float64_squash_input_denormal(a, status);
4352     b = float64_squash_input_denormal(b, status);
4353
4354     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4355          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4356        ) {
4357         if (float64_is_signaling_nan(a, status)
4358          || float64_is_signaling_nan(b, status)) {
4359             float_raise(float_flag_invalid, status);
4360         }
4361         return 0;
4362     }
4363     av = float64_val(a);
4364     bv = float64_val(b);
4365     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4366
4367 }
4368
4369 /*----------------------------------------------------------------------------
4370 | Returns 1 if the double-precision floating-point value `a' is less than or
4371 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4372 | cause an exception.  Otherwise, the comparison is performed according to the
4373 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4374 *----------------------------------------------------------------------------*/
4375
4376 int float64_le_quiet(float64 a, float64 b, float_status *status)
4377 {
4378     flag aSign, bSign;
4379     uint64_t av, bv;
4380     a = float64_squash_input_denormal(a, status);
4381     b = float64_squash_input_denormal(b, status);
4382
4383     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4384          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4385        ) {
4386         if (float64_is_signaling_nan(a, status)
4387          || float64_is_signaling_nan(b, status)) {
4388             float_raise(float_flag_invalid, status);
4389         }
4390         return 0;
4391     }
4392     aSign = extractFloat64Sign( a );
4393     bSign = extractFloat64Sign( b );
4394     av = float64_val(a);
4395     bv = float64_val(b);
4396     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4397     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4398
4399 }
4400
4401 /*----------------------------------------------------------------------------
4402 | Returns 1 if the double-precision floating-point value `a' is less than
4403 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4404 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4405 | Standard for Binary Floating-Point Arithmetic.
4406 *----------------------------------------------------------------------------*/
4407
4408 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4409 {
4410     flag aSign, bSign;
4411     uint64_t av, bv;
4412     a = float64_squash_input_denormal(a, status);
4413     b = float64_squash_input_denormal(b, status);
4414
4415     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4416          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4417        ) {
4418         if (float64_is_signaling_nan(a, status)
4419          || float64_is_signaling_nan(b, status)) {
4420             float_raise(float_flag_invalid, status);
4421         }
4422         return 0;
4423     }
4424     aSign = extractFloat64Sign( a );
4425     bSign = extractFloat64Sign( b );
4426     av = float64_val(a);
4427     bv = float64_val(b);
4428     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4429     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4430
4431 }
4432
4433 /*----------------------------------------------------------------------------
4434 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4435 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4436 | comparison is performed according to the IEC/IEEE Standard for Binary
4437 | Floating-Point Arithmetic.
4438 *----------------------------------------------------------------------------*/
4439
4440 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4441 {
4442     a = float64_squash_input_denormal(a, status);
4443     b = float64_squash_input_denormal(b, status);
4444
4445     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4446          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4447        ) {
4448         if (float64_is_signaling_nan(a, status)
4449          || float64_is_signaling_nan(b, status)) {
4450             float_raise(float_flag_invalid, status);
4451         }
4452         return 1;
4453     }
4454     return 0;
4455 }
4456
4457 /*----------------------------------------------------------------------------
4458 | Returns the result of converting the extended double-precision floating-
4459 | point value `a' to the 32-bit two's complement integer format.  The
4460 | conversion is performed according to the IEC/IEEE Standard for Binary
4461 | Floating-Point Arithmetic---which means in particular that the conversion
4462 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4463 | largest positive integer is returned.  Otherwise, if the conversion
4464 | overflows, the largest integer with the same sign as `a' is returned.
4465 *----------------------------------------------------------------------------*/
4466
4467 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4468 {
4469     flag aSign;
4470     int32_t aExp, shiftCount;
4471     uint64_t aSig;
4472
4473     if (floatx80_invalid_encoding(a)) {
4474         float_raise(float_flag_invalid, status);
4475         return 1 << 31;
4476     }
4477     aSig = extractFloatx80Frac( a );
4478     aExp = extractFloatx80Exp( a );
4479     aSign = extractFloatx80Sign( a );
4480     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4481     shiftCount = 0x4037 - aExp;
4482     if ( shiftCount <= 0 ) shiftCount = 1;
4483     shift64RightJamming( aSig, shiftCount, &aSig );
4484     return roundAndPackInt32(aSign, aSig, status);
4485
4486 }
4487
4488 /*----------------------------------------------------------------------------
4489 | Returns the result of converting the extended double-precision floating-
4490 | point value `a' to the 32-bit two's complement integer format.  The
4491 | conversion is performed according to the IEC/IEEE Standard for Binary
4492 | Floating-Point Arithmetic, except that the conversion is always rounded
4493 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4494 | Otherwise, if the conversion overflows, the largest integer with the same
4495 | sign as `a' is returned.
4496 *----------------------------------------------------------------------------*/
4497
4498 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4499 {
4500     flag aSign;
4501     int32_t aExp, shiftCount;
4502     uint64_t aSig, savedASig;
4503     int32_t z;
4504
4505     if (floatx80_invalid_encoding(a)) {
4506         float_raise(float_flag_invalid, status);
4507         return 1 << 31;
4508     }
4509     aSig = extractFloatx80Frac( a );
4510     aExp = extractFloatx80Exp( a );
4511     aSign = extractFloatx80Sign( a );
4512     if ( 0x401E < aExp ) {
4513         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4514         goto invalid;
4515     }
4516     else if ( aExp < 0x3FFF ) {
4517         if (aExp || aSig) {
4518             status->float_exception_flags |= float_flag_inexact;
4519         }
4520         return 0;
4521     }
4522     shiftCount = 0x403E - aExp;
4523     savedASig = aSig;
4524     aSig >>= shiftCount;
4525     z = aSig;
4526     if ( aSign ) z = - z;
4527     if ( ( z < 0 ) ^ aSign ) {
4528  invalid:
4529         float_raise(float_flag_invalid, status);
4530         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4531     }
4532     if ( ( aSig<<shiftCount ) != savedASig ) {
4533         status->float_exception_flags |= float_flag_inexact;
4534     }
4535     return z;
4536
4537 }
4538
4539 /*----------------------------------------------------------------------------
4540 | Returns the result of converting the extended double-precision floating-
4541 | point value `a' to the 64-bit two's complement integer format.  The
4542 | conversion is performed according to the IEC/IEEE Standard for Binary
4543 | Floating-Point Arithmetic---which means in particular that the conversion
4544 | is rounded according to the current rounding mode.  If `a' is a NaN,
4545 | the largest positive integer is returned.  Otherwise, if the conversion
4546 | overflows, the largest integer with the same sign as `a' is returned.
4547 *----------------------------------------------------------------------------*/
4548
4549 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4550 {
4551     flag aSign;
4552     int32_t aExp, shiftCount;
4553     uint64_t aSig, aSigExtra;
4554
4555     if (floatx80_invalid_encoding(a)) {
4556         float_raise(float_flag_invalid, status);
4557         return 1ULL << 63;
4558     }
4559     aSig = extractFloatx80Frac( a );
4560     aExp = extractFloatx80Exp( a );
4561     aSign = extractFloatx80Sign( a );
4562     shiftCount = 0x403E - aExp;
4563     if ( shiftCount <= 0 ) {
4564         if ( shiftCount ) {
4565             float_raise(float_flag_invalid, status);
4566             if (!aSign || floatx80_is_any_nan(a)) {
4567                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4568             }
4569             return (int64_t) LIT64( 0x8000000000000000 );
4570         }
4571         aSigExtra = 0;
4572     }
4573     else {
4574         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4575     }
4576     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4577
4578 }
4579
4580 /*----------------------------------------------------------------------------
4581 | Returns the result of converting the extended double-precision floating-
4582 | point value `a' to the 64-bit two's complement integer format.  The
4583 | conversion is performed according to the IEC/IEEE Standard for Binary
4584 | Floating-Point Arithmetic, except that the conversion is always rounded
4585 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4586 | Otherwise, if the conversion overflows, the largest integer with the same
4587 | sign as `a' is returned.
4588 *----------------------------------------------------------------------------*/
4589
4590 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4591 {
4592     flag aSign;
4593     int32_t aExp, shiftCount;
4594     uint64_t aSig;
4595     int64_t z;
4596
4597     if (floatx80_invalid_encoding(a)) {
4598         float_raise(float_flag_invalid, status);
4599         return 1ULL << 63;
4600     }
4601     aSig = extractFloatx80Frac( a );
4602     aExp = extractFloatx80Exp( a );
4603     aSign = extractFloatx80Sign( a );
4604     shiftCount = aExp - 0x403E;
4605     if ( 0 <= shiftCount ) {
4606         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4607         if ( ( a.high != 0xC03E ) || aSig ) {
4608             float_raise(float_flag_invalid, status);
4609             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4610                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4611             }
4612         }
4613         return (int64_t) LIT64( 0x8000000000000000 );
4614     }
4615     else if ( aExp < 0x3FFF ) {
4616         if (aExp | aSig) {
4617             status->float_exception_flags |= float_flag_inexact;
4618         }
4619         return 0;
4620     }
4621     z = aSig>>( - shiftCount );
4622     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4623         status->float_exception_flags |= float_flag_inexact;
4624     }
4625     if ( aSign ) z = - z;
4626     return z;
4627
4628 }
4629
4630 /*----------------------------------------------------------------------------
4631 | Returns the result of converting the extended double-precision floating-
4632 | point value `a' to the single-precision floating-point format.  The
4633 | conversion is performed according to the IEC/IEEE Standard for Binary
4634 | Floating-Point Arithmetic.
4635 *----------------------------------------------------------------------------*/
4636
4637 float32 floatx80_to_float32(floatx80 a, float_status *status)
4638 {
4639     flag aSign;
4640     int32_t aExp;
4641     uint64_t aSig;
4642
4643     if (floatx80_invalid_encoding(a)) {
4644         float_raise(float_flag_invalid, status);
4645         return float32_default_nan(status);
4646     }
4647     aSig = extractFloatx80Frac( a );
4648     aExp = extractFloatx80Exp( a );
4649     aSign = extractFloatx80Sign( a );
4650     if ( aExp == 0x7FFF ) {
4651         if ( (uint64_t) ( aSig<<1 ) ) {
4652             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4653         }
4654         return packFloat32( aSign, 0xFF, 0 );
4655     }
4656     shift64RightJamming( aSig, 33, &aSig );
4657     if ( aExp || aSig ) aExp -= 0x3F81;
4658     return roundAndPackFloat32(aSign, aExp, aSig, status);
4659
4660 }
4661
4662 /*----------------------------------------------------------------------------
4663 | Returns the result of converting the extended double-precision floating-
4664 | point value `a' to the double-precision floating-point format.  The
4665 | conversion is performed according to the IEC/IEEE Standard for Binary
4666 | Floating-Point Arithmetic.
4667 *----------------------------------------------------------------------------*/
4668
4669 float64 floatx80_to_float64(floatx80 a, float_status *status)
4670 {
4671     flag aSign;
4672     int32_t aExp;
4673     uint64_t aSig, zSig;
4674
4675     if (floatx80_invalid_encoding(a)) {
4676         float_raise(float_flag_invalid, status);
4677         return float64_default_nan(status);
4678     }
4679     aSig = extractFloatx80Frac( a );
4680     aExp = extractFloatx80Exp( a );
4681     aSign = extractFloatx80Sign( a );
4682     if ( aExp == 0x7FFF ) {
4683         if ( (uint64_t) ( aSig<<1 ) ) {
4684             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4685         }
4686         return packFloat64( aSign, 0x7FF, 0 );
4687     }
4688     shift64RightJamming( aSig, 1, &zSig );
4689     if ( aExp || aSig ) aExp -= 0x3C01;
4690     return roundAndPackFloat64(aSign, aExp, zSig, status);
4691
4692 }
4693
4694 /*----------------------------------------------------------------------------
4695 | Returns the result of converting the extended double-precision floating-
4696 | point value `a' to the quadruple-precision floating-point format.  The
4697 | conversion is performed according to the IEC/IEEE Standard for Binary
4698 | Floating-Point Arithmetic.
4699 *----------------------------------------------------------------------------*/
4700
4701 float128 floatx80_to_float128(floatx80 a, float_status *status)
4702 {
4703     flag aSign;
4704     int aExp;
4705     uint64_t aSig, zSig0, zSig1;
4706
4707     if (floatx80_invalid_encoding(a)) {
4708         float_raise(float_flag_invalid, status);
4709         return float128_default_nan(status);
4710     }
4711     aSig = extractFloatx80Frac( a );
4712     aExp = extractFloatx80Exp( a );
4713     aSign = extractFloatx80Sign( a );
4714     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4715         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4716     }
4717     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4718     return packFloat128( aSign, aExp, zSig0, zSig1 );
4719
4720 }
4721
4722 /*----------------------------------------------------------------------------
4723 | Rounds the extended double-precision floating-point value `a'
4724 | to the precision provided by floatx80_rounding_precision and returns the
4725 | result as an extended double-precision floating-point value.
4726 | The operation is performed according to the IEC/IEEE Standard for Binary
4727 | Floating-Point Arithmetic.
4728 *----------------------------------------------------------------------------*/
4729
4730 floatx80 floatx80_round(floatx80 a, float_status *status)
4731 {
4732     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4733                                 extractFloatx80Sign(a),
4734                                 extractFloatx80Exp(a),
4735                                 extractFloatx80Frac(a), 0, status);
4736 }
4737
4738 /*----------------------------------------------------------------------------
4739 | Rounds the extended double-precision floating-point value `a' to an integer,
4740 | and returns the result as an extended quadruple-precision floating-point
4741 | value.  The operation is performed according to the IEC/IEEE Standard for
4742 | Binary Floating-Point Arithmetic.
4743 *----------------------------------------------------------------------------*/
4744
4745 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4746 {
4747     flag aSign;
4748     int32_t aExp;
4749     uint64_t lastBitMask, roundBitsMask;
4750     floatx80 z;
4751
4752     if (floatx80_invalid_encoding(a)) {
4753         float_raise(float_flag_invalid, status);
4754         return floatx80_default_nan(status);
4755     }
4756     aExp = extractFloatx80Exp( a );
4757     if ( 0x403E <= aExp ) {
4758         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4759             return propagateFloatx80NaN(a, a, status);
4760         }
4761         return a;
4762     }
4763     if ( aExp < 0x3FFF ) {
4764         if (    ( aExp == 0 )
4765              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4766             return a;
4767         }
4768         status->float_exception_flags |= float_flag_inexact;
4769         aSign = extractFloatx80Sign( a );
4770         switch (status->float_rounding_mode) {
4771          case float_round_nearest_even:
4772             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4773                ) {
4774                 return
4775                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4776             }
4777             break;
4778         case float_round_ties_away:
4779             if (aExp == 0x3FFE) {
4780                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4781             }
4782             break;
4783          case float_round_down:
4784             return
4785                   aSign ?
4786                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4787                 : packFloatx80( 0, 0, 0 );
4788          case float_round_up:
4789             return
4790                   aSign ? packFloatx80( 1, 0, 0 )
4791                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4792         }
4793         return packFloatx80( aSign, 0, 0 );
4794     }
4795     lastBitMask = 1;
4796     lastBitMask <<= 0x403E - aExp;
4797     roundBitsMask = lastBitMask - 1;
4798     z = a;
4799     switch (status->float_rounding_mode) {
4800     case float_round_nearest_even:
4801         z.low += lastBitMask>>1;
4802         if ((z.low & roundBitsMask) == 0) {
4803             z.low &= ~lastBitMask;
4804         }
4805         break;
4806     case float_round_ties_away:
4807         z.low += lastBitMask >> 1;
4808         break;
4809     case float_round_to_zero:
4810         break;
4811     case float_round_up:
4812         if (!extractFloatx80Sign(z)) {
4813             z.low += roundBitsMask;
4814         }
4815         break;
4816     case float_round_down:
4817         if (extractFloatx80Sign(z)) {
4818             z.low += roundBitsMask;
4819         }
4820         break;
4821     default:
4822         abort();
4823     }
4824     z.low &= ~ roundBitsMask;
4825     if ( z.low == 0 ) {
4826         ++z.high;
4827         z.low = LIT64( 0x8000000000000000 );
4828     }
4829     if (z.low != a.low) {
4830         status->float_exception_flags |= float_flag_inexact;
4831     }
4832     return z;
4833
4834 }
4835
4836 /*----------------------------------------------------------------------------
4837 | Returns the result of adding the absolute values of the extended double-
4838 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4839 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4840 | The addition is performed according to the IEC/IEEE Standard for Binary
4841 | Floating-Point Arithmetic.
4842 *----------------------------------------------------------------------------*/
4843
4844 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4845                                 float_status *status)
4846 {
4847     int32_t aExp, bExp, zExp;
4848     uint64_t aSig, bSig, zSig0, zSig1;
4849     int32_t expDiff;
4850
4851     aSig = extractFloatx80Frac( a );
4852     aExp = extractFloatx80Exp( a );
4853     bSig = extractFloatx80Frac( b );
4854     bExp = extractFloatx80Exp( b );
4855     expDiff = aExp - bExp;
4856     if ( 0 < expDiff ) {
4857         if ( aExp == 0x7FFF ) {
4858             if ((uint64_t)(aSig << 1)) {
4859                 return propagateFloatx80NaN(a, b, status);
4860             }
4861             return a;
4862         }
4863         if ( bExp == 0 ) --expDiff;
4864         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4865         zExp = aExp;
4866     }
4867     else if ( expDiff < 0 ) {
4868         if ( bExp == 0x7FFF ) {
4869             if ((uint64_t)(bSig << 1)) {
4870                 return propagateFloatx80NaN(a, b, status);
4871             }
4872             return packFloatx80(zSign,
4873                                 floatx80_infinity_high,
4874                                 floatx80_infinity_low);
4875         }
4876         if ( aExp == 0 ) ++expDiff;
4877         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4878         zExp = bExp;
4879     }
4880     else {
4881         if ( aExp == 0x7FFF ) {
4882             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4883                 return propagateFloatx80NaN(a, b, status);
4884             }
4885             return a;
4886         }
4887         zSig1 = 0;
4888         zSig0 = aSig + bSig;
4889         if ( aExp == 0 ) {
4890             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4891             goto roundAndPack;
4892         }
4893         zExp = aExp;
4894         goto shiftRight1;
4895     }
4896     zSig0 = aSig + bSig;
4897     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4898  shiftRight1:
4899     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4900     zSig0 |= LIT64( 0x8000000000000000 );
4901     ++zExp;
4902  roundAndPack:
4903     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4904                                 zSign, zExp, zSig0, zSig1, status);
4905 }
4906
4907 /*----------------------------------------------------------------------------
4908 | Returns the result of subtracting the absolute values of the extended
4909 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4910 | difference is negated before being returned.  `zSign' is ignored if the
4911 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4912 | Standard for Binary Floating-Point Arithmetic.
4913 *----------------------------------------------------------------------------*/
4914
4915 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4916                                 float_status *status)
4917 {
4918     int32_t aExp, bExp, zExp;
4919     uint64_t aSig, bSig, zSig0, zSig1;
4920     int32_t expDiff;
4921
4922     aSig = extractFloatx80Frac( a );
4923     aExp = extractFloatx80Exp( a );
4924     bSig = extractFloatx80Frac( b );
4925     bExp = extractFloatx80Exp( b );
4926     expDiff = aExp - bExp;
4927     if ( 0 < expDiff ) goto aExpBigger;
4928     if ( expDiff < 0 ) goto bExpBigger;
4929     if ( aExp == 0x7FFF ) {
4930         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4931             return propagateFloatx80NaN(a, b, status);
4932         }
4933         float_raise(float_flag_invalid, status);
4934         return floatx80_default_nan(status);
4935     }
4936     if ( aExp == 0 ) {
4937         aExp = 1;
4938         bExp = 1;
4939     }
4940     zSig1 = 0;
4941     if ( bSig < aSig ) goto aBigger;
4942     if ( aSig < bSig ) goto bBigger;
4943     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4944  bExpBigger:
4945     if ( bExp == 0x7FFF ) {
4946         if ((uint64_t)(bSig << 1)) {
4947             return propagateFloatx80NaN(a, b, status);
4948         }
4949         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4950                             floatx80_infinity_low);
4951     }
4952     if ( aExp == 0 ) ++expDiff;
4953     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4954  bBigger:
4955     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4956     zExp = bExp;
4957     zSign ^= 1;
4958     goto normalizeRoundAndPack;
4959  aExpBigger:
4960     if ( aExp == 0x7FFF ) {
4961         if ((uint64_t)(aSig << 1)) {
4962             return propagateFloatx80NaN(a, b, status);
4963         }
4964         return a;
4965     }
4966     if ( bExp == 0 ) --expDiff;
4967     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4968  aBigger:
4969     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4970     zExp = aExp;
4971  normalizeRoundAndPack:
4972     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4973                                          zSign, zExp, zSig0, zSig1, status);
4974 }
4975
4976 /*----------------------------------------------------------------------------
4977 | Returns the result of adding the extended double-precision floating-point
4978 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4979 | Standard for Binary Floating-Point Arithmetic.
4980 *----------------------------------------------------------------------------*/
4981
4982 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4983 {
4984     flag aSign, bSign;
4985
4986     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4987         float_raise(float_flag_invalid, status);
4988         return floatx80_default_nan(status);
4989     }
4990     aSign = extractFloatx80Sign( a );
4991     bSign = extractFloatx80Sign( b );
4992     if ( aSign == bSign ) {
4993         return addFloatx80Sigs(a, b, aSign, status);
4994     }
4995     else {
4996         return subFloatx80Sigs(a, b, aSign, status);
4997     }
4998
4999 }
5000
5001 /*----------------------------------------------------------------------------
5002 | Returns the result of subtracting the extended double-precision floating-
5003 | point values `a' and `b'.  The operation is performed according to the
5004 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5005 *----------------------------------------------------------------------------*/
5006
5007 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5008 {
5009     flag aSign, bSign;
5010
5011     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5012         float_raise(float_flag_invalid, status);
5013         return floatx80_default_nan(status);
5014     }
5015     aSign = extractFloatx80Sign( a );
5016     bSign = extractFloatx80Sign( b );
5017     if ( aSign == bSign ) {
5018         return subFloatx80Sigs(a, b, aSign, status);
5019     }
5020     else {
5021         return addFloatx80Sigs(a, b, aSign, status);
5022     }
5023
5024 }
5025
5026 /*----------------------------------------------------------------------------
5027 | Returns the result of multiplying the extended double-precision floating-
5028 | point values `a' and `b'.  The operation is performed according to the
5029 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5030 *----------------------------------------------------------------------------*/
5031
5032 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5033 {
5034     flag aSign, bSign, zSign;
5035     int32_t aExp, bExp, zExp;
5036     uint64_t aSig, bSig, zSig0, zSig1;
5037
5038     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5039         float_raise(float_flag_invalid, status);
5040         return floatx80_default_nan(status);
5041     }
5042     aSig = extractFloatx80Frac( a );
5043     aExp = extractFloatx80Exp( a );
5044     aSign = extractFloatx80Sign( a );
5045     bSig = extractFloatx80Frac( b );
5046     bExp = extractFloatx80Exp( b );
5047     bSign = extractFloatx80Sign( b );
5048     zSign = aSign ^ bSign;
5049     if ( aExp == 0x7FFF ) {
5050         if (    (uint64_t) ( aSig<<1 )
5051              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5052             return propagateFloatx80NaN(a, b, status);
5053         }
5054         if ( ( bExp | bSig ) == 0 ) goto invalid;
5055         return packFloatx80(zSign, floatx80_infinity_high,
5056                                    floatx80_infinity_low);
5057     }
5058     if ( bExp == 0x7FFF ) {
5059         if ((uint64_t)(bSig << 1)) {
5060             return propagateFloatx80NaN(a, b, status);
5061         }
5062         if ( ( aExp | aSig ) == 0 ) {
5063  invalid:
5064             float_raise(float_flag_invalid, status);
5065             return floatx80_default_nan(status);
5066         }
5067         return packFloatx80(zSign, floatx80_infinity_high,
5068                                    floatx80_infinity_low);
5069     }
5070     if ( aExp == 0 ) {
5071         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5072         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5073     }
5074     if ( bExp == 0 ) {
5075         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5076         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5077     }
5078     zExp = aExp + bExp - 0x3FFE;
5079     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5080     if ( 0 < (int64_t) zSig0 ) {
5081         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5082         --zExp;
5083     }
5084     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5085                                 zSign, zExp, zSig0, zSig1, status);
5086 }
5087
5088 /*----------------------------------------------------------------------------
5089 | Returns the result of dividing the extended double-precision floating-point
5090 | value `a' by the corresponding value `b'.  The operation is performed
5091 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5092 *----------------------------------------------------------------------------*/
5093
5094 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5095 {
5096     flag aSign, bSign, zSign;
5097     int32_t aExp, bExp, zExp;
5098     uint64_t aSig, bSig, zSig0, zSig1;
5099     uint64_t rem0, rem1, rem2, term0, term1, term2;
5100
5101     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5102         float_raise(float_flag_invalid, status);
5103         return floatx80_default_nan(status);
5104     }
5105     aSig = extractFloatx80Frac( a );
5106     aExp = extractFloatx80Exp( a );
5107     aSign = extractFloatx80Sign( a );
5108     bSig = extractFloatx80Frac( b );
5109     bExp = extractFloatx80Exp( b );
5110     bSign = extractFloatx80Sign( b );
5111     zSign = aSign ^ bSign;
5112     if ( aExp == 0x7FFF ) {
5113         if ((uint64_t)(aSig << 1)) {
5114             return propagateFloatx80NaN(a, b, status);
5115         }
5116         if ( bExp == 0x7FFF ) {
5117             if ((uint64_t)(bSig << 1)) {
5118                 return propagateFloatx80NaN(a, b, status);
5119             }
5120             goto invalid;
5121         }
5122         return packFloatx80(zSign, floatx80_infinity_high,
5123                                    floatx80_infinity_low);
5124     }
5125     if ( bExp == 0x7FFF ) {
5126         if ((uint64_t)(bSig << 1)) {
5127             return propagateFloatx80NaN(a, b, status);
5128         }
5129         return packFloatx80( zSign, 0, 0 );
5130     }
5131     if ( bExp == 0 ) {
5132         if ( bSig == 0 ) {
5133             if ( ( aExp | aSig ) == 0 ) {
5134  invalid:
5135                 float_raise(float_flag_invalid, status);
5136                 return floatx80_default_nan(status);
5137             }
5138             float_raise(float_flag_divbyzero, status);
5139             return packFloatx80(zSign, floatx80_infinity_high,
5140                                        floatx80_infinity_low);
5141         }
5142         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5143     }
5144     if ( aExp == 0 ) {
5145         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5146         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5147     }
5148     zExp = aExp - bExp + 0x3FFE;
5149     rem1 = 0;
5150     if ( bSig <= aSig ) {
5151         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5152         ++zExp;
5153     }
5154     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5155     mul64To128( bSig, zSig0, &term0, &term1 );
5156     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5157     while ( (int64_t) rem0 < 0 ) {
5158         --zSig0;
5159         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5160     }
5161     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5162     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5163         mul64To128( bSig, zSig1, &term1, &term2 );
5164         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5165         while ( (int64_t) rem1 < 0 ) {
5166             --zSig1;
5167             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5168         }
5169         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5170     }
5171     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5172                                 zSign, zExp, zSig0, zSig1, status);
5173 }
5174
5175 /*----------------------------------------------------------------------------
5176 | Returns the remainder of the extended double-precision floating-point value
5177 | `a' with respect to the corresponding value `b'.  The operation is performed
5178 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5179 *----------------------------------------------------------------------------*/
5180
5181 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5182 {
5183     flag aSign, zSign;
5184     int32_t aExp, bExp, expDiff;
5185     uint64_t aSig0, aSig1, bSig;
5186     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5187
5188     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5189         float_raise(float_flag_invalid, status);
5190         return floatx80_default_nan(status);
5191     }
5192     aSig0 = extractFloatx80Frac( a );
5193     aExp = extractFloatx80Exp( a );
5194     aSign = extractFloatx80Sign( a );
5195     bSig = extractFloatx80Frac( b );
5196     bExp = extractFloatx80Exp( b );
5197     if ( aExp == 0x7FFF ) {
5198         if (    (uint64_t) ( aSig0<<1 )
5199              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5200             return propagateFloatx80NaN(a, b, status);
5201         }
5202         goto invalid;
5203     }
5204     if ( bExp == 0x7FFF ) {
5205         if ((uint64_t)(bSig << 1)) {
5206             return propagateFloatx80NaN(a, b, status);
5207         }
5208         return a;
5209     }
5210     if ( bExp == 0 ) {
5211         if ( bSig == 0 ) {
5212  invalid:
5213             float_raise(float_flag_invalid, status);
5214             return floatx80_default_nan(status);
5215         }
5216         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5217     }
5218     if ( aExp == 0 ) {
5219         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5220         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5221     }
5222     bSig |= LIT64( 0x8000000000000000 );
5223     zSign = aSign;
5224     expDiff = aExp - bExp;
5225     aSig1 = 0;
5226     if ( expDiff < 0 ) {
5227         if ( expDiff < -1 ) return a;
5228         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5229         expDiff = 0;
5230     }
5231     q = ( bSig <= aSig0 );
5232     if ( q ) aSig0 -= bSig;
5233     expDiff -= 64;
5234     while ( 0 < expDiff ) {
5235         q = estimateDiv128To64( aSig0, aSig1, bSig );
5236         q = ( 2 < q ) ? q - 2 : 0;
5237         mul64To128( bSig, q, &term0, &term1 );
5238         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5239         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5240         expDiff -= 62;
5241     }
5242     expDiff += 64;
5243     if ( 0 < expDiff ) {
5244         q = estimateDiv128To64( aSig0, aSig1, bSig );
5245         q = ( 2 < q ) ? q - 2 : 0;
5246         q >>= 64 - expDiff;
5247         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5248         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5249         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5250         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5251             ++q;
5252             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5253         }
5254     }
5255     else {
5256         term1 = 0;
5257         term0 = bSig;
5258     }
5259     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5260     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5261          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5262               && ( q & 1 ) )
5263        ) {
5264         aSig0 = alternateASig0;
5265         aSig1 = alternateASig1;
5266         zSign = ! zSign;
5267     }
5268     return
5269         normalizeRoundAndPackFloatx80(
5270             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5271
5272 }
5273
5274 /*----------------------------------------------------------------------------
5275 | Returns the square root of the extended double-precision floating-point
5276 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5277 | for Binary Floating-Point Arithmetic.
5278 *----------------------------------------------------------------------------*/
5279
5280 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5281 {
5282     flag aSign;
5283     int32_t aExp, zExp;
5284     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5285     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5286
5287     if (floatx80_invalid_encoding(a)) {
5288         float_raise(float_flag_invalid, status);
5289         return floatx80_default_nan(status);
5290     }
5291     aSig0 = extractFloatx80Frac( a );
5292     aExp = extractFloatx80Exp( a );
5293     aSign = extractFloatx80Sign( a );
5294     if ( aExp == 0x7FFF ) {
5295         if ((uint64_t)(aSig0 << 1)) {
5296             return propagateFloatx80NaN(a, a, status);
5297         }
5298         if ( ! aSign ) return a;
5299         goto invalid;
5300     }
5301     if ( aSign ) {
5302         if ( ( aExp | aSig0 ) == 0 ) return a;
5303  invalid:
5304         float_raise(float_flag_invalid, status);
5305         return floatx80_default_nan(status);
5306     }
5307     if ( aExp == 0 ) {
5308         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5309         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5310     }
5311     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5312     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5313     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5314     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5315     doubleZSig0 = zSig0<<1;
5316     mul64To128( zSig0, zSig0, &term0, &term1 );
5317     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5318     while ( (int64_t) rem0 < 0 ) {
5319         --zSig0;
5320         doubleZSig0 -= 2;
5321         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5322     }
5323     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5324     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5325         if ( zSig1 == 0 ) zSig1 = 1;
5326         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5327         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5328         mul64To128( zSig1, zSig1, &term2, &term3 );
5329         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5330         while ( (int64_t) rem1 < 0 ) {
5331             --zSig1;
5332             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5333             term3 |= 1;
5334             term2 |= doubleZSig0;
5335             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5336         }
5337         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5338     }
5339     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5340     zSig0 |= doubleZSig0;
5341     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5342                                 0, zExp, zSig0, zSig1, status);
5343 }
5344
5345 /*----------------------------------------------------------------------------
5346 | Returns 1 if the extended double-precision floating-point value `a' is equal
5347 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5348 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5349 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5350 *----------------------------------------------------------------------------*/
5351
5352 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5353 {
5354
5355     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5356         || (extractFloatx80Exp(a) == 0x7FFF
5357             && (uint64_t) (extractFloatx80Frac(a) << 1))
5358         || (extractFloatx80Exp(b) == 0x7FFF
5359             && (uint64_t) (extractFloatx80Frac(b) << 1))
5360        ) {
5361         float_raise(float_flag_invalid, status);
5362         return 0;
5363     }
5364     return
5365            ( a.low == b.low )
5366         && (    ( a.high == b.high )
5367              || (    ( a.low == 0 )
5368                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5369            );
5370
5371 }
5372
5373 /*----------------------------------------------------------------------------
5374 | Returns 1 if the extended double-precision floating-point value `a' is
5375 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5376 | invalid exception is raised if either operand is a NaN.  The comparison is
5377 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5378 | Arithmetic.
5379 *----------------------------------------------------------------------------*/
5380
5381 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5382 {
5383     flag aSign, bSign;
5384
5385     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5386         || (extractFloatx80Exp(a) == 0x7FFF
5387             && (uint64_t) (extractFloatx80Frac(a) << 1))
5388         || (extractFloatx80Exp(b) == 0x7FFF
5389             && (uint64_t) (extractFloatx80Frac(b) << 1))
5390        ) {
5391         float_raise(float_flag_invalid, status);
5392         return 0;
5393     }
5394     aSign = extractFloatx80Sign( a );
5395     bSign = extractFloatx80Sign( b );
5396     if ( aSign != bSign ) {
5397         return
5398                aSign
5399             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5400                  == 0 );
5401     }
5402     return
5403           aSign ? le128( b.high, b.low, a.high, a.low )
5404         : le128( a.high, a.low, b.high, b.low );
5405
5406 }
5407
5408 /*----------------------------------------------------------------------------
5409 | Returns 1 if the extended double-precision floating-point value `a' is
5410 | less than the corresponding value `b', and 0 otherwise.  The invalid
5411 | exception is raised if either operand is a NaN.  The comparison is performed
5412 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5413 *----------------------------------------------------------------------------*/
5414
5415 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5416 {
5417     flag aSign, bSign;
5418
5419     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5420         || (extractFloatx80Exp(a) == 0x7FFF
5421             && (uint64_t) (extractFloatx80Frac(a) << 1))
5422         || (extractFloatx80Exp(b) == 0x7FFF
5423             && (uint64_t) (extractFloatx80Frac(b) << 1))
5424        ) {
5425         float_raise(float_flag_invalid, status);
5426         return 0;
5427     }
5428     aSign = extractFloatx80Sign( a );
5429     bSign = extractFloatx80Sign( b );
5430     if ( aSign != bSign ) {
5431         return
5432                aSign
5433             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5434                  != 0 );
5435     }
5436     return
5437           aSign ? lt128( b.high, b.low, a.high, a.low )
5438         : lt128( a.high, a.low, b.high, b.low );
5439
5440 }
5441
5442 /*----------------------------------------------------------------------------
5443 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5444 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5445 | either operand is a NaN.   The comparison is performed according to the
5446 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5447 *----------------------------------------------------------------------------*/
5448 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5449 {
5450     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5451         || (extractFloatx80Exp(a) == 0x7FFF
5452             && (uint64_t) (extractFloatx80Frac(a) << 1))
5453         || (extractFloatx80Exp(b) == 0x7FFF
5454             && (uint64_t) (extractFloatx80Frac(b) << 1))
5455        ) {
5456         float_raise(float_flag_invalid, status);
5457         return 1;
5458     }
5459     return 0;
5460 }
5461
5462 /*----------------------------------------------------------------------------
5463 | Returns 1 if the extended double-precision floating-point value `a' is
5464 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5465 | cause an exception.  The comparison is performed according to the IEC/IEEE
5466 | Standard for Binary Floating-Point Arithmetic.
5467 *----------------------------------------------------------------------------*/
5468
5469 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5470 {
5471
5472     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5473         float_raise(float_flag_invalid, status);
5474         return 0;
5475     }
5476     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5477               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5478          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5479               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5480        ) {
5481         if (floatx80_is_signaling_nan(a, status)
5482          || floatx80_is_signaling_nan(b, status)) {
5483             float_raise(float_flag_invalid, status);
5484         }
5485         return 0;
5486     }
5487     return
5488            ( a.low == b.low )
5489         && (    ( a.high == b.high )
5490              || (    ( a.low == 0 )
5491                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5492            );
5493
5494 }
5495
5496 /*----------------------------------------------------------------------------
5497 | Returns 1 if the extended double-precision floating-point value `a' is less
5498 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5499 | do not cause an exception.  Otherwise, the comparison is performed according
5500 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5501 *----------------------------------------------------------------------------*/
5502
5503 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5504 {
5505     flag aSign, bSign;
5506
5507     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5508         float_raise(float_flag_invalid, status);
5509         return 0;
5510     }
5511     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5512               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5513          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5514               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5515        ) {
5516         if (floatx80_is_signaling_nan(a, status)
5517          || floatx80_is_signaling_nan(b, status)) {
5518             float_raise(float_flag_invalid, status);
5519         }
5520         return 0;
5521     }
5522     aSign = extractFloatx80Sign( a );
5523     bSign = extractFloatx80Sign( b );
5524     if ( aSign != bSign ) {
5525         return
5526                aSign
5527             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5528                  == 0 );
5529     }
5530     return
5531           aSign ? le128( b.high, b.low, a.high, a.low )
5532         : le128( a.high, a.low, b.high, b.low );
5533
5534 }
5535
5536 /*----------------------------------------------------------------------------
5537 | Returns 1 if the extended double-precision floating-point value `a' is less
5538 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5539 | an exception.  Otherwise, the comparison is performed according to the
5540 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5541 *----------------------------------------------------------------------------*/
5542
5543 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5544 {
5545     flag aSign, bSign;
5546
5547     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5548         float_raise(float_flag_invalid, status);
5549         return 0;
5550     }
5551     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5552               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5553          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5554               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5555        ) {
5556         if (floatx80_is_signaling_nan(a, status)
5557          || floatx80_is_signaling_nan(b, status)) {
5558             float_raise(float_flag_invalid, status);
5559         }
5560         return 0;
5561     }
5562     aSign = extractFloatx80Sign( a );
5563     bSign = extractFloatx80Sign( b );
5564     if ( aSign != bSign ) {
5565         return
5566                aSign
5567             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5568                  != 0 );
5569     }
5570     return
5571           aSign ? lt128( b.high, b.low, a.high, a.low )
5572         : lt128( a.high, a.low, b.high, b.low );
5573
5574 }
5575
5576 /*----------------------------------------------------------------------------
5577 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5578 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5579 | The comparison is performed according to the IEC/IEEE Standard for Binary
5580 | Floating-Point Arithmetic.
5581 *----------------------------------------------------------------------------*/
5582 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5583 {
5584     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5585         float_raise(float_flag_invalid, status);
5586         return 1;
5587     }
5588     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5589               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5590          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5591               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5592        ) {
5593         if (floatx80_is_signaling_nan(a, status)
5594          || floatx80_is_signaling_nan(b, status)) {
5595             float_raise(float_flag_invalid, status);
5596         }
5597         return 1;
5598     }
5599     return 0;
5600 }
5601
5602 /*----------------------------------------------------------------------------
5603 | Returns the result of converting the quadruple-precision floating-point
5604 | value `a' to the 32-bit two's complement integer format.  The conversion
5605 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5606 | Arithmetic---which means in particular that the conversion is rounded
5607 | according to the current rounding mode.  If `a' is a NaN, the largest
5608 | positive integer is returned.  Otherwise, if the conversion overflows, the
5609 | largest integer with the same sign as `a' is returned.
5610 *----------------------------------------------------------------------------*/
5611
5612 int32_t float128_to_int32(float128 a, float_status *status)
5613 {
5614     flag aSign;
5615     int32_t aExp, shiftCount;
5616     uint64_t aSig0, aSig1;
5617
5618     aSig1 = extractFloat128Frac1( a );
5619     aSig0 = extractFloat128Frac0( a );
5620     aExp = extractFloat128Exp( a );
5621     aSign = extractFloat128Sign( a );
5622     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5623     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5624     aSig0 |= ( aSig1 != 0 );
5625     shiftCount = 0x4028 - aExp;
5626     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5627     return roundAndPackInt32(aSign, aSig0, status);
5628
5629 }
5630
5631 /*----------------------------------------------------------------------------
5632 | Returns the result of converting the quadruple-precision floating-point
5633 | value `a' to the 32-bit two's complement integer format.  The conversion
5634 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5635 | Arithmetic, except that the conversion is always rounded toward zero.  If
5636 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5637 | conversion overflows, the largest integer with the same sign as `a' is
5638 | returned.
5639 *----------------------------------------------------------------------------*/
5640
5641 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5642 {
5643     flag aSign;
5644     int32_t aExp, shiftCount;
5645     uint64_t aSig0, aSig1, savedASig;
5646     int32_t z;
5647
5648     aSig1 = extractFloat128Frac1( a );
5649     aSig0 = extractFloat128Frac0( a );
5650     aExp = extractFloat128Exp( a );
5651     aSign = extractFloat128Sign( a );
5652     aSig0 |= ( aSig1 != 0 );
5653     if ( 0x401E < aExp ) {
5654         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5655         goto invalid;
5656     }
5657     else if ( aExp < 0x3FFF ) {
5658         if (aExp || aSig0) {
5659             status->float_exception_flags |= float_flag_inexact;
5660         }
5661         return 0;
5662     }
5663     aSig0 |= LIT64( 0x0001000000000000 );
5664     shiftCount = 0x402F - aExp;
5665     savedASig = aSig0;
5666     aSig0 >>= shiftCount;
5667     z = aSig0;
5668     if ( aSign ) z = - z;
5669     if ( ( z < 0 ) ^ aSign ) {
5670  invalid:
5671         float_raise(float_flag_invalid, status);
5672         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5673     }
5674     if ( ( aSig0<<shiftCount ) != savedASig ) {
5675         status->float_exception_flags |= float_flag_inexact;
5676     }
5677     return z;
5678
5679 }
5680
5681 /*----------------------------------------------------------------------------
5682 | Returns the result of converting the quadruple-precision floating-point
5683 | value `a' to the 64-bit two's complement integer format.  The conversion
5684 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5685 | Arithmetic---which means in particular that the conversion is rounded
5686 | according to the current rounding mode.  If `a' is a NaN, the largest
5687 | positive integer is returned.  Otherwise, if the conversion overflows, the
5688 | largest integer with the same sign as `a' is returned.
5689 *----------------------------------------------------------------------------*/
5690
5691 int64_t float128_to_int64(float128 a, float_status *status)
5692 {
5693     flag aSign;
5694     int32_t aExp, shiftCount;
5695     uint64_t aSig0, aSig1;
5696
5697     aSig1 = extractFloat128Frac1( a );
5698     aSig0 = extractFloat128Frac0( a );
5699     aExp = extractFloat128Exp( a );
5700     aSign = extractFloat128Sign( a );
5701     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5702     shiftCount = 0x402F - aExp;
5703     if ( shiftCount <= 0 ) {
5704         if ( 0x403E < aExp ) {
5705             float_raise(float_flag_invalid, status);
5706             if (    ! aSign
5707                  || (    ( aExp == 0x7FFF )
5708                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5709                     )
5710                ) {
5711                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5712             }
5713             return (int64_t) LIT64( 0x8000000000000000 );
5714         }
5715         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5716     }
5717     else {
5718         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5719     }
5720     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5721
5722 }
5723
5724 /*----------------------------------------------------------------------------
5725 | Returns the result of converting the quadruple-precision floating-point
5726 | value `a' to the 64-bit two's complement integer format.  The conversion
5727 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5728 | Arithmetic, except that the conversion is always rounded toward zero.
5729 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5730 | the conversion overflows, the largest integer with the same sign as `a' is
5731 | returned.
5732 *----------------------------------------------------------------------------*/
5733
5734 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5735 {
5736     flag aSign;
5737     int32_t aExp, shiftCount;
5738     uint64_t aSig0, aSig1;
5739     int64_t z;
5740
5741     aSig1 = extractFloat128Frac1( a );
5742     aSig0 = extractFloat128Frac0( a );
5743     aExp = extractFloat128Exp( a );
5744     aSign = extractFloat128Sign( a );
5745     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5746     shiftCount = aExp - 0x402F;
5747     if ( 0 < shiftCount ) {
5748         if ( 0x403E <= aExp ) {
5749             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5750             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5751                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5752                 if (aSig1) {
5753                     status->float_exception_flags |= float_flag_inexact;
5754                 }
5755             }
5756             else {
5757                 float_raise(float_flag_invalid, status);
5758                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5759                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5760                 }
5761             }
5762             return (int64_t) LIT64( 0x8000000000000000 );
5763         }
5764         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5765         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5766             status->float_exception_flags |= float_flag_inexact;
5767         }
5768     }
5769     else {
5770         if ( aExp < 0x3FFF ) {
5771             if ( aExp | aSig0 | aSig1 ) {
5772                 status->float_exception_flags |= float_flag_inexact;
5773             }
5774             return 0;
5775         }
5776         z = aSig0>>( - shiftCount );
5777         if (    aSig1
5778              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5779             status->float_exception_flags |= float_flag_inexact;
5780         }
5781     }
5782     if ( aSign ) z = - z;
5783     return z;
5784
5785 }
5786
5787 /*----------------------------------------------------------------------------
5788 | Returns the result of converting the quadruple-precision floating-point value
5789 | `a' to the 64-bit unsigned integer format.  The conversion is
5790 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5791 | Arithmetic---which means in particular that the conversion is rounded
5792 | according to the current rounding mode.  If `a' is a NaN, the largest
5793 | positive integer is returned.  If the conversion overflows, the
5794 | largest unsigned integer is returned.  If 'a' is negative, the value is
5795 | rounded and zero is returned; negative values that do not round to zero
5796 | will raise the inexact exception.
5797 *----------------------------------------------------------------------------*/
5798
5799 uint64_t float128_to_uint64(float128 a, float_status *status)
5800 {
5801     flag aSign;
5802     int aExp;
5803     int shiftCount;
5804     uint64_t aSig0, aSig1;
5805
5806     aSig0 = extractFloat128Frac0(a);
5807     aSig1 = extractFloat128Frac1(a);
5808     aExp = extractFloat128Exp(a);
5809     aSign = extractFloat128Sign(a);
5810     if (aSign && (aExp > 0x3FFE)) {
5811         float_raise(float_flag_invalid, status);
5812         if (float128_is_any_nan(a)) {
5813             return LIT64(0xFFFFFFFFFFFFFFFF);
5814         } else {
5815             return 0;
5816         }
5817     }
5818     if (aExp) {
5819         aSig0 |= LIT64(0x0001000000000000);
5820     }
5821     shiftCount = 0x402F - aExp;
5822     if (shiftCount <= 0) {
5823         if (0x403E < aExp) {
5824             float_raise(float_flag_invalid, status);
5825             return LIT64(0xFFFFFFFFFFFFFFFF);
5826         }
5827         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5828     } else {
5829         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5830     }
5831     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5832 }
5833
5834 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5835 {
5836     uint64_t v;
5837     signed char current_rounding_mode = status->float_rounding_mode;
5838
5839     set_float_rounding_mode(float_round_to_zero, status);
5840     v = float128_to_uint64(a, status);
5841     set_float_rounding_mode(current_rounding_mode, status);
5842
5843     return v;
5844 }
5845
5846 /*----------------------------------------------------------------------------
5847 | Returns the result of converting the quadruple-precision floating-point
5848 | value `a' to the 32-bit unsigned integer format.  The conversion
5849 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5850 | Arithmetic except that the conversion is always rounded toward zero.
5851 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5852 | if the conversion overflows, the largest unsigned integer is returned.
5853 | If 'a' is negative, the value is rounded and zero is returned; negative
5854 | values that do not round to zero will raise the inexact exception.
5855 *----------------------------------------------------------------------------*/
5856
5857 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5858 {
5859     uint64_t v;
5860     uint32_t res;
5861     int old_exc_flags = get_float_exception_flags(status);
5862
5863     v = float128_to_uint64_round_to_zero(a, status);
5864     if (v > 0xffffffff) {
5865         res = 0xffffffff;
5866     } else {
5867         return v;
5868     }
5869     set_float_exception_flags(old_exc_flags, status);
5870     float_raise(float_flag_invalid, status);
5871     return res;
5872 }
5873
5874 /*----------------------------------------------------------------------------
5875 | Returns the result of converting the quadruple-precision floating-point
5876 | value `a' to the single-precision floating-point format.  The conversion
5877 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5878 | Arithmetic.
5879 *----------------------------------------------------------------------------*/
5880
5881 float32 float128_to_float32(float128 a, float_status *status)
5882 {
5883     flag aSign;
5884     int32_t aExp;
5885     uint64_t aSig0, aSig1;
5886     uint32_t zSig;
5887
5888     aSig1 = extractFloat128Frac1( a );
5889     aSig0 = extractFloat128Frac0( a );
5890     aExp = extractFloat128Exp( a );
5891     aSign = extractFloat128Sign( a );
5892     if ( aExp == 0x7FFF ) {
5893         if ( aSig0 | aSig1 ) {
5894             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5895         }
5896         return packFloat32( aSign, 0xFF, 0 );
5897     }
5898     aSig0 |= ( aSig1 != 0 );
5899     shift64RightJamming( aSig0, 18, &aSig0 );
5900     zSig = aSig0;
5901     if ( aExp || zSig ) {
5902         zSig |= 0x40000000;
5903         aExp -= 0x3F81;
5904     }
5905     return roundAndPackFloat32(aSign, aExp, zSig, status);
5906
5907 }
5908
5909 /*----------------------------------------------------------------------------
5910 | Returns the result of converting the quadruple-precision floating-point
5911 | value `a' to the double-precision floating-point format.  The conversion
5912 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5913 | Arithmetic.
5914 *----------------------------------------------------------------------------*/
5915
5916 float64 float128_to_float64(float128 a, float_status *status)
5917 {
5918     flag aSign;
5919     int32_t aExp;
5920     uint64_t aSig0, aSig1;
5921
5922     aSig1 = extractFloat128Frac1( a );
5923     aSig0 = extractFloat128Frac0( a );
5924     aExp = extractFloat128Exp( a );
5925     aSign = extractFloat128Sign( a );
5926     if ( aExp == 0x7FFF ) {
5927         if ( aSig0 | aSig1 ) {
5928             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5929         }
5930         return packFloat64( aSign, 0x7FF, 0 );
5931     }
5932     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5933     aSig0 |= ( aSig1 != 0 );
5934     if ( aExp || aSig0 ) {
5935         aSig0 |= LIT64( 0x4000000000000000 );
5936         aExp -= 0x3C01;
5937     }
5938     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5939
5940 }
5941
5942 /*----------------------------------------------------------------------------
5943 | Returns the result of converting the quadruple-precision floating-point
5944 | value `a' to the extended double-precision floating-point format.  The
5945 | conversion is performed according to the IEC/IEEE Standard for Binary
5946 | Floating-Point Arithmetic.
5947 *----------------------------------------------------------------------------*/
5948
5949 floatx80 float128_to_floatx80(float128 a, float_status *status)
5950 {
5951     flag aSign;
5952     int32_t aExp;
5953     uint64_t aSig0, aSig1;
5954
5955     aSig1 = extractFloat128Frac1( a );
5956     aSig0 = extractFloat128Frac0( a );
5957     aExp = extractFloat128Exp( a );
5958     aSign = extractFloat128Sign( a );
5959     if ( aExp == 0x7FFF ) {
5960         if ( aSig0 | aSig1 ) {
5961             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5962         }
5963         return packFloatx80(aSign, floatx80_infinity_high,
5964                                    floatx80_infinity_low);
5965     }
5966     if ( aExp == 0 ) {
5967         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5968         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5969     }
5970     else {
5971         aSig0 |= LIT64( 0x0001000000000000 );
5972     }
5973     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5974     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5975
5976 }
5977
5978 /*----------------------------------------------------------------------------
5979 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5980 | returns the result as a quadruple-precision floating-point value.  The
5981 | operation is performed according to the IEC/IEEE Standard for Binary
5982 | Floating-Point Arithmetic.
5983 *----------------------------------------------------------------------------*/
5984
5985 float128 float128_round_to_int(float128 a, float_status *status)
5986 {
5987     flag aSign;
5988     int32_t aExp;
5989     uint64_t lastBitMask, roundBitsMask;
5990     float128 z;
5991
5992     aExp = extractFloat128Exp( a );
5993     if ( 0x402F <= aExp ) {
5994         if ( 0x406F <= aExp ) {
5995             if (    ( aExp == 0x7FFF )
5996                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5997                ) {
5998                 return propagateFloat128NaN(a, a, status);
5999             }
6000             return a;
6001         }
6002         lastBitMask = 1;
6003         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6004         roundBitsMask = lastBitMask - 1;
6005         z = a;
6006         switch (status->float_rounding_mode) {
6007         case float_round_nearest_even:
6008             if ( lastBitMask ) {
6009                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6010                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6011             }
6012             else {
6013                 if ( (int64_t) z.low < 0 ) {
6014                     ++z.high;
6015                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6016                 }
6017             }
6018             break;
6019         case float_round_ties_away:
6020             if (lastBitMask) {
6021                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6022             } else {
6023                 if ((int64_t) z.low < 0) {
6024                     ++z.high;
6025                 }
6026             }
6027             break;
6028         case float_round_to_zero:
6029             break;
6030         case float_round_up:
6031             if (!extractFloat128Sign(z)) {
6032                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6033             }
6034             break;
6035         case float_round_down:
6036             if (extractFloat128Sign(z)) {
6037                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6038             }
6039             break;
6040         default:
6041             abort();
6042         }
6043         z.low &= ~ roundBitsMask;
6044     }
6045     else {
6046         if ( aExp < 0x3FFF ) {
6047             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6048             status->float_exception_flags |= float_flag_inexact;
6049             aSign = extractFloat128Sign( a );
6050             switch (status->float_rounding_mode) {
6051              case float_round_nearest_even:
6052                 if (    ( aExp == 0x3FFE )
6053                      && (   extractFloat128Frac0( a )
6054                           | extractFloat128Frac1( a ) )
6055                    ) {
6056                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6057                 }
6058                 break;
6059             case float_round_ties_away:
6060                 if (aExp == 0x3FFE) {
6061                     return packFloat128(aSign, 0x3FFF, 0, 0);
6062                 }
6063                 break;
6064              case float_round_down:
6065                 return
6066                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6067                     : packFloat128( 0, 0, 0, 0 );
6068              case float_round_up:
6069                 return
6070                       aSign ? packFloat128( 1, 0, 0, 0 )
6071                     : packFloat128( 0, 0x3FFF, 0, 0 );
6072             }
6073             return packFloat128( aSign, 0, 0, 0 );
6074         }
6075         lastBitMask = 1;
6076         lastBitMask <<= 0x402F - aExp;
6077         roundBitsMask = lastBitMask - 1;
6078         z.low = 0;
6079         z.high = a.high;
6080         switch (status->float_rounding_mode) {
6081         case float_round_nearest_even:
6082             z.high += lastBitMask>>1;
6083             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6084                 z.high &= ~ lastBitMask;
6085             }
6086             break;
6087         case float_round_ties_away:
6088             z.high += lastBitMask>>1;
6089             break;
6090         case float_round_to_zero:
6091             break;
6092         case float_round_up:
6093             if (!extractFloat128Sign(z)) {
6094                 z.high |= ( a.low != 0 );
6095                 z.high += roundBitsMask;
6096             }
6097             break;
6098         case float_round_down:
6099             if (extractFloat128Sign(z)) {
6100                 z.high |= (a.low != 0);
6101                 z.high += roundBitsMask;
6102             }
6103             break;
6104         default:
6105             abort();
6106         }
6107         z.high &= ~ roundBitsMask;
6108     }
6109     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6110         status->float_exception_flags |= float_flag_inexact;
6111     }
6112     return z;
6113
6114 }
6115
6116 /*----------------------------------------------------------------------------
6117 | Returns the result of adding the absolute values of the quadruple-precision
6118 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6119 | before being returned.  `zSign' is ignored if the result is a NaN.
6120 | The addition is performed according to the IEC/IEEE Standard for Binary
6121 | Floating-Point Arithmetic.
6122 *----------------------------------------------------------------------------*/
6123
6124 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6125                                 float_status *status)
6126 {
6127     int32_t aExp, bExp, zExp;
6128     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6129     int32_t expDiff;
6130
6131     aSig1 = extractFloat128Frac1( a );
6132     aSig0 = extractFloat128Frac0( a );
6133     aExp = extractFloat128Exp( a );
6134     bSig1 = extractFloat128Frac1( b );
6135     bSig0 = extractFloat128Frac0( b );
6136     bExp = extractFloat128Exp( b );
6137     expDiff = aExp - bExp;
6138     if ( 0 < expDiff ) {
6139         if ( aExp == 0x7FFF ) {
6140             if (aSig0 | aSig1) {
6141                 return propagateFloat128NaN(a, b, status);
6142             }
6143             return a;
6144         }
6145         if ( bExp == 0 ) {
6146             --expDiff;
6147         }
6148         else {
6149             bSig0 |= LIT64( 0x0001000000000000 );
6150         }
6151         shift128ExtraRightJamming(
6152             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6153         zExp = aExp;
6154     }
6155     else if ( expDiff < 0 ) {
6156         if ( bExp == 0x7FFF ) {
6157             if (bSig0 | bSig1) {
6158                 return propagateFloat128NaN(a, b, status);
6159             }
6160             return packFloat128( zSign, 0x7FFF, 0, 0 );
6161         }
6162         if ( aExp == 0 ) {
6163             ++expDiff;
6164         }
6165         else {
6166             aSig0 |= LIT64( 0x0001000000000000 );
6167         }
6168         shift128ExtraRightJamming(
6169             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6170         zExp = bExp;
6171     }
6172     else {
6173         if ( aExp == 0x7FFF ) {
6174             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6175                 return propagateFloat128NaN(a, b, status);
6176             }
6177             return a;
6178         }
6179         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6180         if ( aExp == 0 ) {
6181             if (status->flush_to_zero) {
6182                 if (zSig0 | zSig1) {
6183                     float_raise(float_flag_output_denormal, status);
6184                 }
6185                 return packFloat128(zSign, 0, 0, 0);
6186             }
6187             return packFloat128( zSign, 0, zSig0, zSig1 );
6188         }
6189         zSig2 = 0;
6190         zSig0 |= LIT64( 0x0002000000000000 );
6191         zExp = aExp;
6192         goto shiftRight1;
6193     }
6194     aSig0 |= LIT64( 0x0001000000000000 );
6195     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6196     --zExp;
6197     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6198     ++zExp;
6199  shiftRight1:
6200     shift128ExtraRightJamming(
6201         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6202  roundAndPack:
6203     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6204
6205 }
6206
6207 /*----------------------------------------------------------------------------
6208 | Returns the result of subtracting the absolute values of the quadruple-
6209 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6210 | difference is negated before being returned.  `zSign' is ignored if the
6211 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6212 | Standard for Binary Floating-Point Arithmetic.
6213 *----------------------------------------------------------------------------*/
6214
6215 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6216                                 float_status *status)
6217 {
6218     int32_t aExp, bExp, zExp;
6219     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6220     int32_t expDiff;
6221
6222     aSig1 = extractFloat128Frac1( a );
6223     aSig0 = extractFloat128Frac0( a );
6224     aExp = extractFloat128Exp( a );
6225     bSig1 = extractFloat128Frac1( b );
6226     bSig0 = extractFloat128Frac0( b );
6227     bExp = extractFloat128Exp( b );
6228     expDiff = aExp - bExp;
6229     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6230     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6231     if ( 0 < expDiff ) goto aExpBigger;
6232     if ( expDiff < 0 ) goto bExpBigger;
6233     if ( aExp == 0x7FFF ) {
6234         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6235             return propagateFloat128NaN(a, b, status);
6236         }
6237         float_raise(float_flag_invalid, status);
6238         return float128_default_nan(status);
6239     }
6240     if ( aExp == 0 ) {
6241         aExp = 1;
6242         bExp = 1;
6243     }
6244     if ( bSig0 < aSig0 ) goto aBigger;
6245     if ( aSig0 < bSig0 ) goto bBigger;
6246     if ( bSig1 < aSig1 ) goto aBigger;
6247     if ( aSig1 < bSig1 ) goto bBigger;
6248     return packFloat128(status->float_rounding_mode == float_round_down,
6249                         0, 0, 0);
6250  bExpBigger:
6251     if ( bExp == 0x7FFF ) {
6252         if (bSig0 | bSig1) {
6253             return propagateFloat128NaN(a, b, status);
6254         }
6255         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6256     }
6257     if ( aExp == 0 ) {
6258         ++expDiff;
6259     }
6260     else {
6261         aSig0 |= LIT64( 0x4000000000000000 );
6262     }
6263     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6264     bSig0 |= LIT64( 0x4000000000000000 );
6265  bBigger:
6266     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6267     zExp = bExp;
6268     zSign ^= 1;
6269     goto normalizeRoundAndPack;
6270  aExpBigger:
6271     if ( aExp == 0x7FFF ) {
6272         if (aSig0 | aSig1) {
6273             return propagateFloat128NaN(a, b, status);
6274         }
6275         return a;
6276     }
6277     if ( bExp == 0 ) {
6278         --expDiff;
6279     }
6280     else {
6281         bSig0 |= LIT64( 0x4000000000000000 );
6282     }
6283     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6284     aSig0 |= LIT64( 0x4000000000000000 );
6285  aBigger:
6286     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6287     zExp = aExp;
6288  normalizeRoundAndPack:
6289     --zExp;
6290     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6291                                          status);
6292
6293 }
6294
6295 /*----------------------------------------------------------------------------
6296 | Returns the result of adding the quadruple-precision floating-point values
6297 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6298 | for Binary Floating-Point Arithmetic.
6299 *----------------------------------------------------------------------------*/
6300
6301 float128 float128_add(float128 a, float128 b, float_status *status)
6302 {
6303     flag aSign, bSign;
6304
6305     aSign = extractFloat128Sign( a );
6306     bSign = extractFloat128Sign( b );
6307     if ( aSign == bSign ) {
6308         return addFloat128Sigs(a, b, aSign, status);
6309     }
6310     else {
6311         return subFloat128Sigs(a, b, aSign, status);
6312     }
6313
6314 }
6315
6316 /*----------------------------------------------------------------------------
6317 | Returns the result of subtracting the quadruple-precision floating-point
6318 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6319 | Standard for Binary Floating-Point Arithmetic.
6320 *----------------------------------------------------------------------------*/
6321
6322 float128 float128_sub(float128 a, float128 b, float_status *status)
6323 {
6324     flag aSign, bSign;
6325
6326     aSign = extractFloat128Sign( a );
6327     bSign = extractFloat128Sign( b );
6328     if ( aSign == bSign ) {
6329         return subFloat128Sigs(a, b, aSign, status);
6330     }
6331     else {
6332         return addFloat128Sigs(a, b, aSign, status);
6333     }
6334
6335 }
6336
6337 /*----------------------------------------------------------------------------
6338 | Returns the result of multiplying the quadruple-precision floating-point
6339 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6340 | Standard for Binary Floating-Point Arithmetic.
6341 *----------------------------------------------------------------------------*/
6342
6343 float128 float128_mul(float128 a, float128 b, float_status *status)
6344 {
6345     flag aSign, bSign, zSign;
6346     int32_t aExp, bExp, zExp;
6347     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6348
6349     aSig1 = extractFloat128Frac1( a );
6350     aSig0 = extractFloat128Frac0( a );
6351     aExp = extractFloat128Exp( a );
6352     aSign = extractFloat128Sign( a );
6353     bSig1 = extractFloat128Frac1( b );
6354     bSig0 = extractFloat128Frac0( b );
6355     bExp = extractFloat128Exp( b );
6356     bSign = extractFloat128Sign( b );
6357     zSign = aSign ^ bSign;
6358     if ( aExp == 0x7FFF ) {
6359         if (    ( aSig0 | aSig1 )
6360              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6361             return propagateFloat128NaN(a, b, status);
6362         }
6363         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6364         return packFloat128( zSign, 0x7FFF, 0, 0 );
6365     }
6366     if ( bExp == 0x7FFF ) {
6367         if (bSig0 | bSig1) {
6368             return propagateFloat128NaN(a, b, status);
6369         }
6370         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6371  invalid:
6372             float_raise(float_flag_invalid, status);
6373             return float128_default_nan(status);
6374         }
6375         return packFloat128( zSign, 0x7FFF, 0, 0 );
6376     }
6377     if ( aExp == 0 ) {
6378         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6379         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6380     }
6381     if ( bExp == 0 ) {
6382         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6383         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6384     }
6385     zExp = aExp + bExp - 0x4000;
6386     aSig0 |= LIT64( 0x0001000000000000 );
6387     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6388     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6389     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6390     zSig2 |= ( zSig3 != 0 );
6391     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6392         shift128ExtraRightJamming(
6393             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6394         ++zExp;
6395     }
6396     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6397
6398 }
6399
6400 /*----------------------------------------------------------------------------
6401 | Returns the result of dividing the quadruple-precision floating-point value
6402 | `a' by the corresponding value `b'.  The operation is performed according to
6403 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6404 *----------------------------------------------------------------------------*/
6405
6406 float128 float128_div(float128 a, float128 b, float_status *status)
6407 {
6408     flag aSign, bSign, zSign;
6409     int32_t aExp, bExp, zExp;
6410     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6411     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6412
6413     aSig1 = extractFloat128Frac1( a );
6414     aSig0 = extractFloat128Frac0( a );
6415     aExp = extractFloat128Exp( a );
6416     aSign = extractFloat128Sign( a );
6417     bSig1 = extractFloat128Frac1( b );
6418     bSig0 = extractFloat128Frac0( b );
6419     bExp = extractFloat128Exp( b );
6420     bSign = extractFloat128Sign( b );
6421     zSign = aSign ^ bSign;
6422     if ( aExp == 0x7FFF ) {
6423         if (aSig0 | aSig1) {
6424             return propagateFloat128NaN(a, b, status);
6425         }
6426         if ( bExp == 0x7FFF ) {
6427             if (bSig0 | bSig1) {
6428                 return propagateFloat128NaN(a, b, status);
6429             }
6430             goto invalid;
6431         }
6432         return packFloat128( zSign, 0x7FFF, 0, 0 );
6433     }
6434     if ( bExp == 0x7FFF ) {
6435         if (bSig0 | bSig1) {
6436             return propagateFloat128NaN(a, b, status);
6437         }
6438         return packFloat128( zSign, 0, 0, 0 );
6439     }
6440     if ( bExp == 0 ) {
6441         if ( ( bSig0 | bSig1 ) == 0 ) {
6442             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6443  invalid:
6444                 float_raise(float_flag_invalid, status);
6445                 return float128_default_nan(status);
6446             }
6447             float_raise(float_flag_divbyzero, status);
6448             return packFloat128( zSign, 0x7FFF, 0, 0 );
6449         }
6450         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6451     }
6452     if ( aExp == 0 ) {
6453         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6454         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6455     }
6456     zExp = aExp - bExp + 0x3FFD;
6457     shortShift128Left(
6458         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6459     shortShift128Left(
6460         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6461     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6462         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6463         ++zExp;
6464     }
6465     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6466     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6467     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6468     while ( (int64_t) rem0 < 0 ) {
6469         --zSig0;
6470         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6471     }
6472     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6473     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6474         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6475         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6476         while ( (int64_t) rem1 < 0 ) {
6477             --zSig1;
6478             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6479         }
6480         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6481     }
6482     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6483     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6484
6485 }
6486
6487 /*----------------------------------------------------------------------------
6488 | Returns the remainder of the quadruple-precision floating-point value `a'
6489 | with respect to the corresponding value `b'.  The operation is performed
6490 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6491 *----------------------------------------------------------------------------*/
6492
6493 float128 float128_rem(float128 a, float128 b, float_status *status)
6494 {
6495     flag aSign, zSign;
6496     int32_t aExp, bExp, expDiff;
6497     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6498     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6499     int64_t sigMean0;
6500
6501     aSig1 = extractFloat128Frac1( a );
6502     aSig0 = extractFloat128Frac0( a );
6503     aExp = extractFloat128Exp( a );
6504     aSign = extractFloat128Sign( a );
6505     bSig1 = extractFloat128Frac1( b );
6506     bSig0 = extractFloat128Frac0( b );
6507     bExp = extractFloat128Exp( b );
6508     if ( aExp == 0x7FFF ) {
6509         if (    ( aSig0 | aSig1 )
6510              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6511             return propagateFloat128NaN(a, b, status);
6512         }
6513         goto invalid;
6514     }
6515     if ( bExp == 0x7FFF ) {
6516         if (bSig0 | bSig1) {
6517             return propagateFloat128NaN(a, b, status);
6518         }
6519         return a;
6520     }
6521     if ( bExp == 0 ) {
6522         if ( ( bSig0 | bSig1 ) == 0 ) {
6523  invalid:
6524             float_raise(float_flag_invalid, status);
6525             return float128_default_nan(status);
6526         }
6527         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6528     }
6529     if ( aExp == 0 ) {
6530         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6531         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6532     }
6533     expDiff = aExp - bExp;
6534     if ( expDiff < -1 ) return a;
6535     shortShift128Left(
6536         aSig0 | LIT64( 0x0001000000000000 ),
6537         aSig1,
6538         15 - ( expDiff < 0 ),
6539         &aSig0,
6540         &aSig1
6541     );
6542     shortShift128Left(
6543         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6544     q = le128( bSig0, bSig1, aSig0, aSig1 );
6545     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6546     expDiff -= 64;
6547     while ( 0 < expDiff ) {
6548         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6549         q = ( 4 < q ) ? q - 4 : 0;
6550         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6551         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6552         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6553         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6554         expDiff -= 61;
6555     }
6556     if ( -64 < expDiff ) {
6557         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6558         q = ( 4 < q ) ? q - 4 : 0;
6559         q >>= - expDiff;
6560         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6561         expDiff += 52;
6562         if ( expDiff < 0 ) {
6563             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6564         }
6565         else {
6566             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6567         }
6568         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6569         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6570     }
6571     else {
6572         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6573         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6574     }
6575     do {
6576         alternateASig0 = aSig0;
6577         alternateASig1 = aSig1;
6578         ++q;
6579         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6580     } while ( 0 <= (int64_t) aSig0 );
6581     add128(
6582         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6583     if (    ( sigMean0 < 0 )
6584          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6585         aSig0 = alternateASig0;
6586         aSig1 = alternateASig1;
6587     }
6588     zSign = ( (int64_t) aSig0 < 0 );
6589     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6590     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6591                                          status);
6592 }
6593
6594 /*----------------------------------------------------------------------------
6595 | Returns the square root of the quadruple-precision floating-point value `a'.
6596 | The operation is performed according to the IEC/IEEE Standard for Binary
6597 | Floating-Point Arithmetic.
6598 *----------------------------------------------------------------------------*/
6599
6600 float128 float128_sqrt(float128 a, float_status *status)
6601 {
6602     flag aSign;
6603     int32_t aExp, zExp;
6604     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6605     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6606
6607     aSig1 = extractFloat128Frac1( a );
6608     aSig0 = extractFloat128Frac0( a );
6609     aExp = extractFloat128Exp( a );
6610     aSign = extractFloat128Sign( a );
6611     if ( aExp == 0x7FFF ) {
6612         if (aSig0 | aSig1) {
6613             return propagateFloat128NaN(a, a, status);
6614         }
6615         if ( ! aSign ) return a;
6616         goto invalid;
6617     }
6618     if ( aSign ) {
6619         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6620  invalid:
6621         float_raise(float_flag_invalid, status);
6622         return float128_default_nan(status);
6623     }
6624     if ( aExp == 0 ) {
6625         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6626         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6627     }
6628     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6629     aSig0 |= LIT64( 0x0001000000000000 );
6630     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6631     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6632     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6633     doubleZSig0 = zSig0<<1;
6634     mul64To128( zSig0, zSig0, &term0, &term1 );
6635     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6636     while ( (int64_t) rem0 < 0 ) {
6637         --zSig0;
6638         doubleZSig0 -= 2;
6639         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6640     }
6641     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6642     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6643         if ( zSig1 == 0 ) zSig1 = 1;
6644         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6645         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6646         mul64To128( zSig1, zSig1, &term2, &term3 );
6647         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6648         while ( (int64_t) rem1 < 0 ) {
6649             --zSig1;
6650             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6651             term3 |= 1;
6652             term2 |= doubleZSig0;
6653             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6654         }
6655         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6656     }
6657     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6658     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6659
6660 }
6661
6662 /*----------------------------------------------------------------------------
6663 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6664 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6665 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6666 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6667 *----------------------------------------------------------------------------*/
6668
6669 int float128_eq(float128 a, float128 b, float_status *status)
6670 {
6671
6672     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6673               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6674          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6675               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6676        ) {
6677         float_raise(float_flag_invalid, status);
6678         return 0;
6679     }
6680     return
6681            ( a.low == b.low )
6682         && (    ( a.high == b.high )
6683              || (    ( a.low == 0 )
6684                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6685            );
6686
6687 }
6688
6689 /*----------------------------------------------------------------------------
6690 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6691 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6692 | exception is raised if either operand is a NaN.  The comparison is performed
6693 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6694 *----------------------------------------------------------------------------*/
6695
6696 int float128_le(float128 a, float128 b, float_status *status)
6697 {
6698     flag aSign, bSign;
6699
6700     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6701               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6702          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6703               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6704        ) {
6705         float_raise(float_flag_invalid, status);
6706         return 0;
6707     }
6708     aSign = extractFloat128Sign( a );
6709     bSign = extractFloat128Sign( b );
6710     if ( aSign != bSign ) {
6711         return
6712                aSign
6713             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6714                  == 0 );
6715     }
6716     return
6717           aSign ? le128( b.high, b.low, a.high, a.low )
6718         : le128( a.high, a.low, b.high, b.low );
6719
6720 }
6721
6722 /*----------------------------------------------------------------------------
6723 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6724 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6725 | raised if either operand is a NaN.  The comparison is performed according
6726 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6727 *----------------------------------------------------------------------------*/
6728
6729 int float128_lt(float128 a, float128 b, float_status *status)
6730 {
6731     flag aSign, bSign;
6732
6733     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6734               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6735          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6736               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6737        ) {
6738         float_raise(float_flag_invalid, status);
6739         return 0;
6740     }
6741     aSign = extractFloat128Sign( a );
6742     bSign = extractFloat128Sign( b );
6743     if ( aSign != bSign ) {
6744         return
6745                aSign
6746             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6747                  != 0 );
6748     }
6749     return
6750           aSign ? lt128( b.high, b.low, a.high, a.low )
6751         : lt128( a.high, a.low, b.high, b.low );
6752
6753 }
6754
6755 /*----------------------------------------------------------------------------
6756 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6757 | be compared, and 0 otherwise.  The invalid exception is raised if either
6758 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6759 | Standard for Binary Floating-Point Arithmetic.
6760 *----------------------------------------------------------------------------*/
6761
6762 int float128_unordered(float128 a, float128 b, float_status *status)
6763 {
6764     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6765               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6766          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6767               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6768        ) {
6769         float_raise(float_flag_invalid, status);
6770         return 1;
6771     }
6772     return 0;
6773 }
6774
6775 /*----------------------------------------------------------------------------
6776 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6777 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6778 | exception.  The comparison is performed according to the IEC/IEEE Standard
6779 | for Binary Floating-Point Arithmetic.
6780 *----------------------------------------------------------------------------*/
6781
6782 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6783 {
6784
6785     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6786               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6787          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6788               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6789        ) {
6790         if (float128_is_signaling_nan(a, status)
6791          || float128_is_signaling_nan(b, status)) {
6792             float_raise(float_flag_invalid, status);
6793         }
6794         return 0;
6795     }
6796     return
6797            ( a.low == b.low )
6798         && (    ( a.high == b.high )
6799              || (    ( a.low == 0 )
6800                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6801            );
6802
6803 }
6804
6805 /*----------------------------------------------------------------------------
6806 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6807 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6808 | cause an exception.  Otherwise, the comparison is performed according to the
6809 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6810 *----------------------------------------------------------------------------*/
6811
6812 int float128_le_quiet(float128 a, float128 b, float_status *status)
6813 {
6814     flag aSign, bSign;
6815
6816     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6817               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6818          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6819               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6820        ) {
6821         if (float128_is_signaling_nan(a, status)
6822          || float128_is_signaling_nan(b, status)) {
6823             float_raise(float_flag_invalid, status);
6824         }
6825         return 0;
6826     }
6827     aSign = extractFloat128Sign( a );
6828     bSign = extractFloat128Sign( b );
6829     if ( aSign != bSign ) {
6830         return
6831                aSign
6832             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6833                  == 0 );
6834     }
6835     return
6836           aSign ? le128( b.high, b.low, a.high, a.low )
6837         : le128( a.high, a.low, b.high, b.low );
6838
6839 }
6840
6841 /*----------------------------------------------------------------------------
6842 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6843 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6844 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6845 | Standard for Binary Floating-Point Arithmetic.
6846 *----------------------------------------------------------------------------*/
6847
6848 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6849 {
6850     flag aSign, bSign;
6851
6852     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6853               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6854          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6855               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6856        ) {
6857         if (float128_is_signaling_nan(a, status)
6858          || float128_is_signaling_nan(b, status)) {
6859             float_raise(float_flag_invalid, status);
6860         }
6861         return 0;
6862     }
6863     aSign = extractFloat128Sign( a );
6864     bSign = extractFloat128Sign( b );
6865     if ( aSign != bSign ) {
6866         return
6867                aSign
6868             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6869                  != 0 );
6870     }
6871     return
6872           aSign ? lt128( b.high, b.low, a.high, a.low )
6873         : lt128( a.high, a.low, b.high, b.low );
6874
6875 }
6876
6877 /*----------------------------------------------------------------------------
6878 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6879 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6880 | comparison is performed according to the IEC/IEEE Standard for Binary
6881 | Floating-Point Arithmetic.
6882 *----------------------------------------------------------------------------*/
6883
6884 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6885 {
6886     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6887               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6888          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6889               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6890        ) {
6891         if (float128_is_signaling_nan(a, status)
6892          || float128_is_signaling_nan(b, status)) {
6893             float_raise(float_flag_invalid, status);
6894         }
6895         return 1;
6896     }
6897     return 0;
6898 }
6899
6900 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6901                                             int is_quiet, float_status *status)
6902 {
6903     flag aSign, bSign;
6904
6905     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6906         float_raise(float_flag_invalid, status);
6907         return float_relation_unordered;
6908     }
6909     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6910           ( extractFloatx80Frac( a )<<1 ) ) ||
6911         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6912           ( extractFloatx80Frac( b )<<1 ) )) {
6913         if (!is_quiet ||
6914             floatx80_is_signaling_nan(a, status) ||
6915             floatx80_is_signaling_nan(b, status)) {
6916             float_raise(float_flag_invalid, status);
6917         }
6918         return float_relation_unordered;
6919     }
6920     aSign = extractFloatx80Sign( a );
6921     bSign = extractFloatx80Sign( b );
6922     if ( aSign != bSign ) {
6923
6924         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6925              ( ( a.low | b.low ) == 0 ) ) {
6926             /* zero case */
6927             return float_relation_equal;
6928         } else {
6929             return 1 - (2 * aSign);
6930         }
6931     } else {
6932         if (a.low == b.low && a.high == b.high) {
6933             return float_relation_equal;
6934         } else {
6935             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6936         }
6937     }
6938 }
6939
6940 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6941 {
6942     return floatx80_compare_internal(a, b, 0, status);
6943 }
6944
6945 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6946 {
6947     return floatx80_compare_internal(a, b, 1, status);
6948 }
6949
6950 static inline int float128_compare_internal(float128 a, float128 b,
6951                                             int is_quiet, float_status *status)
6952 {
6953     flag aSign, bSign;
6954
6955     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6956           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6957         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6958           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6959         if (!is_quiet ||
6960             float128_is_signaling_nan(a, status) ||
6961             float128_is_signaling_nan(b, status)) {
6962             float_raise(float_flag_invalid, status);
6963         }
6964         return float_relation_unordered;
6965     }
6966     aSign = extractFloat128Sign( a );
6967     bSign = extractFloat128Sign( b );
6968     if ( aSign != bSign ) {
6969         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6970             /* zero case */
6971             return float_relation_equal;
6972         } else {
6973             return 1 - (2 * aSign);
6974         }
6975     } else {
6976         if (a.low == b.low && a.high == b.high) {
6977             return float_relation_equal;
6978         } else {
6979             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6980         }
6981     }
6982 }
6983
6984 int float128_compare(float128 a, float128 b, float_status *status)
6985 {
6986     return float128_compare_internal(a, b, 0, status);
6987 }
6988
6989 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6990 {
6991     return float128_compare_internal(a, b, 1, status);
6992 }
6993
6994 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6995 {
6996     flag aSign;
6997     int32_t aExp;
6998     uint64_t aSig;
6999
7000     if (floatx80_invalid_encoding(a)) {
7001         float_raise(float_flag_invalid, status);
7002         return floatx80_default_nan(status);
7003     }
7004     aSig = extractFloatx80Frac( a );
7005     aExp = extractFloatx80Exp( a );
7006     aSign = extractFloatx80Sign( a );
7007
7008     if ( aExp == 0x7FFF ) {
7009         if ( aSig<<1 ) {
7010             return propagateFloatx80NaN(a, a, status);
7011         }
7012         return a;
7013     }
7014
7015     if (aExp == 0) {
7016         if (aSig == 0) {
7017             return a;
7018         }
7019         aExp++;
7020     }
7021
7022     if (n > 0x10000) {
7023         n = 0x10000;
7024     } else if (n < -0x10000) {
7025         n = -0x10000;
7026     }
7027
7028     aExp += n;
7029     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7030                                          aSign, aExp, aSig, 0, status);
7031 }
7032
7033 float128 float128_scalbn(float128 a, int n, float_status *status)
7034 {
7035     flag aSign;
7036     int32_t aExp;
7037     uint64_t aSig0, aSig1;
7038
7039     aSig1 = extractFloat128Frac1( a );
7040     aSig0 = extractFloat128Frac0( a );
7041     aExp = extractFloat128Exp( a );
7042     aSign = extractFloat128Sign( a );
7043     if ( aExp == 0x7FFF ) {
7044         if ( aSig0 | aSig1 ) {
7045             return propagateFloat128NaN(a, a, status);
7046         }
7047         return a;
7048     }
7049     if (aExp != 0) {
7050         aSig0 |= LIT64( 0x0001000000000000 );
7051     } else if (aSig0 == 0 && aSig1 == 0) {
7052         return a;
7053     } else {
7054         aExp++;
7055     }
7056
7057     if (n > 0x10000) {
7058         n = 0x10000;
7059     } else if (n < -0x10000) {
7060         n = -0x10000;
7061     }
7062
7063     aExp += n - 1;
7064     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7065                                          , status);
7066
7067 }