fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include "qemu/bitops.h"
  87 #include "fpu/softfloat.h"
  88
  89 /* We only need stdlib for abort() */
  90
  91 /*----------------------------------------------------------------------------
  92 | Primitive arithmetic functions, including multi-word arithmetic, and
  93 | division and square root approximations.  (Can be specialized to target if
  94 | desired.)
  95 *----------------------------------------------------------------------------*/
  96 #include "fpu/softfloat-macros.h"
  97
  98 /*----------------------------------------------------------------------------
  99 | Functions and definitions to determine:  (1) whether tininess for underflow
 100 | is detected before or after rounding by default, (2) what (if anything)
 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 103 | are propagated from function inputs to output.  These details are target-
 104 | specific.
 105 *----------------------------------------------------------------------------*/
 106 #include "softfloat-specialize.h"
 107
 108 /*----------------------------------------------------------------------------
 109 | Returns the fraction bits of the half-precision floating-point value `a'.
 110 *----------------------------------------------------------------------------*/
 111
 112 static inline uint32_t extractFloat16Frac(float16 a)
 113 {
 114     return float16_val(a) & 0x3ff;
 115 }
 116
 117 /*----------------------------------------------------------------------------
 118 | Returns the exponent bits of the half-precision floating-point value `a'.
 119 *----------------------------------------------------------------------------*/
 120
 121 static inline int extractFloat16Exp(float16 a)
 122 {
 123     return (float16_val(a) >> 10) & 0x1f;
 124 }
 125
 126 /*----------------------------------------------------------------------------
 127 | Returns the sign bit of the single-precision floating-point value `a'.
 128 *----------------------------------------------------------------------------*/
 129
 130 static inline flag extractFloat16Sign(float16 a)
 131 {
 132     return float16_val(a)>>15;
 133 }
 134
 135 /*----------------------------------------------------------------------------
 136 | Returns the fraction bits of the single-precision floating-point value `a'.
 137 *----------------------------------------------------------------------------*/
 138
 139 static inline uint32_t extractFloat32Frac(float32 a)
 140 {
 141     return float32_val(a) & 0x007FFFFF;
 142 }
 143
 144 /*----------------------------------------------------------------------------
 145 | Returns the exponent bits of the single-precision floating-point value `a'.
 146 *----------------------------------------------------------------------------*/
 147
 148 static inline int extractFloat32Exp(float32 a)
 149 {
 150     return (float32_val(a) >> 23) & 0xFF;
 151 }
 152
 153 /*----------------------------------------------------------------------------
 154 | Returns the sign bit of the single-precision floating-point value `a'.
 155 *----------------------------------------------------------------------------*/
 156
 157 static inline flag extractFloat32Sign(float32 a)
 158 {
 159     return float32_val(a) >> 31;
 160 }
 161
 162 /*----------------------------------------------------------------------------
 163 | Returns the fraction bits of the double-precision floating-point value `a'.
 164 *----------------------------------------------------------------------------*/
 165
 166 static inline uint64_t extractFloat64Frac(float64 a)
 167 {
 168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 169 }
 170
 171 /*----------------------------------------------------------------------------
 172 | Returns the exponent bits of the double-precision floating-point value `a'.
 173 *----------------------------------------------------------------------------*/
 174
 175 static inline int extractFloat64Exp(float64 a)
 176 {
 177     return (float64_val(a) >> 52) & 0x7FF;
 178 }
 179
 180 /*----------------------------------------------------------------------------
 181 | Returns the sign bit of the double-precision floating-point value `a'.
 182 *----------------------------------------------------------------------------*/
 183
 184 static inline flag extractFloat64Sign(float64 a)
 185 {
 186     return float64_val(a) >> 63;
 187 }
 188
 189 /*
 190  * Classify a floating point number. Everything above float_class_qnan
 191  * is a NaN so cls >= float_class_qnan is any NaN.
 192  */
 193
 194 typedef enum __attribute__ ((__packed__)) {
 195     float_class_unclassified,
 196     float_class_zero,
 197     float_class_normal,
 198     float_class_inf,
 199     float_class_qnan,  /* all NaNs from here */
 200     float_class_snan,
 201     float_class_dnan,
 202     float_class_msnan, /* maybe silenced */
 203 } FloatClass;
 204
 205 /*
 206  * Structure holding all of the decomposed parts of a float. The
 207  * exponent is unbiased and the fraction is normalized. All
 208  * calculations are done with a 64 bit fraction and then rounded as
 209  * appropriate for the final format.
 210  *
 211  * Thanks to the packed FloatClass a decent compiler should be able to
 212  * fit the whole structure into registers and avoid using the stack
 213  * for parameter passing.
 214  */
 215
 216 typedef struct {
 217     uint64_t frac;
 218     int32_t  exp;
 219     FloatClass cls;
 220     bool sign;
 221 } FloatParts;
 222
 223 #define DECOMPOSED_BINARY_POINT    (64 - 2)
 224 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 225 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 226
 227 /* Structure holding all of the relevant parameters for a format.
 228  *   exp_size: the size of the exponent field
 229  *   exp_bias: the offset applied to the exponent field
 230  *   exp_max: the maximum normalised exponent
 231  *   frac_size: the size of the fraction field
 232  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 233  * The following are computed based the size of fraction
 234  *   frac_lsb: least significant bit of fraction
 235  *   fram_lsbm1: the bit bellow the least significant bit (for rounding)
 236  *   round_mask/roundeven_mask: masks used for rounding
 237  */
 238 typedef struct {
 239     int exp_size;
 240     int exp_bias;
 241     int exp_max;
 242     int frac_size;
 243     int frac_shift;
 244     uint64_t frac_lsb;
 245     uint64_t frac_lsbm1;
 246     uint64_t round_mask;
 247     uint64_t roundeven_mask;
 248 } FloatFmt;
 249
 250 /* Expand fields based on the size of exponent and fraction */
 251 #define FLOAT_PARAMS(E, F)                                           \
 252     .exp_size       = E,                                             \
 253     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 254     .exp_max        = (1 << E) - 1,                                  \
 255     .frac_size      = F,                                             \
 256     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 257     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 258     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 259     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 260     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 261
 262 static const FloatFmt float16_params = {
 263     FLOAT_PARAMS(5, 10)
 264 };
 265
 266 static const FloatFmt float32_params = {
 267     FLOAT_PARAMS(8, 23)
 268 };
 269
 270 static const FloatFmt float64_params = {
 271     FLOAT_PARAMS(11, 52)
 272 };
 273
 274 /* Unpack a float to parts, but do not canonicalize.  */
 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 276 {
 277     const int sign_pos = fmt.frac_size + fmt.exp_size;
 278
 279     return (FloatParts) {
 280         .cls = float_class_unclassified,
 281         .sign = extract64(raw, sign_pos, 1),
 282         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 283         .frac = extract64(raw, 0, fmt.frac_size),
 284     };
 285 }
 286
 287 static inline FloatParts float16_unpack_raw(float16 f)
 288 {
 289     return unpack_raw(float16_params, f);
 290 }
 291
 292 static inline FloatParts float32_unpack_raw(float32 f)
 293 {
 294     return unpack_raw(float32_params, f);
 295 }
 296
 297 static inline FloatParts float64_unpack_raw(float64 f)
 298 {
 299     return unpack_raw(float64_params, f);
 300 }
 301
 302 /* Pack a float from parts, but do not canonicalize.  */
 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 304 {
 305     const int sign_pos = fmt.frac_size + fmt.exp_size;
 306     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 307     return deposit64(ret, sign_pos, 1, p.sign);
 308 }
 309
 310 static inline float16 float16_pack_raw(FloatParts p)
 311 {
 312     return make_float16(pack_raw(float16_params, p));
 313 }
 314
 315 static inline float32 float32_pack_raw(FloatParts p)
 316 {
 317     return make_float32(pack_raw(float32_params, p));
 318 }
 319
 320 static inline float64 float64_pack_raw(FloatParts p)
 321 {
 322     return make_float64(pack_raw(float64_params, p));
 323 }
 324
 325 /* Canonicalize EXP and FRAC, setting CLS.  */
 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
 327                                float_status *status)
 328 {
 329     if (part.exp == parm->exp_max) {
 330         if (part.frac == 0) {
 331             part.cls = float_class_inf;
 332         } else {
 333 #ifdef NO_SIGNALING_NANS
 334             part.cls = float_class_qnan;
 335 #else
 336             int64_t msb = part.frac << (parm->frac_shift + 2);
 337             if ((msb < 0) == status->snan_bit_is_one) {
 338                 part.cls = float_class_snan;
 339             } else {
 340                 part.cls = float_class_qnan;
 341             }
 342 #endif
 343         }
 344     } else if (part.exp == 0) {
 345         if (likely(part.frac == 0)) {
 346             part.cls = float_class_zero;
 347         } else if (status->flush_inputs_to_zero) {
 348             float_raise(float_flag_input_denormal, status);
 349             part.cls = float_class_zero;
 350             part.frac = 0;
 351         } else {
 352             int shift = clz64(part.frac) - 1;
 353             part.cls = float_class_normal;
 354             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 355             part.frac <<= shift;
 356         }
 357     } else {
 358         part.cls = float_class_normal;
 359         part.exp -= parm->exp_bias;
 360         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 361     }
 362     return part;
 363 }
 364
 365 /* Round and uncanonicalize a floating-point number by parts. There
 366  * are FRAC_SHIFT bits that may require rounding at the bottom of the
 367  * fraction; these bits will be removed. The exponent will be biased
 368  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 369  */
 370
 371 static FloatParts round_canonical(FloatParts p, float_status *s,
 372                                   const FloatFmt *parm)
 373 {
 374     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 375     const uint64_t round_mask = parm->round_mask;
 376     const uint64_t roundeven_mask = parm->roundeven_mask;
 377     const int exp_max = parm->exp_max;
 378     const int frac_shift = parm->frac_shift;
 379     uint64_t frac, inc;
 380     int exp, flags = 0;
 381     bool overflow_norm;
 382
 383     frac = p.frac;
 384     exp = p.exp;
 385
 386     switch (p.cls) {
 387     case float_class_normal:
 388         switch (s->float_rounding_mode) {
 389         case float_round_nearest_even:
 390             overflow_norm = false;
 391             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 392             break;
 393         case float_round_ties_away:
 394             overflow_norm = false;
 395             inc = frac_lsbm1;
 396             break;
 397         case float_round_to_zero:
 398             overflow_norm = true;
 399             inc = 0;
 400             break;
 401         case float_round_up:
 402             inc = p.sign ? 0 : round_mask;
 403             overflow_norm = p.sign;
 404             break;
 405         case float_round_down:
 406             inc = p.sign ? round_mask : 0;
 407             overflow_norm = !p.sign;
 408             break;
 409         default:
 410             g_assert_not_reached();
 411         }
 412
 413         exp += parm->exp_bias;
 414         if (likely(exp > 0)) {
 415             if (frac & round_mask) {
 416                 flags |= float_flag_inexact;
 417                 frac += inc;
 418                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
 419                     frac >>= 1;
 420                     exp++;
 421                 }
 422             }
 423             frac >>= frac_shift;
 424
 425             if (unlikely(exp >= exp_max)) {
 426                 flags |= float_flag_overflow | float_flag_inexact;
 427                 if (overflow_norm) {
 428                     exp = exp_max - 1;
 429                     frac = -1;
 430                 } else {
 431                     p.cls = float_class_inf;
 432                     goto do_inf;
 433                 }
 434             }
 435         } else if (s->flush_to_zero) {
 436             flags |= float_flag_output_denormal;
 437             p.cls = float_class_zero;
 438             goto do_zero;
 439         } else {
 440             bool is_tiny = (s->float_detect_tininess
 441                             == float_tininess_before_rounding)
 442                         || (exp < 0)
 443                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 444
 445             shift64RightJamming(frac, 1 - exp, &frac);
 446             if (frac & round_mask) {
 447                 /* Need to recompute round-to-even.  */
 448                 if (s->float_rounding_mode == float_round_nearest_even) {
 449                     inc = ((frac & roundeven_mask) != frac_lsbm1
 450                            ? frac_lsbm1 : 0);
 451                 }
 452                 flags |= float_flag_inexact;
 453                 frac += inc;
 454             }
 455
 456             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 457             frac >>= frac_shift;
 458
 459             if (is_tiny && (flags & float_flag_inexact)) {
 460                 flags |= float_flag_underflow;
 461             }
 462             if (exp == 0 && frac == 0) {
 463                 p.cls = float_class_zero;
 464             }
 465         }
 466         break;
 467
 468     case float_class_zero:
 469     do_zero:
 470         exp = 0;
 471         frac = 0;
 472         break;
 473
 474     case float_class_inf:
 475     do_inf:
 476         exp = exp_max;
 477         frac = 0;
 478         break;
 479
 480     case float_class_qnan:
 481     case float_class_snan:
 482         exp = exp_max;
 483         break;
 484
 485     default:
 486         g_assert_not_reached();
 487     }
 488
 489     float_raise(flags, s);
 490     p.exp = exp;
 491     p.frac = frac;
 492     return p;
 493 }
 494
 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 496 {
 497     return canonicalize(float16_unpack_raw(f), &float16_params, s);
 498 }
 499
 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 501 {
 502     switch (p.cls) {
 503     case float_class_dnan:
 504         return float16_default_nan(s);
 505     case float_class_msnan:
 506         return float16_maybe_silence_nan(float16_pack_raw(p), s);
 507     default:
 508         p = round_canonical(p, s, &float16_params);
 509         return float16_pack_raw(p);
 510     }
 511 }
 512
 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 514 {
 515     return canonicalize(float32_unpack_raw(f), &float32_params, s);
 516 }
 517
 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 519 {
 520     switch (p.cls) {
 521     case float_class_dnan:
 522         return float32_default_nan(s);
 523     case float_class_msnan:
 524         return float32_maybe_silence_nan(float32_pack_raw(p), s);
 525     default:
 526         p = round_canonical(p, s, &float32_params);
 527         return float32_pack_raw(p);
 528     }
 529 }
 530
 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 532 {
 533     return canonicalize(float64_unpack_raw(f), &float64_params, s);
 534 }
 535
 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 537 {
 538     switch (p.cls) {
 539     case float_class_dnan:
 540         return float64_default_nan(s);
 541     case float_class_msnan:
 542         return float64_maybe_silence_nan(float64_pack_raw(p), s);
 543     default:
 544         p = round_canonical(p, s, &float64_params);
 545         return float64_pack_raw(p);
 546     }
 547 }
 548
 549 /* Simple helpers for checking if what NaN we have */
 550 static bool is_nan(FloatClass c)
 551 {
 552     return unlikely(c >= float_class_qnan);
 553 }
 554 static bool is_snan(FloatClass c)
 555 {
 556     return c == float_class_snan;
 557 }
 558 static bool is_qnan(FloatClass c)
 559 {
 560     return c == float_class_qnan;
 561 }
 562
 563 static FloatParts return_nan(FloatParts a, float_status *s)
 564 {
 565     switch (a.cls) {
 566     case float_class_snan:
 567         s->float_exception_flags |= float_flag_invalid;
 568         a.cls = float_class_msnan;
 569         /* fall through */
 570     case float_class_qnan:
 571         if (s->default_nan_mode) {
 572             a.cls = float_class_dnan;
 573         }
 574         break;
 575
 576     default:
 577         g_assert_not_reached();
 578     }
 579     return a;
 580 }
 581
 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 583 {
 584     if (is_snan(a.cls) || is_snan(b.cls)) {
 585         s->float_exception_flags |= float_flag_invalid;
 586     }
 587
 588     if (s->default_nan_mode) {
 589         a.cls = float_class_dnan;
 590     } else {
 591         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
 592                     is_qnan(b.cls), is_snan(b.cls),
 593                     a.frac > b.frac ||
 594                     (a.frac == b.frac && a.sign < b.sign))) {
 595             a = b;
 596         }
 597         a.cls = float_class_msnan;
 598     }
 599     return a;
 600 }
 601
 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 603                                   bool inf_zero, float_status *s)
 604 {
 605     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 606         s->float_exception_flags |= float_flag_invalid;
 607     }
 608
 609     if (s->default_nan_mode) {
 610         a.cls = float_class_dnan;
 611     } else {
 612         switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
 613                               is_qnan(b.cls), is_snan(b.cls),
 614                               is_qnan(c.cls), is_snan(c.cls),
 615                               inf_zero, s)) {
 616         case 0:
 617             break;
 618         case 1:
 619             a = b;
 620             break;
 621         case 2:
 622             a = c;
 623             break;
 624         case 3:
 625             a.cls = float_class_dnan;
 626             return a;
 627         default:
 628             g_assert_not_reached();
 629         }
 630
 631         a.cls = float_class_msnan;
 632     }
 633     return a;
 634 }
 635
 636 /*
 637  * Returns the result of adding or subtracting the values of the
 638  * floating-point values `a' and `b'. The operation is performed
 639  * according to the IEC/IEEE Standard for Binary Floating-Point
 640  * Arithmetic.
 641  */
 642
 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 644                                 float_status *s)
 645 {
 646     bool a_sign = a.sign;
 647     bool b_sign = b.sign ^ subtract;
 648
 649     if (a_sign != b_sign) {
 650         /* Subtraction */
 651
 652         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 653             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 654                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 655                 a.frac = a.frac - b.frac;
 656             } else {
 657                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 658                 a.frac = b.frac - a.frac;
 659                 a.exp = b.exp;
 660                 a_sign ^= 1;
 661             }
 662
 663             if (a.frac == 0) {
 664                 a.cls = float_class_zero;
 665                 a.sign = s->float_rounding_mode == float_round_down;
 666             } else {
 667                 int shift = clz64(a.frac) - 1;
 668                 a.frac = a.frac << shift;
 669                 a.exp = a.exp - shift;
 670                 a.sign = a_sign;
 671             }
 672             return a;
 673         }
 674         if (is_nan(a.cls) || is_nan(b.cls)) {
 675             return pick_nan(a, b, s);
 676         }
 677         if (a.cls == float_class_inf) {
 678             if (b.cls == float_class_inf) {
 679                 float_raise(float_flag_invalid, s);
 680                 a.cls = float_class_dnan;
 681             }
 682             return a;
 683         }
 684         if (a.cls == float_class_zero && b.cls == float_class_zero) {
 685             a.sign = s->float_rounding_mode == float_round_down;
 686             return a;
 687         }
 688         if (a.cls == float_class_zero || b.cls == float_class_inf) {
 689             b.sign = a_sign ^ 1;
 690             return b;
 691         }
 692         if (b.cls == float_class_zero) {
 693             return a;
 694         }
 695     } else {
 696         /* Addition */
 697         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 698             if (a.exp > b.exp) {
 699                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 700             } else if (a.exp < b.exp) {
 701                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 702                 a.exp = b.exp;
 703             }
 704             a.frac += b.frac;
 705             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
 706                 a.frac >>= 1;
 707                 a.exp += 1;
 708             }
 709             return a;
 710         }
 711         if (is_nan(a.cls) || is_nan(b.cls)) {
 712             return pick_nan(a, b, s);
 713         }
 714         if (a.cls == float_class_inf || b.cls == float_class_zero) {
 715             return a;
 716         }
 717         if (b.cls == float_class_inf || a.cls == float_class_zero) {
 718             b.sign = b_sign;
 719             return b;
 720         }
 721     }
 722     g_assert_not_reached();
 723 }
 724
 725 /*
 726  * Returns the result of adding or subtracting the floating-point
 727  * values `a' and `b'. The operation is performed according to the
 728  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 729  */
 730
 731 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
 732                                               float_status *status)
 733 {
 734     FloatParts pa = float16_unpack_canonical(a, status);
 735     FloatParts pb = float16_unpack_canonical(b, status);
 736     FloatParts pr = addsub_floats(pa, pb, false, status);
 737
 738     return float16_round_pack_canonical(pr, status);
 739 }
 740
 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
 742                                              float_status *status)
 743 {
 744     FloatParts pa = float32_unpack_canonical(a, status);
 745     FloatParts pb = float32_unpack_canonical(b, status);
 746     FloatParts pr = addsub_floats(pa, pb, false, status);
 747
 748     return float32_round_pack_canonical(pr, status);
 749 }
 750
 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
 752                                              float_status *status)
 753 {
 754     FloatParts pa = float64_unpack_canonical(a, status);
 755     FloatParts pb = float64_unpack_canonical(b, status);
 756     FloatParts pr = addsub_floats(pa, pb, false, status);
 757
 758     return float64_round_pack_canonical(pr, status);
 759 }
 760
 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
 762                                              float_status *status)
 763 {
 764     FloatParts pa = float16_unpack_canonical(a, status);
 765     FloatParts pb = float16_unpack_canonical(b, status);
 766     FloatParts pr = addsub_floats(pa, pb, true, status);
 767
 768     return float16_round_pack_canonical(pr, status);
 769 }
 770
 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
 772                                              float_status *status)
 773 {
 774     FloatParts pa = float32_unpack_canonical(a, status);
 775     FloatParts pb = float32_unpack_canonical(b, status);
 776     FloatParts pr = addsub_floats(pa, pb, true, status);
 777
 778     return float32_round_pack_canonical(pr, status);
 779 }
 780
 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
 782                                              float_status *status)
 783 {
 784     FloatParts pa = float64_unpack_canonical(a, status);
 785     FloatParts pb = float64_unpack_canonical(b, status);
 786     FloatParts pr = addsub_floats(pa, pb, true, status);
 787
 788     return float64_round_pack_canonical(pr, status);
 789 }
 790
 791 /*
 792  * Returns the result of multiplying the floating-point values `a' and
 793  * `b'. The operation is performed according to the IEC/IEEE Standard
 794  * for Binary Floating-Point Arithmetic.
 795  */
 796
 797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
 798 {
 799     bool sign = a.sign ^ b.sign;
 800
 801     if (a.cls == float_class_normal && b.cls == float_class_normal) {
 802         uint64_t hi, lo;
 803         int exp = a.exp + b.exp;
 804
 805         mul64To128(a.frac, b.frac, &hi, &lo);
 806         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 807         if (lo & DECOMPOSED_OVERFLOW_BIT) {
 808             shift64RightJamming(lo, 1, &lo);
 809             exp += 1;
 810         }
 811
 812         /* Re-use a */
 813         a.exp = exp;
 814         a.sign = sign;
 815         a.frac = lo;
 816         return a;
 817     }
 818     /* handle all the NaN cases */
 819     if (is_nan(a.cls) || is_nan(b.cls)) {
 820         return pick_nan(a, b, s);
 821     }
 822     /* Inf * Zero == NaN */
 823     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
 824         (a.cls == float_class_zero && b.cls == float_class_inf)) {
 825         s->float_exception_flags |= float_flag_invalid;
 826         a.cls = float_class_dnan;
 827         a.sign = sign;
 828         return a;
 829     }
 830     /* Multiply by 0 or Inf */
 831     if (a.cls == float_class_inf || a.cls == float_class_zero) {
 832         a.sign = sign;
 833         return a;
 834     }
 835     if (b.cls == float_class_inf || b.cls == float_class_zero) {
 836         b.sign = sign;
 837         return b;
 838     }
 839     g_assert_not_reached();
 840 }
 841
 842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
 843                                              float_status *status)
 844 {
 845     FloatParts pa = float16_unpack_canonical(a, status);
 846     FloatParts pb = float16_unpack_canonical(b, status);
 847     FloatParts pr = mul_floats(pa, pb, status);
 848
 849     return float16_round_pack_canonical(pr, status);
 850 }
 851
 852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
 853                                              float_status *status)
 854 {
 855     FloatParts pa = float32_unpack_canonical(a, status);
 856     FloatParts pb = float32_unpack_canonical(b, status);
 857     FloatParts pr = mul_floats(pa, pb, status);
 858
 859     return float32_round_pack_canonical(pr, status);
 860 }
 861
 862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
 863                                              float_status *status)
 864 {
 865     FloatParts pa = float64_unpack_canonical(a, status);
 866     FloatParts pb = float64_unpack_canonical(b, status);
 867     FloatParts pr = mul_floats(pa, pb, status);
 868
 869     return float64_round_pack_canonical(pr, status);
 870 }
 871
 872 /*
 873  * Returns the result of multiplying the floating-point values `a' and
 874  * `b' then adding 'c', with no intermediate rounding step after the
 875  * multiplication. The operation is performed according to the
 876  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 877  * The flags argument allows the caller to select negation of the
 878  * addend, the intermediate product, or the final result. (The
 879  * difference between this and having the caller do a separate
 880  * negation is that negating externally will flip the sign bit on
 881  * NaNs.)
 882  */
 883
 884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
 885                                 int flags, float_status *s)
 886 {
 887     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
 888                     ((1 << float_class_inf) | (1 << float_class_zero));
 889     bool p_sign;
 890     bool sign_flip = flags & float_muladd_negate_result;
 891     FloatClass p_class;
 892     uint64_t hi, lo;
 893     int p_exp;
 894
 895     /* It is implementation-defined whether the cases of (0,inf,qnan)
 896      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 897      * they return if they do), so we have to hand this information
 898      * off to the target-specific pick-a-NaN routine.
 899      */
 900     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
 901         return pick_nan_muladd(a, b, c, inf_zero, s);
 902     }
 903
 904     if (inf_zero) {
 905         s->float_exception_flags |= float_flag_invalid;
 906         a.cls = float_class_dnan;
 907         return a;
 908     }
 909
 910     if (flags & float_muladd_negate_c) {
 911         c.sign ^= 1;
 912     }
 913
 914     p_sign = a.sign ^ b.sign;
 915
 916     if (flags & float_muladd_negate_product) {
 917         p_sign ^= 1;
 918     }
 919
 920     if (a.cls == float_class_inf || b.cls == float_class_inf) {
 921         p_class = float_class_inf;
 922     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
 923         p_class = float_class_zero;
 924     } else {
 925         p_class = float_class_normal;
 926     }
 927
 928     if (c.cls == float_class_inf) {
 929         if (p_class == float_class_inf && p_sign != c.sign) {
 930             s->float_exception_flags |= float_flag_invalid;
 931             a.cls = float_class_dnan;
 932         } else {
 933             a.cls = float_class_inf;
 934             a.sign = c.sign ^ sign_flip;
 935         }
 936         return a;
 937     }
 938
 939     if (p_class == float_class_inf) {
 940         a.cls = float_class_inf;
 941         a.sign = p_sign ^ sign_flip;
 942         return a;
 943     }
 944
 945     if (p_class == float_class_zero) {
 946         if (c.cls == float_class_zero) {
 947             if (p_sign != c.sign) {
 948                 p_sign = s->float_rounding_mode == float_round_down;
 949             }
 950             c.sign = p_sign;
 951         } else if (flags & float_muladd_halve_result) {
 952             c.exp -= 1;
 953         }
 954         c.sign ^= sign_flip;
 955         return c;
 956     }
 957
 958     /* a & b should be normals now... */
 959     assert(a.cls == float_class_normal &&
 960            b.cls == float_class_normal);
 961
 962     p_exp = a.exp + b.exp;
 963
 964     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
 965      * result.
 966      */
 967     mul64To128(a.frac, b.frac, &hi, &lo);
 968     /* binary point now at bit 124 */
 969
 970     /* check for overflow */
 971     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
 972         shift128RightJamming(hi, lo, 1, &hi, &lo);
 973         p_exp += 1;
 974     }
 975
 976     /* + add/sub */
 977     if (c.cls == float_class_zero) {
 978         /* move binary point back to 62 */
 979         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 980     } else {
 981         int exp_diff = p_exp - c.exp;
 982         if (p_sign == c.sign) {
 983             /* Addition */
 984             if (exp_diff <= 0) {
 985                 shift128RightJamming(hi, lo,
 986                                      DECOMPOSED_BINARY_POINT - exp_diff,
 987                                      &hi, &lo);
 988                 lo += c.frac;
 989                 p_exp = c.exp;
 990             } else {
 991                 uint64_t c_hi, c_lo;
 992                 /* shift c to the same binary point as the product (124) */
 993                 c_hi = c.frac >> 2;
 994                 c_lo = 0;
 995                 shift128RightJamming(c_hi, c_lo,
 996                                      exp_diff,
 997                                      &c_hi, &c_lo);
 998                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
 999                 /* move binary point back to 62 */
1000                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1001             }
1002
1003             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004                 shift64RightJamming(lo, 1, &lo);
1005                 p_exp += 1;
1006             }
1007
1008         } else {
1009             /* Subtraction */
1010             uint64_t c_hi, c_lo;
1011             /* make C binary point match product at bit 124 */
1012             c_hi = c.frac >> 2;
1013             c_lo = 0;
1014
1015             if (exp_diff <= 0) {
1016                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017                 if (exp_diff == 0
1018                     &&
1019                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021                 } else {
1022                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023                     p_sign ^= 1;
1024                     p_exp = c.exp;
1025                 }
1026             } else {
1027                 shift128RightJamming(c_hi, c_lo,
1028                                      exp_diff,
1029                                      &c_hi, &c_lo);
1030                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1031             }
1032
1033             if (hi == 0 && lo == 0) {
1034                 a.cls = float_class_zero;
1035                 a.sign = s->float_rounding_mode == float_round_down;
1036                 a.sign ^= sign_flip;
1037                 return a;
1038             } else {
1039                 int shift;
1040                 if (hi != 0) {
1041                     shift = clz64(hi);
1042                 } else {
1043                     shift = clz64(lo) + 64;
1044                 }
1045                 /* Normalizing to a binary point of 124 is the
1046                    correct adjust for the exponent.  However since we're
1047                    shifting, we might as well put the binary point back
1048                    at 62 where we really want it.  Therefore shift as
1049                    if we're leaving 1 bit at the top of the word, but
1050                    adjust the exponent as if we're leaving 3 bits.  */
1051                 shift -= 1;
1052                 if (shift >= 64) {
1053                     lo = lo << (shift - 64);
1054                 } else {
1055                     hi = (hi << shift) | (lo >> (64 - shift));
1056                     lo = hi | ((lo << shift) != 0);
1057                 }
1058                 p_exp -= shift - 2;
1059             }
1060         }
1061     }
1062
1063     if (flags & float_muladd_halve_result) {
1064         p_exp -= 1;
1065     }
1066
1067     /* finally prepare our result */
1068     a.cls = float_class_normal;
1069     a.sign = p_sign ^ sign_flip;
1070     a.exp = p_exp;
1071     a.frac = lo;
1072
1073     return a;
1074 }
1075
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077                                                 int flags, float_status *status)
1078 {
1079     FloatParts pa = float16_unpack_canonical(a, status);
1080     FloatParts pb = float16_unpack_canonical(b, status);
1081     FloatParts pc = float16_unpack_canonical(c, status);
1082     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1083
1084     return float16_round_pack_canonical(pr, status);
1085 }
1086
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088                                                 int flags, float_status *status)
1089 {
1090     FloatParts pa = float32_unpack_canonical(a, status);
1091     FloatParts pb = float32_unpack_canonical(b, status);
1092     FloatParts pc = float32_unpack_canonical(c, status);
1093     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1094
1095     return float32_round_pack_canonical(pr, status);
1096 }
1097
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099                                                 int flags, float_status *status)
1100 {
1101     FloatParts pa = float64_unpack_canonical(a, status);
1102     FloatParts pb = float64_unpack_canonical(b, status);
1103     FloatParts pc = float64_unpack_canonical(c, status);
1104     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1105
1106     return float64_round_pack_canonical(pr, status);
1107 }
1108
1109 /*
1110  * Returns the result of dividing the floating-point value `a' by the
1111  * corresponding value `b'. The operation is performed according to
1112  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113  */
1114
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1116 {
1117     bool sign = a.sign ^ b.sign;
1118
1119     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120         uint64_t temp_lo, temp_hi;
1121         int exp = a.exp - b.exp;
1122         if (a.frac < b.frac) {
1123             exp -= 1;
1124             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125                               &temp_hi, &temp_lo);
1126         } else {
1127             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128                               &temp_hi, &temp_lo);
1129         }
1130         /* LSB of quot is set if inexact which roundandpack will use
1131          * to set flags. Yet again we re-use a for the result */
1132         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133         a.sign = sign;
1134         a.exp = exp;
1135         return a;
1136     }
1137     /* handle all the NaN cases */
1138     if (is_nan(a.cls) || is_nan(b.cls)) {
1139         return pick_nan(a, b, s);
1140     }
1141     /* 0/0 or Inf/Inf */
1142     if (a.cls == b.cls
1143         &&
1144         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145         s->float_exception_flags |= float_flag_invalid;
1146         a.cls = float_class_dnan;
1147         return a;
1148     }
1149     /* Div 0 => Inf */
1150     if (b.cls == float_class_zero) {
1151         s->float_exception_flags |= float_flag_divbyzero;
1152         a.cls = float_class_inf;
1153         a.sign = sign;
1154         return a;
1155     }
1156     /* Inf / x or 0 / x */
1157     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158         a.sign = sign;
1159         return a;
1160     }
1161     /* Div by Inf */
1162     if (b.cls == float_class_inf) {
1163         a.cls = float_class_zero;
1164         a.sign = sign;
1165         return a;
1166     }
1167     g_assert_not_reached();
1168 }
1169
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1171 {
1172     FloatParts pa = float16_unpack_canonical(a, status);
1173     FloatParts pb = float16_unpack_canonical(b, status);
1174     FloatParts pr = div_floats(pa, pb, status);
1175
1176     return float16_round_pack_canonical(pr, status);
1177 }
1178
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1180 {
1181     FloatParts pa = float32_unpack_canonical(a, status);
1182     FloatParts pb = float32_unpack_canonical(b, status);
1183     FloatParts pr = div_floats(pa, pb, status);
1184
1185     return float32_round_pack_canonical(pr, status);
1186 }
1187
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1189 {
1190     FloatParts pa = float64_unpack_canonical(a, status);
1191     FloatParts pb = float64_unpack_canonical(b, status);
1192     FloatParts pr = div_floats(pa, pb, status);
1193
1194     return float64_round_pack_canonical(pr, status);
1195 }
1196
1197 /*
1198  * Rounds the floating-point value `a' to an integer, and returns the
1199  * result as a floating-point value. The operation is performed
1200  * according to the IEC/IEEE Standard for Binary Floating-Point
1201  * Arithmetic.
1202  */
1203
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1205 {
1206     if (is_nan(a.cls)) {
1207         return return_nan(a, s);
1208     }
1209
1210     switch (a.cls) {
1211     case float_class_zero:
1212     case float_class_inf:
1213     case float_class_qnan:
1214         /* already "integral" */
1215         break;
1216     case float_class_normal:
1217         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218             /* already integral */
1219             break;
1220         }
1221         if (a.exp < 0) {
1222             bool one;
1223             /* all fractional */
1224             s->float_exception_flags |= float_flag_inexact;
1225             switch (rounding_mode) {
1226             case float_round_nearest_even:
1227                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228                 break;
1229             case float_round_ties_away:
1230                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231                 break;
1232             case float_round_to_zero:
1233                 one = false;
1234                 break;
1235             case float_round_up:
1236                 one = !a.sign;
1237                 break;
1238             case float_round_down:
1239                 one = a.sign;
1240                 break;
1241             default:
1242                 g_assert_not_reached();
1243             }
1244
1245             if (one) {
1246                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247                 a.exp = 0;
1248             } else {
1249                 a.cls = float_class_zero;
1250             }
1251         } else {
1252             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253             uint64_t frac_lsbm1 = frac_lsb >> 1;
1254             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255             uint64_t rnd_mask = rnd_even_mask >> 1;
1256             uint64_t inc;
1257
1258             switch (rounding_mode) {
1259             case float_round_nearest_even:
1260                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261                 break;
1262             case float_round_ties_away:
1263                 inc = frac_lsbm1;
1264                 break;
1265             case float_round_to_zero:
1266                 inc = 0;
1267                 break;
1268             case float_round_up:
1269                 inc = a.sign ? 0 : rnd_mask;
1270                 break;
1271             case float_round_down:
1272                 inc = a.sign ? rnd_mask : 0;
1273                 break;
1274             default:
1275                 g_assert_not_reached();
1276             }
1277
1278             if (a.frac & rnd_mask) {
1279                 s->float_exception_flags |= float_flag_inexact;
1280                 a.frac += inc;
1281                 a.frac &= ~rnd_mask;
1282                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283                     a.frac >>= 1;
1284                     a.exp++;
1285                 }
1286             }
1287         }
1288         break;
1289     default:
1290         g_assert_not_reached();
1291     }
1292     return a;
1293 }
1294
1295 float16 float16_round_to_int(float16 a, float_status *s)
1296 {
1297     FloatParts pa = float16_unpack_canonical(a, s);
1298     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299     return float16_round_pack_canonical(pr, s);
1300 }
1301
1302 float32 float32_round_to_int(float32 a, float_status *s)
1303 {
1304     FloatParts pa = float32_unpack_canonical(a, s);
1305     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306     return float32_round_pack_canonical(pr, s);
1307 }
1308
1309 float64 float64_round_to_int(float64 a, float_status *s)
1310 {
1311     FloatParts pa = float64_unpack_canonical(a, s);
1312     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313     return float64_round_pack_canonical(pr, s);
1314 }
1315
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1317 {
1318     FloatParts pa = float64_unpack_canonical(a, s);
1319     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320     return float64_round_pack_canonical(pr, s);
1321 }
1322
1323 /*
1324  * Returns the result of converting the floating-point value `a' to
1325  * the two's complement integer format. The conversion is performed
1326  * according to the IEC/IEEE Standard for Binary Floating-Point
1327  * Arithmetic---which means in particular that the conversion is
1328  * rounded according to the current rounding mode. If `a' is a NaN,
1329  * the largest positive integer is returned. Otherwise, if the
1330  * conversion overflows, the largest integer with the same sign as `a'
1331  * is returned.
1332 */
1333
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335                                      int64_t min, int64_t max,
1336                                      float_status *s)
1337 {
1338     uint64_t r;
1339     int orig_flags = get_float_exception_flags(s);
1340     FloatParts p = round_to_int(in, rmode, s);
1341
1342     switch (p.cls) {
1343     case float_class_snan:
1344     case float_class_qnan:
1345         return max;
1346     case float_class_inf:
1347         return p.sign ? min : max;
1348     case float_class_zero:
1349         return 0;
1350     case float_class_normal:
1351         if (p.exp < DECOMPOSED_BINARY_POINT) {
1352             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1353         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1354             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1355         } else {
1356             r = UINT64_MAX;
1357         }
1358         if (p.sign) {
1359             if (r < -(uint64_t) min) {
1360                 return -r;
1361             } else {
1362                 s->float_exception_flags = orig_flags | float_flag_invalid;
1363                 return min;
1364             }
1365         } else {
1366             if (r < max) {
1367                 return r;
1368             } else {
1369                 s->float_exception_flags = orig_flags | float_flag_invalid;
1370                 return max;
1371             }
1372         }
1373     default:
1374         g_assert_not_reached();
1375     }
1376 }
1377
1378 #define FLOAT_TO_INT(fsz, isz)                                          \
1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1380                                                 float_status *s)        \
1381 {                                                                       \
1382     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1383     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1384                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1385                                  s);                                    \
1386 }                                                                       \
1387                                                                         \
1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1389  (float ## fsz a, float_status *s)                                      \
1390 {                                                                       \
1391     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1392     return round_to_int_and_pack(p, float_round_to_zero,                \
1393                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1394                                  s);                                    \
1395 }
1396
1397 FLOAT_TO_INT(16, 16)
1398 FLOAT_TO_INT(16, 32)
1399 FLOAT_TO_INT(16, 64)
1400
1401 FLOAT_TO_INT(32, 16)
1402 FLOAT_TO_INT(32, 32)
1403 FLOAT_TO_INT(32, 64)
1404
1405 FLOAT_TO_INT(64, 16)
1406 FLOAT_TO_INT(64, 32)
1407 FLOAT_TO_INT(64, 64)
1408
1409 #undef FLOAT_TO_INT
1410
1411 /*
1412  *  Returns the result of converting the floating-point value `a' to
1413  *  the unsigned integer format. The conversion is performed according
1414  *  to the IEC/IEEE Standard for Binary Floating-Point
1415  *  Arithmetic---which means in particular that the conversion is
1416  *  rounded according to the current rounding mode. If `a' is a NaN,
1417  *  the largest unsigned integer is returned. Otherwise, if the
1418  *  conversion overflows, the largest unsigned integer is returned. If
1419  *  the 'a' is negative, the result is rounded and zero is returned;
1420  *  values that do not round to zero will raise the inexact exception
1421  *  flag.
1422  */
1423
1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1425                                        float_status *s)
1426 {
1427     int orig_flags = get_float_exception_flags(s);
1428     FloatParts p = round_to_int(in, rmode, s);
1429
1430     switch (p.cls) {
1431     case float_class_snan:
1432     case float_class_qnan:
1433         s->float_exception_flags = orig_flags | float_flag_invalid;
1434         return max;
1435     case float_class_inf:
1436         return p.sign ? 0 : max;
1437     case float_class_zero:
1438         return 0;
1439     case float_class_normal:
1440     {
1441         uint64_t r;
1442         if (p.sign) {
1443             s->float_exception_flags = orig_flags | float_flag_invalid;
1444             return 0;
1445         }
1446
1447         if (p.exp < DECOMPOSED_BINARY_POINT) {
1448             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1449         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1450             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1451         } else {
1452             s->float_exception_flags = orig_flags | float_flag_invalid;
1453             return max;
1454         }
1455
1456         /* For uint64 this will never trip, but if p.exp is too large
1457          * to shift a decomposed fraction we shall have exited via the
1458          * 3rd leg above.
1459          */
1460         if (r > max) {
1461             s->float_exception_flags = orig_flags | float_flag_invalid;
1462             return max;
1463         } else {
1464             return r;
1465         }
1466     }
1467     default:
1468         g_assert_not_reached();
1469     }
1470 }
1471
1472 #define FLOAT_TO_UINT(fsz, isz) \
1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1474                                                   float_status *s)      \
1475 {                                                                       \
1476     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1477     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1478                                  UINT ## isz ## _MAX, s);               \
1479 }                                                                       \
1480                                                                         \
1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1482  (float ## fsz a, float_status *s)                                      \
1483 {                                                                       \
1484     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1485     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1486                                  UINT ## isz ## _MAX, s);               \
1487 }
1488
1489 FLOAT_TO_UINT(16, 16)
1490 FLOAT_TO_UINT(16, 32)
1491 FLOAT_TO_UINT(16, 64)
1492
1493 FLOAT_TO_UINT(32, 16)
1494 FLOAT_TO_UINT(32, 32)
1495 FLOAT_TO_UINT(32, 64)
1496
1497 FLOAT_TO_UINT(64, 16)
1498 FLOAT_TO_UINT(64, 32)
1499 FLOAT_TO_UINT(64, 64)
1500
1501 #undef FLOAT_TO_UINT
1502
1503 /*
1504  * Integer to float conversions
1505  *
1506  * Returns the result of converting the two's complement integer `a'
1507  * to the floating-point format. The conversion is performed according
1508  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1509  */
1510
1511 static FloatParts int_to_float(int64_t a, float_status *status)
1512 {
1513     FloatParts r;
1514     if (a == 0) {
1515         r.cls = float_class_zero;
1516         r.sign = false;
1517     } else if (a == (1ULL << 63)) {
1518         r.cls = float_class_normal;
1519         r.sign = true;
1520         r.frac = DECOMPOSED_IMPLICIT_BIT;
1521         r.exp = 63;
1522     } else {
1523         uint64_t f;
1524         if (a < 0) {
1525             f = -a;
1526             r.sign = true;
1527         } else {
1528             f = a;
1529             r.sign = false;
1530         }
1531         int shift = clz64(f) - 1;
1532         r.cls = float_class_normal;
1533         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1534         r.frac = f << shift;
1535     }
1536
1537     return r;
1538 }
1539
1540 float16 int64_to_float16(int64_t a, float_status *status)
1541 {
1542     FloatParts pa = int_to_float(a, status);
1543     return float16_round_pack_canonical(pa, status);
1544 }
1545
1546 float16 int32_to_float16(int32_t a, float_status *status)
1547 {
1548     return int64_to_float16(a, status);
1549 }
1550
1551 float16 int16_to_float16(int16_t a, float_status *status)
1552 {
1553     return int64_to_float16(a, status);
1554 }
1555
1556 float32 int64_to_float32(int64_t a, float_status *status)
1557 {
1558     FloatParts pa = int_to_float(a, status);
1559     return float32_round_pack_canonical(pa, status);
1560 }
1561
1562 float32 int32_to_float32(int32_t a, float_status *status)
1563 {
1564     return int64_to_float32(a, status);
1565 }
1566
1567 float32 int16_to_float32(int16_t a, float_status *status)
1568 {
1569     return int64_to_float32(a, status);
1570 }
1571
1572 float64 int64_to_float64(int64_t a, float_status *status)
1573 {
1574     FloatParts pa = int_to_float(a, status);
1575     return float64_round_pack_canonical(pa, status);
1576 }
1577
1578 float64 int32_to_float64(int32_t a, float_status *status)
1579 {
1580     return int64_to_float64(a, status);
1581 }
1582
1583 float64 int16_to_float64(int16_t a, float_status *status)
1584 {
1585     return int64_to_float64(a, status);
1586 }
1587
1588
1589 /*
1590  * Unsigned Integer to float conversions
1591  *
1592  * Returns the result of converting the unsigned integer `a' to the
1593  * floating-point format. The conversion is performed according to the
1594  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1595  */
1596
1597 static FloatParts uint_to_float(uint64_t a, float_status *status)
1598 {
1599     FloatParts r = { .sign = false};
1600
1601     if (a == 0) {
1602         r.cls = float_class_zero;
1603     } else {
1604         int spare_bits = clz64(a) - 1;
1605         r.cls = float_class_normal;
1606         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1607         if (spare_bits < 0) {
1608             shift64RightJamming(a, -spare_bits, &a);
1609             r.frac = a;
1610         } else {
1611             r.frac = a << spare_bits;
1612         }
1613     }
1614
1615     return r;
1616 }
1617
1618 float16 uint64_to_float16(uint64_t a, float_status *status)
1619 {
1620     FloatParts pa = uint_to_float(a, status);
1621     return float16_round_pack_canonical(pa, status);
1622 }
1623
1624 float16 uint32_to_float16(uint32_t a, float_status *status)
1625 {
1626     return uint64_to_float16(a, status);
1627 }
1628
1629 float16 uint16_to_float16(uint16_t a, float_status *status)
1630 {
1631     return uint64_to_float16(a, status);
1632 }
1633
1634 float32 uint64_to_float32(uint64_t a, float_status *status)
1635 {
1636     FloatParts pa = uint_to_float(a, status);
1637     return float32_round_pack_canonical(pa, status);
1638 }
1639
1640 float32 uint32_to_float32(uint32_t a, float_status *status)
1641 {
1642     return uint64_to_float32(a, status);
1643 }
1644
1645 float32 uint16_to_float32(uint16_t a, float_status *status)
1646 {
1647     return uint64_to_float32(a, status);
1648 }
1649
1650 float64 uint64_to_float64(uint64_t a, float_status *status)
1651 {
1652     FloatParts pa = uint_to_float(a, status);
1653     return float64_round_pack_canonical(pa, status);
1654 }
1655
1656 float64 uint32_to_float64(uint32_t a, float_status *status)
1657 {
1658     return uint64_to_float64(a, status);
1659 }
1660
1661 float64 uint16_to_float64(uint16_t a, float_status *status)
1662 {
1663     return uint64_to_float64(a, status);
1664 }
1665
1666 /* Float Min/Max */
1667 /* min() and max() functions. These can't be implemented as
1668  * 'compare and pick one input' because that would mishandle
1669  * NaNs and +0 vs -0.
1670  *
1671  * minnum() and maxnum() functions. These are similar to the min()
1672  * and max() functions but if one of the arguments is a QNaN and
1673  * the other is numerical then the numerical argument is returned.
1674  * SNaNs will get quietened before being returned.
1675  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1676  * and maxNum() operations. min() and max() are the typical min/max
1677  * semantics provided by many CPUs which predate that specification.
1678  *
1679  * minnummag() and maxnummag() functions correspond to minNumMag()
1680  * and minNumMag() from the IEEE-754 2008.
1681  */
1682 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1683                                 bool ieee, bool ismag, float_status *s)
1684 {
1685     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1686         if (ieee) {
1687             /* Takes two floating-point values `a' and `b', one of
1688              * which is a NaN, and returns the appropriate NaN
1689              * result. If either `a' or `b' is a signaling NaN,
1690              * the invalid exception is raised.
1691              */
1692             if (is_snan(a.cls) || is_snan(b.cls)) {
1693                 return pick_nan(a, b, s);
1694             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1695                 return b;
1696             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1697                 return a;
1698             }
1699         }
1700         return pick_nan(a, b, s);
1701     } else {
1702         int a_exp, b_exp;
1703         bool a_sign, b_sign;
1704
1705         switch (a.cls) {
1706         case float_class_normal:
1707             a_exp = a.exp;
1708             break;
1709         case float_class_inf:
1710             a_exp = INT_MAX;
1711             break;
1712         case float_class_zero:
1713             a_exp = INT_MIN;
1714             break;
1715         default:
1716             g_assert_not_reached();
1717             break;
1718         }
1719         switch (b.cls) {
1720         case float_class_normal:
1721             b_exp = b.exp;
1722             break;
1723         case float_class_inf:
1724             b_exp = INT_MAX;
1725             break;
1726         case float_class_zero:
1727             b_exp = INT_MIN;
1728             break;
1729         default:
1730             g_assert_not_reached();
1731             break;
1732         }
1733
1734         a_sign = a.sign;
1735         b_sign = b.sign;
1736         if (ismag) {
1737             a_sign = b_sign = 0;
1738         }
1739
1740         if (a_sign == b_sign) {
1741             bool a_less = a_exp < b_exp;
1742             if (a_exp == b_exp) {
1743                 a_less = a.frac < b.frac;
1744             }
1745             return a_sign ^ a_less ^ ismin ? b : a;
1746         } else {
1747             return a_sign ^ ismin ? b : a;
1748         }
1749     }
1750 }
1751
1752 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1754                                      float_status *s)                   \
1755 {                                                                       \
1756     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1757     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1758     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1759                                                                         \
1760     return float ## sz ## _round_pack_canonical(pr, s);                 \
1761 }
1762
1763 MINMAX(16, min, true, false, false)
1764 MINMAX(16, minnum, true, true, false)
1765 MINMAX(16, minnummag, true, true, true)
1766 MINMAX(16, max, false, false, false)
1767 MINMAX(16, maxnum, false, true, false)
1768 MINMAX(16, maxnummag, false, true, true)
1769
1770 MINMAX(32, min, true, false, false)
1771 MINMAX(32, minnum, true, true, false)
1772 MINMAX(32, minnummag, true, true, true)
1773 MINMAX(32, max, false, false, false)
1774 MINMAX(32, maxnum, false, true, false)
1775 MINMAX(32, maxnummag, false, true, true)
1776
1777 MINMAX(64, min, true, false, false)
1778 MINMAX(64, minnum, true, true, false)
1779 MINMAX(64, minnummag, true, true, true)
1780 MINMAX(64, max, false, false, false)
1781 MINMAX(64, maxnum, false, true, false)
1782 MINMAX(64, maxnummag, false, true, true)
1783
1784 #undef MINMAX
1785
1786 /* Floating point compare */
1787 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1788                           float_status *s)
1789 {
1790     if (is_nan(a.cls) || is_nan(b.cls)) {
1791         if (!is_quiet ||
1792             a.cls == float_class_snan ||
1793             b.cls == float_class_snan) {
1794             s->float_exception_flags |= float_flag_invalid;
1795         }
1796         return float_relation_unordered;
1797     }
1798
1799     if (a.cls == float_class_zero) {
1800         if (b.cls == float_class_zero) {
1801             return float_relation_equal;
1802         }
1803         return b.sign ? float_relation_greater : float_relation_less;
1804     } else if (b.cls == float_class_zero) {
1805         return a.sign ? float_relation_less : float_relation_greater;
1806     }
1807
1808     /* The only really important thing about infinity is its sign. If
1809      * both are infinities the sign marks the smallest of the two.
1810      */
1811     if (a.cls == float_class_inf) {
1812         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1813             return float_relation_equal;
1814         }
1815         return a.sign ? float_relation_less : float_relation_greater;
1816     } else if (b.cls == float_class_inf) {
1817         return b.sign ? float_relation_greater : float_relation_less;
1818     }
1819
1820     if (a.sign != b.sign) {
1821         return a.sign ? float_relation_less : float_relation_greater;
1822     }
1823
1824     if (a.exp == b.exp) {
1825         if (a.frac == b.frac) {
1826             return float_relation_equal;
1827         }
1828         if (a.sign) {
1829             return a.frac > b.frac ?
1830                 float_relation_less : float_relation_greater;
1831         } else {
1832             return a.frac > b.frac ?
1833                 float_relation_greater : float_relation_less;
1834         }
1835     } else {
1836         if (a.sign) {
1837             return a.exp > b.exp ? float_relation_less : float_relation_greater;
1838         } else {
1839             return a.exp > b.exp ? float_relation_greater : float_relation_less;
1840         }
1841     }
1842 }
1843
1844 #define COMPARE(sz)                                                     \
1845 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
1846                             float_status *s)                            \
1847 {                                                                       \
1848     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1849     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1850     return compare_floats(pa, pb, false, s);                            \
1851 }                                                                       \
1852 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
1853                                   float_status *s)                      \
1854 {                                                                       \
1855     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1856     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1857     return compare_floats(pa, pb, true, s);                             \
1858 }
1859
1860 COMPARE(16)
1861 COMPARE(32)
1862 COMPARE(64)
1863
1864 #undef COMPARE
1865
1866 /* Multiply A by 2 raised to the power N.  */
1867 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1868 {
1869     if (unlikely(is_nan(a.cls))) {
1870         return return_nan(a, s);
1871     }
1872     if (a.cls == float_class_normal) {
1873         a.exp += n;
1874     }
1875     return a;
1876 }
1877
1878 float16 float16_scalbn(float16 a, int n, float_status *status)
1879 {
1880     FloatParts pa = float16_unpack_canonical(a, status);
1881     FloatParts pr = scalbn_decomposed(pa, n, status);
1882     return float16_round_pack_canonical(pr, status);
1883 }
1884
1885 float32 float32_scalbn(float32 a, int n, float_status *status)
1886 {
1887     FloatParts pa = float32_unpack_canonical(a, status);
1888     FloatParts pr = scalbn_decomposed(pa, n, status);
1889     return float32_round_pack_canonical(pr, status);
1890 }
1891
1892 float64 float64_scalbn(float64 a, int n, float_status *status)
1893 {
1894     FloatParts pa = float64_unpack_canonical(a, status);
1895     FloatParts pr = scalbn_decomposed(pa, n, status);
1896     return float64_round_pack_canonical(pr, status);
1897 }
1898
1899 /*
1900  * Square Root
1901  *
1902  * The old softfloat code did an approximation step before zeroing in
1903  * on the final result. However for simpleness we just compute the
1904  * square root by iterating down from the implicit bit to enough extra
1905  * bits to ensure we get a correctly rounded result.
1906  *
1907  * This does mean however the calculation is slower than before,
1908  * especially for 64 bit floats.
1909  */
1910
1911 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1912 {
1913     uint64_t a_frac, r_frac, s_frac;
1914     int bit, last_bit;
1915
1916     if (is_nan(a.cls)) {
1917         return return_nan(a, s);
1918     }
1919     if (a.cls == float_class_zero) {
1920         return a;  /* sqrt(+-0) = +-0 */
1921     }
1922     if (a.sign) {
1923         s->float_exception_flags |= float_flag_invalid;
1924         a.cls = float_class_dnan;
1925         return a;
1926     }
1927     if (a.cls == float_class_inf) {
1928         return a;  /* sqrt(+inf) = +inf */
1929     }
1930
1931     assert(a.cls == float_class_normal);
1932
1933     /* We need two overflow bits at the top. Adding room for that is a
1934      * right shift. If the exponent is odd, we can discard the low bit
1935      * by multiplying the fraction by 2; that's a left shift. Combine
1936      * those and we shift right if the exponent is even.
1937      */
1938     a_frac = a.frac;
1939     if (!(a.exp & 1)) {
1940         a_frac >>= 1;
1941     }
1942     a.exp >>= 1;
1943
1944     /* Bit-by-bit computation of sqrt.  */
1945     r_frac = 0;
1946     s_frac = 0;
1947
1948     /* Iterate from implicit bit down to the 3 extra bits to compute a
1949      * properly rounded result. Remember we've inserted one more bit
1950      * at the top, so these positions are one less.
1951      */
1952     bit = DECOMPOSED_BINARY_POINT - 1;
1953     last_bit = MAX(p->frac_shift - 4, 0);
1954     do {
1955         uint64_t q = 1ULL << bit;
1956         uint64_t t_frac = s_frac + q;
1957         if (t_frac <= a_frac) {
1958             s_frac = t_frac + q;
1959             a_frac -= t_frac;
1960             r_frac += q;
1961         }
1962         a_frac <<= 1;
1963     } while (--bit >= last_bit);
1964
1965     /* Undo the right shift done above. If there is any remaining
1966      * fraction, the result is inexact. Set the sticky bit.
1967      */
1968     a.frac = (r_frac << 1) + (a_frac != 0);
1969
1970     return a;
1971 }
1972
1973 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1974 {
1975     FloatParts pa = float16_unpack_canonical(a, status);
1976     FloatParts pr = sqrt_float(pa, status, &float16_params);
1977     return float16_round_pack_canonical(pr, status);
1978 }
1979
1980 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1981 {
1982     FloatParts pa = float32_unpack_canonical(a, status);
1983     FloatParts pr = sqrt_float(pa, status, &float32_params);
1984     return float32_round_pack_canonical(pr, status);
1985 }
1986
1987 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
1988 {
1989     FloatParts pa = float64_unpack_canonical(a, status);
1990     FloatParts pr = sqrt_float(pa, status, &float64_params);
1991     return float64_round_pack_canonical(pr, status);
1992 }
1993
1994
1995 /*----------------------------------------------------------------------------
1996 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1997 | and 7, and returns the properly rounded 32-bit integer corresponding to the
1998 | input.  If `zSign' is 1, the input is negated before being converted to an
1999 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2000 | is simply rounded to an integer, with the inexact exception raised if the
2001 | input cannot be represented exactly as an integer.  However, if the fixed-
2002 | point input is too large, the invalid exception is raised and the largest
2003 | positive or negative integer is returned.
2004 *----------------------------------------------------------------------------*/
2005
2006 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2007 {
2008     int8_t roundingMode;
2009     flag roundNearestEven;
2010     int8_t roundIncrement, roundBits;
2011     int32_t z;
2012
2013     roundingMode = status->float_rounding_mode;
2014     roundNearestEven = ( roundingMode == float_round_nearest_even );
2015     switch (roundingMode) {
2016     case float_round_nearest_even:
2017     case float_round_ties_away:
2018         roundIncrement = 0x40;
2019         break;
2020     case float_round_to_zero:
2021         roundIncrement = 0;
2022         break;
2023     case float_round_up:
2024         roundIncrement = zSign ? 0 : 0x7f;
2025         break;
2026     case float_round_down:
2027         roundIncrement = zSign ? 0x7f : 0;
2028         break;
2029     default:
2030         abort();
2031     }
2032     roundBits = absZ & 0x7F;
2033     absZ = ( absZ + roundIncrement )>>7;
2034     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2035     z = absZ;
2036     if ( zSign ) z = - z;
2037     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2038         float_raise(float_flag_invalid, status);
2039         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2040     }
2041     if (roundBits) {
2042         status->float_exception_flags |= float_flag_inexact;
2043     }
2044     return z;
2045
2046 }
2047
2048 /*----------------------------------------------------------------------------
2049 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2050 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2051 | and returns the properly rounded 64-bit integer corresponding to the input.
2052 | If `zSign' is 1, the input is negated before being converted to an integer.
2053 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2054 | the inexact exception raised if the input cannot be represented exactly as
2055 | an integer.  However, if the fixed-point input is too large, the invalid
2056 | exception is raised and the largest positive or negative integer is
2057 | returned.
2058 *----------------------------------------------------------------------------*/
2059
2060 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2061                                float_status *status)
2062 {
2063     int8_t roundingMode;
2064     flag roundNearestEven, increment;
2065     int64_t z;
2066
2067     roundingMode = status->float_rounding_mode;
2068     roundNearestEven = ( roundingMode == float_round_nearest_even );
2069     switch (roundingMode) {
2070     case float_round_nearest_even:
2071     case float_round_ties_away:
2072         increment = ((int64_t) absZ1 < 0);
2073         break;
2074     case float_round_to_zero:
2075         increment = 0;
2076         break;
2077     case float_round_up:
2078         increment = !zSign && absZ1;
2079         break;
2080     case float_round_down:
2081         increment = zSign && absZ1;
2082         break;
2083     default:
2084         abort();
2085     }
2086     if ( increment ) {
2087         ++absZ0;
2088         if ( absZ0 == 0 ) goto overflow;
2089         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2090     }
2091     z = absZ0;
2092     if ( zSign ) z = - z;
2093     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2094  overflow:
2095         float_raise(float_flag_invalid, status);
2096         return
2097               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2098             : LIT64( 0x7FFFFFFFFFFFFFFF );
2099     }
2100     if (absZ1) {
2101         status->float_exception_flags |= float_flag_inexact;
2102     }
2103     return z;
2104
2105 }
2106
2107 /*----------------------------------------------------------------------------
2108 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2109 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2110 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2111 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2112 | with the inexact exception raised if the input cannot be represented exactly
2113 | as an integer.  However, if the fixed-point input is too large, the invalid
2114 | exception is raised and the largest unsigned integer is returned.
2115 *----------------------------------------------------------------------------*/
2116
2117 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2118                                 uint64_t absZ1, float_status *status)
2119 {
2120     int8_t roundingMode;
2121     flag roundNearestEven, increment;
2122
2123     roundingMode = status->float_rounding_mode;
2124     roundNearestEven = (roundingMode == float_round_nearest_even);
2125     switch (roundingMode) {
2126     case float_round_nearest_even:
2127     case float_round_ties_away:
2128         increment = ((int64_t)absZ1 < 0);
2129         break;
2130     case float_round_to_zero:
2131         increment = 0;
2132         break;
2133     case float_round_up:
2134         increment = !zSign && absZ1;
2135         break;
2136     case float_round_down:
2137         increment = zSign && absZ1;
2138         break;
2139     default:
2140         abort();
2141     }
2142     if (increment) {
2143         ++absZ0;
2144         if (absZ0 == 0) {
2145             float_raise(float_flag_invalid, status);
2146             return LIT64(0xFFFFFFFFFFFFFFFF);
2147         }
2148         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2149     }
2150
2151     if (zSign && absZ0) {
2152         float_raise(float_flag_invalid, status);
2153         return 0;
2154     }
2155
2156     if (absZ1) {
2157         status->float_exception_flags |= float_flag_inexact;
2158     }
2159     return absZ0;
2160 }
2161
2162 /*----------------------------------------------------------------------------
2163 | If `a' is denormal and we are in flush-to-zero mode then set the
2164 | input-denormal exception and return zero. Otherwise just return the value.
2165 *----------------------------------------------------------------------------*/
2166 float32 float32_squash_input_denormal(float32 a, float_status *status)
2167 {
2168     if (status->flush_inputs_to_zero) {
2169         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2170             float_raise(float_flag_input_denormal, status);
2171             return make_float32(float32_val(a) & 0x80000000);
2172         }
2173     }
2174     return a;
2175 }
2176
2177 /*----------------------------------------------------------------------------
2178 | Normalizes the subnormal single-precision floating-point value represented
2179 | by the denormalized significand `aSig'.  The normalized exponent and
2180 | significand are stored at the locations pointed to by `zExpPtr' and
2181 | `zSigPtr', respectively.
2182 *----------------------------------------------------------------------------*/
2183
2184 static void
2185  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2186 {
2187     int8_t shiftCount;
2188
2189     shiftCount = countLeadingZeros32( aSig ) - 8;
2190     *zSigPtr = aSig<<shiftCount;
2191     *zExpPtr = 1 - shiftCount;
2192
2193 }
2194
2195 /*----------------------------------------------------------------------------
2196 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2197 | and significand `zSig', and returns the proper single-precision floating-
2198 | point value corresponding to the abstract input.  Ordinarily, the abstract
2199 | value is simply rounded and packed into the single-precision format, with
2200 | the inexact exception raised if the abstract input cannot be represented
2201 | exactly.  However, if the abstract value is too large, the overflow and
2202 | inexact exceptions are raised and an infinity or maximal finite value is
2203 | returned.  If the abstract value is too small, the input value is rounded to
2204 | a subnormal number, and the underflow and inexact exceptions are raised if
2205 | the abstract input cannot be represented exactly as a subnormal single-
2206 | precision floating-point number.
2207 |     The input significand `zSig' has its binary point between bits 30
2208 | and 29, which is 7 bits to the left of the usual location.  This shifted
2209 | significand must be normalized or smaller.  If `zSig' is not normalized,
2210 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2211 | and it must not require rounding.  In the usual case that `zSig' is
2212 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2213 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2214 | Binary Floating-Point Arithmetic.
2215 *----------------------------------------------------------------------------*/
2216
2217 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2218                                    float_status *status)
2219 {
2220     int8_t roundingMode;
2221     flag roundNearestEven;
2222     int8_t roundIncrement, roundBits;
2223     flag isTiny;
2224
2225     roundingMode = status->float_rounding_mode;
2226     roundNearestEven = ( roundingMode == float_round_nearest_even );
2227     switch (roundingMode) {
2228     case float_round_nearest_even:
2229     case float_round_ties_away:
2230         roundIncrement = 0x40;
2231         break;
2232     case float_round_to_zero:
2233         roundIncrement = 0;
2234         break;
2235     case float_round_up:
2236         roundIncrement = zSign ? 0 : 0x7f;
2237         break;
2238     case float_round_down:
2239         roundIncrement = zSign ? 0x7f : 0;
2240         break;
2241     default:
2242         abort();
2243         break;
2244     }
2245     roundBits = zSig & 0x7F;
2246     if ( 0xFD <= (uint16_t) zExp ) {
2247         if (    ( 0xFD < zExp )
2248              || (    ( zExp == 0xFD )
2249                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2250            ) {
2251             float_raise(float_flag_overflow | float_flag_inexact, status);
2252             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2253         }
2254         if ( zExp < 0 ) {
2255             if (status->flush_to_zero) {
2256                 float_raise(float_flag_output_denormal, status);
2257                 return packFloat32(zSign, 0, 0);
2258             }
2259             isTiny =
2260                 (status->float_detect_tininess
2261                  == float_tininess_before_rounding)
2262                 || ( zExp < -1 )
2263                 || ( zSig + roundIncrement < 0x80000000 );
2264             shift32RightJamming( zSig, - zExp, &zSig );
2265             zExp = 0;
2266             roundBits = zSig & 0x7F;
2267             if (isTiny && roundBits) {
2268                 float_raise(float_flag_underflow, status);
2269             }
2270         }
2271     }
2272     if (roundBits) {
2273         status->float_exception_flags |= float_flag_inexact;
2274     }
2275     zSig = ( zSig + roundIncrement )>>7;
2276     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2277     if ( zSig == 0 ) zExp = 0;
2278     return packFloat32( zSign, zExp, zSig );
2279
2280 }
2281
2282 /*----------------------------------------------------------------------------
2283 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2284 | and significand `zSig', and returns the proper single-precision floating-
2285 | point value corresponding to the abstract input.  This routine is just like
2286 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2287 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2288 | floating-point exponent.
2289 *----------------------------------------------------------------------------*/
2290
2291 static float32
2292  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2293                               float_status *status)
2294 {
2295     int8_t shiftCount;
2296
2297     shiftCount = countLeadingZeros32( zSig ) - 1;
2298     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2299                                status);
2300
2301 }
2302
2303 /*----------------------------------------------------------------------------
2304 | If `a' is denormal and we are in flush-to-zero mode then set the
2305 | input-denormal exception and return zero. Otherwise just return the value.
2306 *----------------------------------------------------------------------------*/
2307 float64 float64_squash_input_denormal(float64 a, float_status *status)
2308 {
2309     if (status->flush_inputs_to_zero) {
2310         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2311             float_raise(float_flag_input_denormal, status);
2312             return make_float64(float64_val(a) & (1ULL << 63));
2313         }
2314     }
2315     return a;
2316 }
2317
2318 /*----------------------------------------------------------------------------
2319 | Normalizes the subnormal double-precision floating-point value represented
2320 | by the denormalized significand `aSig'.  The normalized exponent and
2321 | significand are stored at the locations pointed to by `zExpPtr' and
2322 | `zSigPtr', respectively.
2323 *----------------------------------------------------------------------------*/
2324
2325 static void
2326  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2327 {
2328     int8_t shiftCount;
2329
2330     shiftCount = countLeadingZeros64( aSig ) - 11;
2331     *zSigPtr = aSig<<shiftCount;
2332     *zExpPtr = 1 - shiftCount;
2333
2334 }
2335
2336 /*----------------------------------------------------------------------------
2337 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2338 | double-precision floating-point value, returning the result.  After being
2339 | shifted into the proper positions, the three fields are simply added
2340 | together to form the result.  This means that any integer portion of `zSig'
2341 | will be added into the exponent.  Since a properly normalized significand
2342 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2343 | than the desired result exponent whenever `zSig' is a complete, normalized
2344 | significand.
2345 *----------------------------------------------------------------------------*/
2346
2347 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2348 {
2349
2350     return make_float64(
2351         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2352
2353 }
2354
2355 /*----------------------------------------------------------------------------
2356 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2357 | and significand `zSig', and returns the proper double-precision floating-
2358 | point value corresponding to the abstract input.  Ordinarily, the abstract
2359 | value is simply rounded and packed into the double-precision format, with
2360 | the inexact exception raised if the abstract input cannot be represented
2361 | exactly.  However, if the abstract value is too large, the overflow and
2362 | inexact exceptions are raised and an infinity or maximal finite value is
2363 | returned.  If the abstract value is too small, the input value is rounded to
2364 | a subnormal number, and the underflow and inexact exceptions are raised if
2365 | the abstract input cannot be represented exactly as a subnormal double-
2366 | precision floating-point number.
2367 |     The input significand `zSig' has its binary point between bits 62
2368 | and 61, which is 10 bits to the left of the usual location.  This shifted
2369 | significand must be normalized or smaller.  If `zSig' is not normalized,
2370 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2371 | and it must not require rounding.  In the usual case that `zSig' is
2372 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2373 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2374 | Binary Floating-Point Arithmetic.
2375 *----------------------------------------------------------------------------*/
2376
2377 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2378                                    float_status *status)
2379 {
2380     int8_t roundingMode;
2381     flag roundNearestEven;
2382     int roundIncrement, roundBits;
2383     flag isTiny;
2384
2385     roundingMode = status->float_rounding_mode;
2386     roundNearestEven = ( roundingMode == float_round_nearest_even );
2387     switch (roundingMode) {
2388     case float_round_nearest_even:
2389     case float_round_ties_away:
2390         roundIncrement = 0x200;
2391         break;
2392     case float_round_to_zero:
2393         roundIncrement = 0;
2394         break;
2395     case float_round_up:
2396         roundIncrement = zSign ? 0 : 0x3ff;
2397         break;
2398     case float_round_down:
2399         roundIncrement = zSign ? 0x3ff : 0;
2400         break;
2401     case float_round_to_odd:
2402         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2403         break;
2404     default:
2405         abort();
2406     }
2407     roundBits = zSig & 0x3FF;
2408     if ( 0x7FD <= (uint16_t) zExp ) {
2409         if (    ( 0x7FD < zExp )
2410              || (    ( zExp == 0x7FD )
2411                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2412            ) {
2413             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2414                                    roundIncrement != 0;
2415             float_raise(float_flag_overflow | float_flag_inexact, status);
2416             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2417         }
2418         if ( zExp < 0 ) {
2419             if (status->flush_to_zero) {
2420                 float_raise(float_flag_output_denormal, status);
2421                 return packFloat64(zSign, 0, 0);
2422             }
2423             isTiny =
2424                    (status->float_detect_tininess
2425                     == float_tininess_before_rounding)
2426                 || ( zExp < -1 )
2427                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2428             shift64RightJamming( zSig, - zExp, &zSig );
2429             zExp = 0;
2430             roundBits = zSig & 0x3FF;
2431             if (isTiny && roundBits) {
2432                 float_raise(float_flag_underflow, status);
2433             }
2434             if (roundingMode == float_round_to_odd) {
2435                 /*
2436                  * For round-to-odd case, the roundIncrement depends on
2437                  * zSig which just changed.
2438                  */
2439                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2440             }
2441         }
2442     }
2443     if (roundBits) {
2444         status->float_exception_flags |= float_flag_inexact;
2445     }
2446     zSig = ( zSig + roundIncrement )>>10;
2447     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2448     if ( zSig == 0 ) zExp = 0;
2449     return packFloat64( zSign, zExp, zSig );
2450
2451 }
2452
2453 /*----------------------------------------------------------------------------
2454 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2455 | and significand `zSig', and returns the proper double-precision floating-
2456 | point value corresponding to the abstract input.  This routine is just like
2457 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2458 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2459 | floating-point exponent.
2460 *----------------------------------------------------------------------------*/
2461
2462 static float64
2463  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2464                               float_status *status)
2465 {
2466     int8_t shiftCount;
2467
2468     shiftCount = countLeadingZeros64( zSig ) - 1;
2469     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2470                                status);
2471
2472 }
2473
2474 /*----------------------------------------------------------------------------
2475 | Normalizes the subnormal extended double-precision floating-point value
2476 | represented by the denormalized significand `aSig'.  The normalized exponent
2477 | and significand are stored at the locations pointed to by `zExpPtr' and
2478 | `zSigPtr', respectively.
2479 *----------------------------------------------------------------------------*/
2480
2481 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2482                                 uint64_t *zSigPtr)
2483 {
2484     int8_t shiftCount;
2485
2486     shiftCount = countLeadingZeros64( aSig );
2487     *zSigPtr = aSig<<shiftCount;
2488     *zExpPtr = 1 - shiftCount;
2489 }
2490
2491 /*----------------------------------------------------------------------------
2492 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2493 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2494 | and returns the proper extended double-precision floating-point value
2495 | corresponding to the abstract input.  Ordinarily, the abstract value is
2496 | rounded and packed into the extended double-precision format, with the
2497 | inexact exception raised if the abstract input cannot be represented
2498 | exactly.  However, if the abstract value is too large, the overflow and
2499 | inexact exceptions are raised and an infinity or maximal finite value is
2500 | returned.  If the abstract value is too small, the input value is rounded to
2501 | a subnormal number, and the underflow and inexact exceptions are raised if
2502 | the abstract input cannot be represented exactly as a subnormal extended
2503 | double-precision floating-point number.
2504 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2505 | number of bits as single or double precision, respectively.  Otherwise, the
2506 | result is rounded to the full precision of the extended double-precision
2507 | format.
2508 |     The input significand must be normalized or smaller.  If the input
2509 | significand is not normalized, `zExp' must be 0; in that case, the result
2510 | returned is a subnormal number, and it must not require rounding.  The
2511 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2512 | Floating-Point Arithmetic.
2513 *----------------------------------------------------------------------------*/
2514
2515 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2516                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2517                               float_status *status)
2518 {
2519     int8_t roundingMode;
2520     flag roundNearestEven, increment, isTiny;
2521     int64_t roundIncrement, roundMask, roundBits;
2522
2523     roundingMode = status->float_rounding_mode;
2524     roundNearestEven = ( roundingMode == float_round_nearest_even );
2525     if ( roundingPrecision == 80 ) goto precision80;
2526     if ( roundingPrecision == 64 ) {
2527         roundIncrement = LIT64( 0x0000000000000400 );
2528         roundMask = LIT64( 0x00000000000007FF );
2529     }
2530     else if ( roundingPrecision == 32 ) {
2531         roundIncrement = LIT64( 0x0000008000000000 );
2532         roundMask = LIT64( 0x000000FFFFFFFFFF );
2533     }
2534     else {
2535         goto precision80;
2536     }
2537     zSig0 |= ( zSig1 != 0 );
2538     switch (roundingMode) {
2539     case float_round_nearest_even:
2540     case float_round_ties_away:
2541         break;
2542     case float_round_to_zero:
2543         roundIncrement = 0;
2544         break;
2545     case float_round_up:
2546         roundIncrement = zSign ? 0 : roundMask;
2547         break;
2548     case float_round_down:
2549         roundIncrement = zSign ? roundMask : 0;
2550         break;
2551     default:
2552         abort();
2553     }
2554     roundBits = zSig0 & roundMask;
2555     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2556         if (    ( 0x7FFE < zExp )
2557              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2558            ) {
2559             goto overflow;
2560         }
2561         if ( zExp <= 0 ) {
2562             if (status->flush_to_zero) {
2563                 float_raise(float_flag_output_denormal, status);
2564                 return packFloatx80(zSign, 0, 0);
2565             }
2566             isTiny =
2567                    (status->float_detect_tininess
2568                     == float_tininess_before_rounding)
2569                 || ( zExp < 0 )
2570                 || ( zSig0 <= zSig0 + roundIncrement );
2571             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2572             zExp = 0;
2573             roundBits = zSig0 & roundMask;
2574             if (isTiny && roundBits) {
2575                 float_raise(float_flag_underflow, status);
2576             }
2577             if (roundBits) {
2578                 status->float_exception_flags |= float_flag_inexact;
2579             }
2580             zSig0 += roundIncrement;
2581             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2582             roundIncrement = roundMask + 1;
2583             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2584                 roundMask |= roundIncrement;
2585             }
2586             zSig0 &= ~ roundMask;
2587             return packFloatx80( zSign, zExp, zSig0 );
2588         }
2589     }
2590     if (roundBits) {
2591         status->float_exception_flags |= float_flag_inexact;
2592     }
2593     zSig0 += roundIncrement;
2594     if ( zSig0 < roundIncrement ) {
2595         ++zExp;
2596         zSig0 = LIT64( 0x8000000000000000 );
2597     }
2598     roundIncrement = roundMask + 1;
2599     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2600         roundMask |= roundIncrement;
2601     }
2602     zSig0 &= ~ roundMask;
2603     if ( zSig0 == 0 ) zExp = 0;
2604     return packFloatx80( zSign, zExp, zSig0 );
2605  precision80:
2606     switch (roundingMode) {
2607     case float_round_nearest_even:
2608     case float_round_ties_away:
2609         increment = ((int64_t)zSig1 < 0);
2610         break;
2611     case float_round_to_zero:
2612         increment = 0;
2613         break;
2614     case float_round_up:
2615         increment = !zSign && zSig1;
2616         break;
2617     case float_round_down:
2618         increment = zSign && zSig1;
2619         break;
2620     default:
2621         abort();
2622     }
2623     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2624         if (    ( 0x7FFE < zExp )
2625              || (    ( zExp == 0x7FFE )
2626                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2627                   && increment
2628                 )
2629            ) {
2630             roundMask = 0;
2631  overflow:
2632             float_raise(float_flag_overflow | float_flag_inexact, status);
2633             if (    ( roundingMode == float_round_to_zero )
2634                  || ( zSign && ( roundingMode == float_round_up ) )
2635                  || ( ! zSign && ( roundingMode == float_round_down ) )
2636                ) {
2637                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2638             }
2639             return packFloatx80(zSign,
2640                                 floatx80_infinity_high,
2641                                 floatx80_infinity_low);
2642         }
2643         if ( zExp <= 0 ) {
2644             isTiny =
2645                    (status->float_detect_tininess
2646                     == float_tininess_before_rounding)
2647                 || ( zExp < 0 )
2648                 || ! increment
2649                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2650             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2651             zExp = 0;
2652             if (isTiny && zSig1) {
2653                 float_raise(float_flag_underflow, status);
2654             }
2655             if (zSig1) {
2656                 status->float_exception_flags |= float_flag_inexact;
2657             }
2658             switch (roundingMode) {
2659             case float_round_nearest_even:
2660             case float_round_ties_away:
2661                 increment = ((int64_t)zSig1 < 0);
2662                 break;
2663             case float_round_to_zero:
2664                 increment = 0;
2665                 break;
2666             case float_round_up:
2667                 increment = !zSign && zSig1;
2668                 break;
2669             case float_round_down:
2670                 increment = zSign && zSig1;
2671                 break;
2672             default:
2673                 abort();
2674             }
2675             if ( increment ) {
2676                 ++zSig0;
2677                 zSig0 &=
2678                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2679                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2680             }
2681             return packFloatx80( zSign, zExp, zSig0 );
2682         }
2683     }
2684     if (zSig1) {
2685         status->float_exception_flags |= float_flag_inexact;
2686     }
2687     if ( increment ) {
2688         ++zSig0;
2689         if ( zSig0 == 0 ) {
2690             ++zExp;
2691             zSig0 = LIT64( 0x8000000000000000 );
2692         }
2693         else {
2694             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2695         }
2696     }
2697     else {
2698         if ( zSig0 == 0 ) zExp = 0;
2699     }
2700     return packFloatx80( zSign, zExp, zSig0 );
2701
2702 }
2703
2704 /*----------------------------------------------------------------------------
2705 | Takes an abstract floating-point value having sign `zSign', exponent
2706 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2707 | and returns the proper extended double-precision floating-point value
2708 | corresponding to the abstract input.  This routine is just like
2709 | `roundAndPackFloatx80' except that the input significand does not have to be
2710 | normalized.
2711 *----------------------------------------------------------------------------*/
2712
2713 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2714                                        flag zSign, int32_t zExp,
2715                                        uint64_t zSig0, uint64_t zSig1,
2716                                        float_status *status)
2717 {
2718     int8_t shiftCount;
2719
2720     if ( zSig0 == 0 ) {
2721         zSig0 = zSig1;
2722         zSig1 = 0;
2723         zExp -= 64;
2724     }
2725     shiftCount = countLeadingZeros64( zSig0 );
2726     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2727     zExp -= shiftCount;
2728     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2729                                 zSig0, zSig1, status);
2730
2731 }
2732
2733 /*----------------------------------------------------------------------------
2734 | Returns the least-significant 64 fraction bits of the quadruple-precision
2735 | floating-point value `a'.
2736 *----------------------------------------------------------------------------*/
2737
2738 static inline uint64_t extractFloat128Frac1( float128 a )
2739 {
2740
2741     return a.low;
2742
2743 }
2744
2745 /*----------------------------------------------------------------------------
2746 | Returns the most-significant 48 fraction bits of the quadruple-precision
2747 | floating-point value `a'.
2748 *----------------------------------------------------------------------------*/
2749
2750 static inline uint64_t extractFloat128Frac0( float128 a )
2751 {
2752
2753     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2754
2755 }
2756
2757 /*----------------------------------------------------------------------------
2758 | Returns the exponent bits of the quadruple-precision floating-point value
2759 | `a'.
2760 *----------------------------------------------------------------------------*/
2761
2762 static inline int32_t extractFloat128Exp( float128 a )
2763 {
2764
2765     return ( a.high>>48 ) & 0x7FFF;
2766
2767 }
2768
2769 /*----------------------------------------------------------------------------
2770 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2771 *----------------------------------------------------------------------------*/
2772
2773 static inline flag extractFloat128Sign( float128 a )
2774 {
2775
2776     return a.high>>63;
2777
2778 }
2779
2780 /*----------------------------------------------------------------------------
2781 | Normalizes the subnormal quadruple-precision floating-point value
2782 | represented by the denormalized significand formed by the concatenation of
2783 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2784 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2785 | significand are stored at the location pointed to by `zSig0Ptr', and the
2786 | least significant 64 bits of the normalized significand are stored at the
2787 | location pointed to by `zSig1Ptr'.
2788 *----------------------------------------------------------------------------*/
2789
2790 static void
2791  normalizeFloat128Subnormal(
2792      uint64_t aSig0,
2793      uint64_t aSig1,
2794      int32_t *zExpPtr,
2795      uint64_t *zSig0Ptr,
2796      uint64_t *zSig1Ptr
2797  )
2798 {
2799     int8_t shiftCount;
2800
2801     if ( aSig0 == 0 ) {
2802         shiftCount = countLeadingZeros64( aSig1 ) - 15;
2803         if ( shiftCount < 0 ) {
2804             *zSig0Ptr = aSig1>>( - shiftCount );
2805             *zSig1Ptr = aSig1<<( shiftCount & 63 );
2806         }
2807         else {
2808             *zSig0Ptr = aSig1<<shiftCount;
2809             *zSig1Ptr = 0;
2810         }
2811         *zExpPtr = - shiftCount - 63;
2812     }
2813     else {
2814         shiftCount = countLeadingZeros64( aSig0 ) - 15;
2815         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2816         *zExpPtr = 1 - shiftCount;
2817     }
2818
2819 }
2820
2821 /*----------------------------------------------------------------------------
2822 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2823 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2824 | floating-point value, returning the result.  After being shifted into the
2825 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2826 | added together to form the most significant 32 bits of the result.  This
2827 | means that any integer portion of `zSig0' will be added into the exponent.
2828 | Since a properly normalized significand will have an integer portion equal
2829 | to 1, the `zExp' input should be 1 less than the desired result exponent
2830 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2831 | significand.
2832 *----------------------------------------------------------------------------*/
2833
2834 static inline float128
2835  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2836 {
2837     float128 z;
2838
2839     z.low = zSig1;
2840     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2841     return z;
2842
2843 }
2844
2845 /*----------------------------------------------------------------------------
2846 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2847 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2848 | and `zSig2', and returns the proper quadruple-precision floating-point value
2849 | corresponding to the abstract input.  Ordinarily, the abstract value is
2850 | simply rounded and packed into the quadruple-precision format, with the
2851 | inexact exception raised if the abstract input cannot be represented
2852 | exactly.  However, if the abstract value is too large, the overflow and
2853 | inexact exceptions are raised and an infinity or maximal finite value is
2854 | returned.  If the abstract value is too small, the input value is rounded to
2855 | a subnormal number, and the underflow and inexact exceptions are raised if
2856 | the abstract input cannot be represented exactly as a subnormal quadruple-
2857 | precision floating-point number.
2858 |     The input significand must be normalized or smaller.  If the input
2859 | significand is not normalized, `zExp' must be 0; in that case, the result
2860 | returned is a subnormal number, and it must not require rounding.  In the
2861 | usual case that the input significand is normalized, `zExp' must be 1 less
2862 | than the ``true'' floating-point exponent.  The handling of underflow and
2863 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864 *----------------------------------------------------------------------------*/
2865
2866 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2867                                      uint64_t zSig0, uint64_t zSig1,
2868                                      uint64_t zSig2, float_status *status)
2869 {
2870     int8_t roundingMode;
2871     flag roundNearestEven, increment, isTiny;
2872
2873     roundingMode = status->float_rounding_mode;
2874     roundNearestEven = ( roundingMode == float_round_nearest_even );
2875     switch (roundingMode) {
2876     case float_round_nearest_even:
2877     case float_round_ties_away:
2878         increment = ((int64_t)zSig2 < 0);
2879         break;
2880     case float_round_to_zero:
2881         increment = 0;
2882         break;
2883     case float_round_up:
2884         increment = !zSign && zSig2;
2885         break;
2886     case float_round_down:
2887         increment = zSign && zSig2;
2888         break;
2889     case float_round_to_odd:
2890         increment = !(zSig1 & 0x1) && zSig2;
2891         break;
2892     default:
2893         abort();
2894     }
2895     if ( 0x7FFD <= (uint32_t) zExp ) {
2896         if (    ( 0x7FFD < zExp )
2897              || (    ( zExp == 0x7FFD )
2898                   && eq128(
2899                          LIT64( 0x0001FFFFFFFFFFFF ),
2900                          LIT64( 0xFFFFFFFFFFFFFFFF ),
2901                          zSig0,
2902                          zSig1
2903                      )
2904                   && increment
2905                 )
2906            ) {
2907             float_raise(float_flag_overflow | float_flag_inexact, status);
2908             if (    ( roundingMode == float_round_to_zero )
2909                  || ( zSign && ( roundingMode == float_round_up ) )
2910                  || ( ! zSign && ( roundingMode == float_round_down ) )
2911                  || (roundingMode == float_round_to_odd)
2912                ) {
2913                 return
2914                     packFloat128(
2915                         zSign,
2916                         0x7FFE,
2917                         LIT64( 0x0000FFFFFFFFFFFF ),
2918                         LIT64( 0xFFFFFFFFFFFFFFFF )
2919                     );
2920             }
2921             return packFloat128( zSign, 0x7FFF, 0, 0 );
2922         }
2923         if ( zExp < 0 ) {
2924             if (status->flush_to_zero) {
2925                 float_raise(float_flag_output_denormal, status);
2926                 return packFloat128(zSign, 0, 0, 0);
2927             }
2928             isTiny =
2929                    (status->float_detect_tininess
2930                     == float_tininess_before_rounding)
2931                 || ( zExp < -1 )
2932                 || ! increment
2933                 || lt128(
2934                        zSig0,
2935                        zSig1,
2936                        LIT64( 0x0001FFFFFFFFFFFF ),
2937                        LIT64( 0xFFFFFFFFFFFFFFFF )
2938                    );
2939             shift128ExtraRightJamming(
2940                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2941             zExp = 0;
2942             if (isTiny && zSig2) {
2943                 float_raise(float_flag_underflow, status);
2944             }
2945             switch (roundingMode) {
2946             case float_round_nearest_even:
2947             case float_round_ties_away:
2948                 increment = ((int64_t)zSig2 < 0);
2949                 break;
2950             case float_round_to_zero:
2951                 increment = 0;
2952                 break;
2953             case float_round_up:
2954                 increment = !zSign && zSig2;
2955                 break;
2956             case float_round_down:
2957                 increment = zSign && zSig2;
2958                 break;
2959             case float_round_to_odd:
2960                 increment = !(zSig1 & 0x1) && zSig2;
2961                 break;
2962             default:
2963                 abort();
2964             }
2965         }
2966     }
2967     if (zSig2) {
2968         status->float_exception_flags |= float_flag_inexact;
2969     }
2970     if ( increment ) {
2971         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2972         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2973     }
2974     else {
2975         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2976     }
2977     return packFloat128( zSign, zExp, zSig0, zSig1 );
2978
2979 }
2980
2981 /*----------------------------------------------------------------------------
2982 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2983 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2984 | returns the proper quadruple-precision floating-point value corresponding
2985 | to the abstract input.  This routine is just like `roundAndPackFloat128'
2986 | except that the input significand has fewer bits and does not have to be
2987 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
2988 | point exponent.
2989 *----------------------------------------------------------------------------*/
2990
2991 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
2992                                               uint64_t zSig0, uint64_t zSig1,
2993                                               float_status *status)
2994 {
2995     int8_t shiftCount;
2996     uint64_t zSig2;
2997
2998     if ( zSig0 == 0 ) {
2999         zSig0 = zSig1;
3000         zSig1 = 0;
3001         zExp -= 64;
3002     }
3003     shiftCount = countLeadingZeros64( zSig0 ) - 15;
3004     if ( 0 <= shiftCount ) {
3005         zSig2 = 0;
3006         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3007     }
3008     else {
3009         shift128ExtraRightJamming(
3010             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3011     }
3012     zExp -= shiftCount;
3013     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3014
3015 }
3016
3017
3018 /*----------------------------------------------------------------------------
3019 | Returns the result of converting the 32-bit two's complement integer `a'
3020 | to the extended double-precision floating-point format.  The conversion
3021 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3022 | Arithmetic.
3023 *----------------------------------------------------------------------------*/
3024
3025 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3026 {
3027     flag zSign;
3028     uint32_t absA;
3029     int8_t shiftCount;
3030     uint64_t zSig;
3031
3032     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3033     zSign = ( a < 0 );
3034     absA = zSign ? - a : a;
3035     shiftCount = countLeadingZeros32( absA ) + 32;
3036     zSig = absA;
3037     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3038
3039 }
3040
3041 /*----------------------------------------------------------------------------
3042 | Returns the result of converting the 32-bit two's complement integer `a' to
3043 | the quadruple-precision floating-point format.  The conversion is performed
3044 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3045 *----------------------------------------------------------------------------*/
3046
3047 float128 int32_to_float128(int32_t a, float_status *status)
3048 {
3049     flag zSign;
3050     uint32_t absA;
3051     int8_t shiftCount;
3052     uint64_t zSig0;
3053
3054     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3055     zSign = ( a < 0 );
3056     absA = zSign ? - a : a;
3057     shiftCount = countLeadingZeros32( absA ) + 17;
3058     zSig0 = absA;
3059     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3060
3061 }
3062
3063 /*----------------------------------------------------------------------------
3064 | Returns the result of converting the 64-bit two's complement integer `a'
3065 | to the extended double-precision floating-point format.  The conversion
3066 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3067 | Arithmetic.
3068 *----------------------------------------------------------------------------*/
3069
3070 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3071 {
3072     flag zSign;
3073     uint64_t absA;
3074     int8_t shiftCount;
3075
3076     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3077     zSign = ( a < 0 );
3078     absA = zSign ? - a : a;
3079     shiftCount = countLeadingZeros64( absA );
3080     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3081
3082 }
3083
3084 /*----------------------------------------------------------------------------
3085 | Returns the result of converting the 64-bit two's complement integer `a' to
3086 | the quadruple-precision floating-point format.  The conversion is performed
3087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3088 *----------------------------------------------------------------------------*/
3089
3090 float128 int64_to_float128(int64_t a, float_status *status)
3091 {
3092     flag zSign;
3093     uint64_t absA;
3094     int8_t shiftCount;
3095     int32_t zExp;
3096     uint64_t zSig0, zSig1;
3097
3098     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3099     zSign = ( a < 0 );
3100     absA = zSign ? - a : a;
3101     shiftCount = countLeadingZeros64( absA ) + 49;
3102     zExp = 0x406E - shiftCount;
3103     if ( 64 <= shiftCount ) {
3104         zSig1 = 0;
3105         zSig0 = absA;
3106         shiftCount -= 64;
3107     }
3108     else {
3109         zSig1 = absA;
3110         zSig0 = 0;
3111     }
3112     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3113     return packFloat128( zSign, zExp, zSig0, zSig1 );
3114
3115 }
3116
3117 /*----------------------------------------------------------------------------
3118 | Returns the result of converting the 64-bit unsigned integer `a'
3119 | to the quadruple-precision floating-point format.  The conversion is performed
3120 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3121 *----------------------------------------------------------------------------*/
3122
3123 float128 uint64_to_float128(uint64_t a, float_status *status)
3124 {
3125     if (a == 0) {
3126         return float128_zero;
3127     }
3128     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3129 }
3130
3131
3132
3133
3134 /*----------------------------------------------------------------------------
3135 | Returns the result of converting the single-precision floating-point value
3136 | `a' to the double-precision floating-point format.  The conversion is
3137 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3138 | Arithmetic.
3139 *----------------------------------------------------------------------------*/
3140
3141 float64 float32_to_float64(float32 a, float_status *status)
3142 {
3143     flag aSign;
3144     int aExp;
3145     uint32_t aSig;
3146     a = float32_squash_input_denormal(a, status);
3147
3148     aSig = extractFloat32Frac( a );
3149     aExp = extractFloat32Exp( a );
3150     aSign = extractFloat32Sign( a );
3151     if ( aExp == 0xFF ) {
3152         if (aSig) {
3153             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3154         }
3155         return packFloat64( aSign, 0x7FF, 0 );
3156     }
3157     if ( aExp == 0 ) {
3158         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3159         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3160         --aExp;
3161     }
3162     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3163
3164 }
3165
3166 /*----------------------------------------------------------------------------
3167 | Returns the result of converting the single-precision floating-point value
3168 | `a' to the extended double-precision floating-point format.  The conversion
3169 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3170 | Arithmetic.
3171 *----------------------------------------------------------------------------*/
3172
3173 floatx80 float32_to_floatx80(float32 a, float_status *status)
3174 {
3175     flag aSign;
3176     int aExp;
3177     uint32_t aSig;
3178
3179     a = float32_squash_input_denormal(a, status);
3180     aSig = extractFloat32Frac( a );
3181     aExp = extractFloat32Exp( a );
3182     aSign = extractFloat32Sign( a );
3183     if ( aExp == 0xFF ) {
3184         if (aSig) {
3185             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3186         }
3187         return packFloatx80(aSign,
3188                             floatx80_infinity_high,
3189                             floatx80_infinity_low);
3190     }
3191     if ( aExp == 0 ) {
3192         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3193         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3194     }
3195     aSig |= 0x00800000;
3196     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3197
3198 }
3199
3200 /*----------------------------------------------------------------------------
3201 | Returns the result of converting the single-precision floating-point value
3202 | `a' to the double-precision floating-point format.  The conversion is
3203 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3204 | Arithmetic.
3205 *----------------------------------------------------------------------------*/
3206
3207 float128 float32_to_float128(float32 a, float_status *status)
3208 {
3209     flag aSign;
3210     int aExp;
3211     uint32_t aSig;
3212
3213     a = float32_squash_input_denormal(a, status);
3214     aSig = extractFloat32Frac( a );
3215     aExp = extractFloat32Exp( a );
3216     aSign = extractFloat32Sign( a );
3217     if ( aExp == 0xFF ) {
3218         if (aSig) {
3219             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3220         }
3221         return packFloat128( aSign, 0x7FFF, 0, 0 );
3222     }
3223     if ( aExp == 0 ) {
3224         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3225         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3226         --aExp;
3227     }
3228     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3229
3230 }
3231
3232 /*----------------------------------------------------------------------------
3233 | Returns the remainder of the single-precision floating-point value `a'
3234 | with respect to the corresponding value `b'.  The operation is performed
3235 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3236 *----------------------------------------------------------------------------*/
3237
3238 float32 float32_rem(float32 a, float32 b, float_status *status)
3239 {
3240     flag aSign, zSign;
3241     int aExp, bExp, expDiff;
3242     uint32_t aSig, bSig;
3243     uint32_t q;
3244     uint64_t aSig64, bSig64, q64;
3245     uint32_t alternateASig;
3246     int32_t sigMean;
3247     a = float32_squash_input_denormal(a, status);
3248     b = float32_squash_input_denormal(b, status);
3249
3250     aSig = extractFloat32Frac( a );
3251     aExp = extractFloat32Exp( a );
3252     aSign = extractFloat32Sign( a );
3253     bSig = extractFloat32Frac( b );
3254     bExp = extractFloat32Exp( b );
3255     if ( aExp == 0xFF ) {
3256         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3257             return propagateFloat32NaN(a, b, status);
3258         }
3259         float_raise(float_flag_invalid, status);
3260         return float32_default_nan(status);
3261     }
3262     if ( bExp == 0xFF ) {
3263         if (bSig) {
3264             return propagateFloat32NaN(a, b, status);
3265         }
3266         return a;
3267     }
3268     if ( bExp == 0 ) {
3269         if ( bSig == 0 ) {
3270             float_raise(float_flag_invalid, status);
3271             return float32_default_nan(status);
3272         }
3273         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3274     }
3275     if ( aExp == 0 ) {
3276         if ( aSig == 0 ) return a;
3277         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3278     }
3279     expDiff = aExp - bExp;
3280     aSig |= 0x00800000;
3281     bSig |= 0x00800000;
3282     if ( expDiff < 32 ) {
3283         aSig <<= 8;
3284         bSig <<= 8;
3285         if ( expDiff < 0 ) {
3286             if ( expDiff < -1 ) return a;
3287             aSig >>= 1;
3288         }
3289         q = ( bSig <= aSig );
3290         if ( q ) aSig -= bSig;
3291         if ( 0 < expDiff ) {
3292             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3293             q >>= 32 - expDiff;
3294             bSig >>= 2;
3295             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3296         }
3297         else {
3298             aSig >>= 2;
3299             bSig >>= 2;
3300         }
3301     }
3302     else {
3303         if ( bSig <= aSig ) aSig -= bSig;
3304         aSig64 = ( (uint64_t) aSig )<<40;
3305         bSig64 = ( (uint64_t) bSig )<<40;
3306         expDiff -= 64;
3307         while ( 0 < expDiff ) {
3308             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3309             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3310             aSig64 = - ( ( bSig * q64 )<<38 );
3311             expDiff -= 62;
3312         }
3313         expDiff += 64;
3314         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3315         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3316         q = q64>>( 64 - expDiff );
3317         bSig <<= 6;
3318         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3319     }
3320     do {
3321         alternateASig = aSig;
3322         ++q;
3323         aSig -= bSig;
3324     } while ( 0 <= (int32_t) aSig );
3325     sigMean = aSig + alternateASig;
3326     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3327         aSig = alternateASig;
3328     }
3329     zSign = ( (int32_t) aSig < 0 );
3330     if ( zSign ) aSig = - aSig;
3331     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3332 }
3333
3334
3335
3336 /*----------------------------------------------------------------------------
3337 | Returns the binary exponential of the single-precision floating-point value
3338 | `a'. The operation is performed according to the IEC/IEEE Standard for
3339 | Binary Floating-Point Arithmetic.
3340 |
3341 | Uses the following identities:
3342 |
3343 | 1. -------------------------------------------------------------------------
3344 |      x    x*ln(2)
3345 |     2  = e
3346 |
3347 | 2. -------------------------------------------------------------------------
3348 |                      2     3     4     5           n
3349 |      x        x     x     x     x     x           x
3350 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3351 |               1!    2!    3!    4!    5!          n!
3352 *----------------------------------------------------------------------------*/
3353
3354 static const float64 float32_exp2_coefficients[15] =
3355 {
3356     const_float64( 0x3ff0000000000000ll ), /*  1 */
3357     const_float64( 0x3fe0000000000000ll ), /*  2 */
3358     const_float64( 0x3fc5555555555555ll ), /*  3 */
3359     const_float64( 0x3fa5555555555555ll ), /*  4 */
3360     const_float64( 0x3f81111111111111ll ), /*  5 */
3361     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3362     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3363     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3364     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3365     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3366     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3367     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3368     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3369     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3370     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3371 };
3372
3373 float32 float32_exp2(float32 a, float_status *status)
3374 {
3375     flag aSign;
3376     int aExp;
3377     uint32_t aSig;
3378     float64 r, x, xn;
3379     int i;
3380     a = float32_squash_input_denormal(a, status);
3381
3382     aSig = extractFloat32Frac( a );
3383     aExp = extractFloat32Exp( a );
3384     aSign = extractFloat32Sign( a );
3385
3386     if ( aExp == 0xFF) {
3387         if (aSig) {
3388             return propagateFloat32NaN(a, float32_zero, status);
3389         }
3390         return (aSign) ? float32_zero : a;
3391     }
3392     if (aExp == 0) {
3393         if (aSig == 0) return float32_one;
3394     }
3395
3396     float_raise(float_flag_inexact, status);
3397
3398     /* ******************************* */
3399     /* using float64 for approximation */
3400     /* ******************************* */
3401     x = float32_to_float64(a, status);
3402     x = float64_mul(x, float64_ln2, status);
3403
3404     xn = x;
3405     r = float64_one;
3406     for (i = 0 ; i < 15 ; i++) {
3407         float64 f;
3408
3409         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3410         r = float64_add(r, f, status);
3411
3412         xn = float64_mul(xn, x, status);
3413     }
3414
3415     return float64_to_float32(r, status);
3416 }
3417
3418 /*----------------------------------------------------------------------------
3419 | Returns the binary log of the single-precision floating-point value `a'.
3420 | The operation is performed according to the IEC/IEEE Standard for Binary
3421 | Floating-Point Arithmetic.
3422 *----------------------------------------------------------------------------*/
3423 float32 float32_log2(float32 a, float_status *status)
3424 {
3425     flag aSign, zSign;
3426     int aExp;
3427     uint32_t aSig, zSig, i;
3428
3429     a = float32_squash_input_denormal(a, status);
3430     aSig = extractFloat32Frac( a );
3431     aExp = extractFloat32Exp( a );
3432     aSign = extractFloat32Sign( a );
3433
3434     if ( aExp == 0 ) {
3435         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3436         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3437     }
3438     if ( aSign ) {
3439         float_raise(float_flag_invalid, status);
3440         return float32_default_nan(status);
3441     }
3442     if ( aExp == 0xFF ) {
3443         if (aSig) {
3444             return propagateFloat32NaN(a, float32_zero, status);
3445         }
3446         return a;
3447     }
3448
3449     aExp -= 0x7F;
3450     aSig |= 0x00800000;
3451     zSign = aExp < 0;
3452     zSig = aExp << 23;
3453
3454     for (i = 1 << 22; i > 0; i >>= 1) {
3455         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3456         if ( aSig & 0x01000000 ) {
3457             aSig >>= 1;
3458             zSig |= i;
3459         }
3460     }
3461
3462     if ( zSign )
3463         zSig = -zSig;
3464
3465     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3466 }
3467
3468 /*----------------------------------------------------------------------------
3469 | Returns 1 if the single-precision floating-point value `a' is equal to
3470 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3471 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3472 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3473 *----------------------------------------------------------------------------*/
3474
3475 int float32_eq(float32 a, float32 b, float_status *status)
3476 {
3477     uint32_t av, bv;
3478     a = float32_squash_input_denormal(a, status);
3479     b = float32_squash_input_denormal(b, status);
3480
3481     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3482          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3483        ) {
3484         float_raise(float_flag_invalid, status);
3485         return 0;
3486     }
3487     av = float32_val(a);
3488     bv = float32_val(b);
3489     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3490 }
3491
3492 /*----------------------------------------------------------------------------
3493 | Returns 1 if the single-precision floating-point value `a' is less than
3494 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3495 | exception is raised if either operand is a NaN.  The comparison is performed
3496 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3497 *----------------------------------------------------------------------------*/
3498
3499 int float32_le(float32 a, float32 b, float_status *status)
3500 {
3501     flag aSign, bSign;
3502     uint32_t av, bv;
3503     a = float32_squash_input_denormal(a, status);
3504     b = float32_squash_input_denormal(b, status);
3505
3506     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3507          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3508        ) {
3509         float_raise(float_flag_invalid, status);
3510         return 0;
3511     }
3512     aSign = extractFloat32Sign( a );
3513     bSign = extractFloat32Sign( b );
3514     av = float32_val(a);
3515     bv = float32_val(b);
3516     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3517     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3518
3519 }
3520
3521 /*----------------------------------------------------------------------------
3522 | Returns 1 if the single-precision floating-point value `a' is less than
3523 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3524 | raised if either operand is a NaN.  The comparison is performed according
3525 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3526 *----------------------------------------------------------------------------*/
3527
3528 int float32_lt(float32 a, float32 b, float_status *status)
3529 {
3530     flag aSign, bSign;
3531     uint32_t av, bv;
3532     a = float32_squash_input_denormal(a, status);
3533     b = float32_squash_input_denormal(b, status);
3534
3535     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3536          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3537        ) {
3538         float_raise(float_flag_invalid, status);
3539         return 0;
3540     }
3541     aSign = extractFloat32Sign( a );
3542     bSign = extractFloat32Sign( b );
3543     av = float32_val(a);
3544     bv = float32_val(b);
3545     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3546     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3547
3548 }
3549
3550 /*----------------------------------------------------------------------------
3551 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3552 | be compared, and 0 otherwise.  The invalid exception is raised if either
3553 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3554 | Standard for Binary Floating-Point Arithmetic.
3555 *----------------------------------------------------------------------------*/
3556
3557 int float32_unordered(float32 a, float32 b, float_status *status)
3558 {
3559     a = float32_squash_input_denormal(a, status);
3560     b = float32_squash_input_denormal(b, status);
3561
3562     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3563          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3564        ) {
3565         float_raise(float_flag_invalid, status);
3566         return 1;
3567     }
3568     return 0;
3569 }
3570
3571 /*----------------------------------------------------------------------------
3572 | Returns 1 if the single-precision floating-point value `a' is equal to
3573 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3574 | exception.  The comparison is performed according to the IEC/IEEE Standard
3575 | for Binary Floating-Point Arithmetic.
3576 *----------------------------------------------------------------------------*/
3577
3578 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3579 {
3580     a = float32_squash_input_denormal(a, status);
3581     b = float32_squash_input_denormal(b, status);
3582
3583     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3584          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3585        ) {
3586         if (float32_is_signaling_nan(a, status)
3587          || float32_is_signaling_nan(b, status)) {
3588             float_raise(float_flag_invalid, status);
3589         }
3590         return 0;
3591     }
3592     return ( float32_val(a) == float32_val(b) ) ||
3593             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3594 }
3595
3596 /*----------------------------------------------------------------------------
3597 | Returns 1 if the single-precision floating-point value `a' is less than or
3598 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3599 | cause an exception.  Otherwise, the comparison is performed according to the
3600 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3601 *----------------------------------------------------------------------------*/
3602
3603 int float32_le_quiet(float32 a, float32 b, float_status *status)
3604 {
3605     flag aSign, bSign;
3606     uint32_t av, bv;
3607     a = float32_squash_input_denormal(a, status);
3608     b = float32_squash_input_denormal(b, status);
3609
3610     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3611          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3612        ) {
3613         if (float32_is_signaling_nan(a, status)
3614          || float32_is_signaling_nan(b, status)) {
3615             float_raise(float_flag_invalid, status);
3616         }
3617         return 0;
3618     }
3619     aSign = extractFloat32Sign( a );
3620     bSign = extractFloat32Sign( b );
3621     av = float32_val(a);
3622     bv = float32_val(b);
3623     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3624     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3625
3626 }
3627
3628 /*----------------------------------------------------------------------------
3629 | Returns 1 if the single-precision floating-point value `a' is less than
3630 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3631 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3632 | Standard for Binary Floating-Point Arithmetic.
3633 *----------------------------------------------------------------------------*/
3634
3635 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3636 {
3637     flag aSign, bSign;
3638     uint32_t av, bv;
3639     a = float32_squash_input_denormal(a, status);
3640     b = float32_squash_input_denormal(b, status);
3641
3642     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3643          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3644        ) {
3645         if (float32_is_signaling_nan(a, status)
3646          || float32_is_signaling_nan(b, status)) {
3647             float_raise(float_flag_invalid, status);
3648         }
3649         return 0;
3650     }
3651     aSign = extractFloat32Sign( a );
3652     bSign = extractFloat32Sign( b );
3653     av = float32_val(a);
3654     bv = float32_val(b);
3655     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3656     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3657
3658 }
3659
3660 /*----------------------------------------------------------------------------
3661 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3662 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3663 | comparison is performed according to the IEC/IEEE Standard for Binary
3664 | Floating-Point Arithmetic.
3665 *----------------------------------------------------------------------------*/
3666
3667 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3668 {
3669     a = float32_squash_input_denormal(a, status);
3670     b = float32_squash_input_denormal(b, status);
3671
3672     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3673          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3674        ) {
3675         if (float32_is_signaling_nan(a, status)
3676          || float32_is_signaling_nan(b, status)) {
3677             float_raise(float_flag_invalid, status);
3678         }
3679         return 1;
3680     }
3681     return 0;
3682 }
3683
3684
3685 /*----------------------------------------------------------------------------
3686 | Returns the result of converting the double-precision floating-point value
3687 | `a' to the single-precision floating-point format.  The conversion is
3688 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3689 | Arithmetic.
3690 *----------------------------------------------------------------------------*/
3691
3692 float32 float64_to_float32(float64 a, float_status *status)
3693 {
3694     flag aSign;
3695     int aExp;
3696     uint64_t aSig;
3697     uint32_t zSig;
3698     a = float64_squash_input_denormal(a, status);
3699
3700     aSig = extractFloat64Frac( a );
3701     aExp = extractFloat64Exp( a );
3702     aSign = extractFloat64Sign( a );
3703     if ( aExp == 0x7FF ) {
3704         if (aSig) {
3705             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3706         }
3707         return packFloat32( aSign, 0xFF, 0 );
3708     }
3709     shift64RightJamming( aSig, 22, &aSig );
3710     zSig = aSig;
3711     if ( aExp || zSig ) {
3712         zSig |= 0x40000000;
3713         aExp -= 0x381;
3714     }
3715     return roundAndPackFloat32(aSign, aExp, zSig, status);
3716
3717 }
3718
3719
3720 /*----------------------------------------------------------------------------
3721 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3722 | half-precision floating-point value, returning the result.  After being
3723 | shifted into the proper positions, the three fields are simply added
3724 | together to form the result.  This means that any integer portion of `zSig'
3725 | will be added into the exponent.  Since a properly normalized significand
3726 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3727 | than the desired result exponent whenever `zSig' is a complete, normalized
3728 | significand.
3729 *----------------------------------------------------------------------------*/
3730 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3731 {
3732     return make_float16(
3733         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3734 }
3735
3736 /*----------------------------------------------------------------------------
3737 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3738 | and significand `zSig', and returns the proper half-precision floating-
3739 | point value corresponding to the abstract input.  Ordinarily, the abstract
3740 | value is simply rounded and packed into the half-precision format, with
3741 | the inexact exception raised if the abstract input cannot be represented
3742 | exactly.  However, if the abstract value is too large, the overflow and
3743 | inexact exceptions are raised and an infinity or maximal finite value is
3744 | returned.  If the abstract value is too small, the input value is rounded to
3745 | a subnormal number, and the underflow and inexact exceptions are raised if
3746 | the abstract input cannot be represented exactly as a subnormal half-
3747 | precision floating-point number.
3748 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3749 | ARM-style "alternative representation", which omits the NaN and Inf
3750 | encodings in order to raise the maximum representable exponent by one.
3751 |     The input significand `zSig' has its binary point between bits 22
3752 | and 23, which is 13 bits to the left of the usual location.  This shifted
3753 | significand must be normalized or smaller.  If `zSig' is not normalized,
3754 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3755 | and it must not require rounding.  In the usual case that `zSig' is
3756 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3757 | Note the slightly odd position of the binary point in zSig compared with the
3758 | other roundAndPackFloat functions. This should probably be fixed if we
3759 | need to implement more float16 routines than just conversion.
3760 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3761 | Binary Floating-Point Arithmetic.
3762 *----------------------------------------------------------------------------*/
3763
3764 static float16 roundAndPackFloat16(flag zSign, int zExp,
3765                                    uint32_t zSig, flag ieee,
3766                                    float_status *status)
3767 {
3768     int maxexp = ieee ? 29 : 30;
3769     uint32_t mask;
3770     uint32_t increment;
3771     bool rounding_bumps_exp;
3772     bool is_tiny = false;
3773
3774     /* Calculate the mask of bits of the mantissa which are not
3775      * representable in half-precision and will be lost.
3776      */
3777     if (zExp < 1) {
3778         /* Will be denormal in halfprec */
3779         mask = 0x00ffffff;
3780         if (zExp >= -11) {
3781             mask >>= 11 + zExp;
3782         }
3783     } else {
3784         /* Normal number in halfprec */
3785         mask = 0x00001fff;
3786     }
3787
3788     switch (status->float_rounding_mode) {
3789     case float_round_nearest_even:
3790         increment = (mask + 1) >> 1;
3791         if ((zSig & mask) == increment) {
3792             increment = zSig & (increment << 1);
3793         }
3794         break;
3795     case float_round_ties_away:
3796         increment = (mask + 1) >> 1;
3797         break;
3798     case float_round_up:
3799         increment = zSign ? 0 : mask;
3800         break;
3801     case float_round_down:
3802         increment = zSign ? mask : 0;
3803         break;
3804     default: /* round_to_zero */
3805         increment = 0;
3806         break;
3807     }
3808
3809     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3810
3811     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3812         if (ieee) {
3813             float_raise(float_flag_overflow | float_flag_inexact, status);
3814             return packFloat16(zSign, 0x1f, 0);
3815         } else {
3816             float_raise(float_flag_invalid, status);
3817             return packFloat16(zSign, 0x1f, 0x3ff);
3818         }
3819     }
3820
3821     if (zExp < 0) {
3822         /* Note that flush-to-zero does not affect half-precision results */
3823         is_tiny =
3824             (status->float_detect_tininess == float_tininess_before_rounding)
3825             || (zExp < -1)
3826             || (!rounding_bumps_exp);
3827     }
3828     if (zSig & mask) {
3829         float_raise(float_flag_inexact, status);
3830         if (is_tiny) {
3831             float_raise(float_flag_underflow, status);
3832         }
3833     }
3834
3835     zSig += increment;
3836     if (rounding_bumps_exp) {
3837         zSig >>= 1;
3838         zExp++;
3839     }
3840
3841     if (zExp < -10) {
3842         return packFloat16(zSign, 0, 0);
3843     }
3844     if (zExp < 0) {
3845         zSig >>= -zExp;
3846         zExp = 0;
3847     }
3848     return packFloat16(zSign, zExp, zSig >> 13);
3849 }
3850
3851 /*----------------------------------------------------------------------------
3852 | If `a' is denormal and we are in flush-to-zero mode then set the
3853 | input-denormal exception and return zero. Otherwise just return the value.
3854 *----------------------------------------------------------------------------*/
3855 float16 float16_squash_input_denormal(float16 a, float_status *status)
3856 {
3857     if (status->flush_inputs_to_zero) {
3858         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3859             float_raise(float_flag_input_denormal, status);
3860             return make_float16(float16_val(a) & 0x8000);
3861         }
3862     }
3863     return a;
3864 }
3865
3866 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3867                                       uint32_t *zSigPtr)
3868 {
3869     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3870     *zSigPtr = aSig << shiftCount;
3871     *zExpPtr = 1 - shiftCount;
3872 }
3873
3874 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3875    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3876
3877 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3878 {
3879     flag aSign;
3880     int aExp;
3881     uint32_t aSig;
3882
3883     aSign = extractFloat16Sign(a);
3884     aExp = extractFloat16Exp(a);
3885     aSig = extractFloat16Frac(a);
3886
3887     if (aExp == 0x1f && ieee) {
3888         if (aSig) {
3889             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3890         }
3891         return packFloat32(aSign, 0xff, 0);
3892     }
3893     if (aExp == 0) {
3894         if (aSig == 0) {
3895             return packFloat32(aSign, 0, 0);
3896         }
3897
3898         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3899         aExp--;
3900     }
3901     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3902 }
3903
3904 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3905 {
3906     flag aSign;
3907     int aExp;
3908     uint32_t aSig;
3909
3910     a = float32_squash_input_denormal(a, status);
3911
3912     aSig = extractFloat32Frac( a );
3913     aExp = extractFloat32Exp( a );
3914     aSign = extractFloat32Sign( a );
3915     if ( aExp == 0xFF ) {
3916         if (aSig) {
3917             /* Input is a NaN */
3918             if (!ieee) {
3919                 float_raise(float_flag_invalid, status);
3920                 return packFloat16(aSign, 0, 0);
3921             }
3922             return commonNaNToFloat16(
3923                 float32ToCommonNaN(a, status), status);
3924         }
3925         /* Infinity */
3926         if (!ieee) {
3927             float_raise(float_flag_invalid, status);
3928             return packFloat16(aSign, 0x1f, 0x3ff);
3929         }
3930         return packFloat16(aSign, 0x1f, 0);
3931     }
3932     if (aExp == 0 && aSig == 0) {
3933         return packFloat16(aSign, 0, 0);
3934     }
3935     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3936      * even if the input is denormal; however this is harmless because
3937      * the largest possible single-precision denormal is still smaller
3938      * than the smallest representable half-precision denormal, and so we
3939      * will end up ignoring aSig and returning via the "always return zero"
3940      * codepath.
3941      */
3942     aSig |= 0x00800000;
3943     aExp -= 0x71;
3944
3945     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3946 }
3947
3948 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3949 {
3950     flag aSign;
3951     int aExp;
3952     uint32_t aSig;
3953
3954     aSign = extractFloat16Sign(a);
3955     aExp = extractFloat16Exp(a);
3956     aSig = extractFloat16Frac(a);
3957
3958     if (aExp == 0x1f && ieee) {
3959         if (aSig) {
3960             return commonNaNToFloat64(
3961                 float16ToCommonNaN(a, status), status);
3962         }
3963         return packFloat64(aSign, 0x7ff, 0);
3964     }
3965     if (aExp == 0) {
3966         if (aSig == 0) {
3967             return packFloat64(aSign, 0, 0);
3968         }
3969
3970         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3971         aExp--;
3972     }
3973     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3974 }
3975
3976 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3977 {
3978     flag aSign;
3979     int aExp;
3980     uint64_t aSig;
3981     uint32_t zSig;
3982
3983     a = float64_squash_input_denormal(a, status);
3984
3985     aSig = extractFloat64Frac(a);
3986     aExp = extractFloat64Exp(a);
3987     aSign = extractFloat64Sign(a);
3988     if (aExp == 0x7FF) {
3989         if (aSig) {
3990             /* Input is a NaN */
3991             if (!ieee) {
3992                 float_raise(float_flag_invalid, status);
3993                 return packFloat16(aSign, 0, 0);
3994             }
3995             return commonNaNToFloat16(
3996                 float64ToCommonNaN(a, status), status);
3997         }
3998         /* Infinity */
3999         if (!ieee) {
4000             float_raise(float_flag_invalid, status);
4001             return packFloat16(aSign, 0x1f, 0x3ff);
4002         }
4003         return packFloat16(aSign, 0x1f, 0);
4004     }
4005     shift64RightJamming(aSig, 29, &aSig);
4006     zSig = aSig;
4007     if (aExp == 0 && zSig == 0) {
4008         return packFloat16(aSign, 0, 0);
4009     }
4010     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4011      * even if the input is denormal; however this is harmless because
4012      * the largest possible single-precision denormal is still smaller
4013      * than the smallest representable half-precision denormal, and so we
4014      * will end up ignoring aSig and returning via the "always return zero"
4015      * codepath.
4016      */
4017     zSig |= 0x00800000;
4018     aExp -= 0x3F1;
4019
4020     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4021 }
4022
4023 /*----------------------------------------------------------------------------
4024 | Returns the result of converting the double-precision floating-point value
4025 | `a' to the extended double-precision floating-point format.  The conversion
4026 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4027 | Arithmetic.
4028 *----------------------------------------------------------------------------*/
4029
4030 floatx80 float64_to_floatx80(float64 a, float_status *status)
4031 {
4032     flag aSign;
4033     int aExp;
4034     uint64_t aSig;
4035
4036     a = float64_squash_input_denormal(a, status);
4037     aSig = extractFloat64Frac( a );
4038     aExp = extractFloat64Exp( a );
4039     aSign = extractFloat64Sign( a );
4040     if ( aExp == 0x7FF ) {
4041         if (aSig) {
4042             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4043         }
4044         return packFloatx80(aSign,
4045                             floatx80_infinity_high,
4046                             floatx80_infinity_low);
4047     }
4048     if ( aExp == 0 ) {
4049         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4050         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4051     }
4052     return
4053         packFloatx80(
4054             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4055
4056 }
4057
4058 /*----------------------------------------------------------------------------
4059 | Returns the result of converting the double-precision floating-point value
4060 | `a' to the quadruple-precision floating-point format.  The conversion is
4061 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4062 | Arithmetic.
4063 *----------------------------------------------------------------------------*/
4064
4065 float128 float64_to_float128(float64 a, float_status *status)
4066 {
4067     flag aSign;
4068     int aExp;
4069     uint64_t aSig, zSig0, zSig1;
4070
4071     a = float64_squash_input_denormal(a, status);
4072     aSig = extractFloat64Frac( a );
4073     aExp = extractFloat64Exp( a );
4074     aSign = extractFloat64Sign( a );
4075     if ( aExp == 0x7FF ) {
4076         if (aSig) {
4077             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4078         }
4079         return packFloat128( aSign, 0x7FFF, 0, 0 );
4080     }
4081     if ( aExp == 0 ) {
4082         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4083         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4084         --aExp;
4085     }
4086     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4087     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4088
4089 }
4090
4091
4092 /*----------------------------------------------------------------------------
4093 | Returns the remainder of the double-precision floating-point value `a'
4094 | with respect to the corresponding value `b'.  The operation is performed
4095 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4096 *----------------------------------------------------------------------------*/
4097
4098 float64 float64_rem(float64 a, float64 b, float_status *status)
4099 {
4100     flag aSign, zSign;
4101     int aExp, bExp, expDiff;
4102     uint64_t aSig, bSig;
4103     uint64_t q, alternateASig;
4104     int64_t sigMean;
4105
4106     a = float64_squash_input_denormal(a, status);
4107     b = float64_squash_input_denormal(b, status);
4108     aSig = extractFloat64Frac( a );
4109     aExp = extractFloat64Exp( a );
4110     aSign = extractFloat64Sign( a );
4111     bSig = extractFloat64Frac( b );
4112     bExp = extractFloat64Exp( b );
4113     if ( aExp == 0x7FF ) {
4114         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4115             return propagateFloat64NaN(a, b, status);
4116         }
4117         float_raise(float_flag_invalid, status);
4118         return float64_default_nan(status);
4119     }
4120     if ( bExp == 0x7FF ) {
4121         if (bSig) {
4122             return propagateFloat64NaN(a, b, status);
4123         }
4124         return a;
4125     }
4126     if ( bExp == 0 ) {
4127         if ( bSig == 0 ) {
4128             float_raise(float_flag_invalid, status);
4129             return float64_default_nan(status);
4130         }
4131         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4132     }
4133     if ( aExp == 0 ) {
4134         if ( aSig == 0 ) return a;
4135         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4136     }
4137     expDiff = aExp - bExp;
4138     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4139     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4140     if ( expDiff < 0 ) {
4141         if ( expDiff < -1 ) return a;
4142         aSig >>= 1;
4143     }
4144     q = ( bSig <= aSig );
4145     if ( q ) aSig -= bSig;
4146     expDiff -= 64;
4147     while ( 0 < expDiff ) {
4148         q = estimateDiv128To64( aSig, 0, bSig );
4149         q = ( 2 < q ) ? q - 2 : 0;
4150         aSig = - ( ( bSig>>2 ) * q );
4151         expDiff -= 62;
4152     }
4153     expDiff += 64;
4154     if ( 0 < expDiff ) {
4155         q = estimateDiv128To64( aSig, 0, bSig );
4156         q = ( 2 < q ) ? q - 2 : 0;
4157         q >>= 64 - expDiff;
4158         bSig >>= 2;
4159         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4160     }
4161     else {
4162         aSig >>= 2;
4163         bSig >>= 2;
4164     }
4165     do {
4166         alternateASig = aSig;
4167         ++q;
4168         aSig -= bSig;
4169     } while ( 0 <= (int64_t) aSig );
4170     sigMean = aSig + alternateASig;
4171     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4172         aSig = alternateASig;
4173     }
4174     zSign = ( (int64_t) aSig < 0 );
4175     if ( zSign ) aSig = - aSig;
4176     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4177
4178 }
4179
4180 /*----------------------------------------------------------------------------
4181 | Returns the binary log of the double-precision floating-point value `a'.
4182 | The operation is performed according to the IEC/IEEE Standard for Binary
4183 | Floating-Point Arithmetic.
4184 *----------------------------------------------------------------------------*/
4185 float64 float64_log2(float64 a, float_status *status)
4186 {
4187     flag aSign, zSign;
4188     int aExp;
4189     uint64_t aSig, aSig0, aSig1, zSig, i;
4190     a = float64_squash_input_denormal(a, status);
4191
4192     aSig = extractFloat64Frac( a );
4193     aExp = extractFloat64Exp( a );
4194     aSign = extractFloat64Sign( a );
4195
4196     if ( aExp == 0 ) {
4197         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4198         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4199     }
4200     if ( aSign ) {
4201         float_raise(float_flag_invalid, status);
4202         return float64_default_nan(status);
4203     }
4204     if ( aExp == 0x7FF ) {
4205         if (aSig) {
4206             return propagateFloat64NaN(a, float64_zero, status);
4207         }
4208         return a;
4209     }
4210
4211     aExp -= 0x3FF;
4212     aSig |= LIT64( 0x0010000000000000 );
4213     zSign = aExp < 0;
4214     zSig = (uint64_t)aExp << 52;
4215     for (i = 1LL << 51; i > 0; i >>= 1) {
4216         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4217         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4218         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4219             aSig >>= 1;
4220             zSig |= i;
4221         }
4222     }
4223
4224     if ( zSign )
4225         zSig = -zSig;
4226     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4227 }
4228
4229 /*----------------------------------------------------------------------------
4230 | Returns 1 if the double-precision floating-point value `a' is equal to the
4231 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4232 | if either operand is a NaN.  Otherwise, the comparison is performed
4233 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4234 *----------------------------------------------------------------------------*/
4235
4236 int float64_eq(float64 a, float64 b, float_status *status)
4237 {
4238     uint64_t av, bv;
4239     a = float64_squash_input_denormal(a, status);
4240     b = float64_squash_input_denormal(b, status);
4241
4242     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4243          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4244        ) {
4245         float_raise(float_flag_invalid, status);
4246         return 0;
4247     }
4248     av = float64_val(a);
4249     bv = float64_val(b);
4250     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4251
4252 }
4253
4254 /*----------------------------------------------------------------------------
4255 | Returns 1 if the double-precision floating-point value `a' is less than or
4256 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4257 | exception is raised if either operand is a NaN.  The comparison is performed
4258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4259 *----------------------------------------------------------------------------*/
4260
4261 int float64_le(float64 a, float64 b, float_status *status)
4262 {
4263     flag aSign, bSign;
4264     uint64_t av, bv;
4265     a = float64_squash_input_denormal(a, status);
4266     b = float64_squash_input_denormal(b, status);
4267
4268     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4269          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4270        ) {
4271         float_raise(float_flag_invalid, status);
4272         return 0;
4273     }
4274     aSign = extractFloat64Sign( a );
4275     bSign = extractFloat64Sign( b );
4276     av = float64_val(a);
4277     bv = float64_val(b);
4278     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4279     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4280
4281 }
4282
4283 /*----------------------------------------------------------------------------
4284 | Returns 1 if the double-precision floating-point value `a' is less than
4285 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4286 | raised if either operand is a NaN.  The comparison is performed according
4287 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4288 *----------------------------------------------------------------------------*/
4289
4290 int float64_lt(float64 a, float64 b, float_status *status)
4291 {
4292     flag aSign, bSign;
4293     uint64_t av, bv;
4294
4295     a = float64_squash_input_denormal(a, status);
4296     b = float64_squash_input_denormal(b, status);
4297     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4298          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4299        ) {
4300         float_raise(float_flag_invalid, status);
4301         return 0;
4302     }
4303     aSign = extractFloat64Sign( a );
4304     bSign = extractFloat64Sign( b );
4305     av = float64_val(a);
4306     bv = float64_val(b);
4307     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4308     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4309
4310 }
4311
4312 /*----------------------------------------------------------------------------
4313 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4314 | be compared, and 0 otherwise.  The invalid exception is raised if either
4315 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4316 | Standard for Binary Floating-Point Arithmetic.
4317 *----------------------------------------------------------------------------*/
4318
4319 int float64_unordered(float64 a, float64 b, float_status *status)
4320 {
4321     a = float64_squash_input_denormal(a, status);
4322     b = float64_squash_input_denormal(b, status);
4323
4324     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4325          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4326        ) {
4327         float_raise(float_flag_invalid, status);
4328         return 1;
4329     }
4330     return 0;
4331 }
4332
4333 /*----------------------------------------------------------------------------
4334 | Returns 1 if the double-precision floating-point value `a' is equal to the
4335 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4336 | exception.The comparison is performed according to the IEC/IEEE Standard
4337 | for Binary Floating-Point Arithmetic.
4338 *----------------------------------------------------------------------------*/
4339
4340 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4341 {
4342     uint64_t av, bv;
4343     a = float64_squash_input_denormal(a, status);
4344     b = float64_squash_input_denormal(b, status);
4345
4346     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4347          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4348        ) {
4349         if (float64_is_signaling_nan(a, status)
4350          || float64_is_signaling_nan(b, status)) {
4351             float_raise(float_flag_invalid, status);
4352         }
4353         return 0;
4354     }
4355     av = float64_val(a);
4356     bv = float64_val(b);
4357     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4358
4359 }
4360
4361 /*----------------------------------------------------------------------------
4362 | Returns 1 if the double-precision floating-point value `a' is less than or
4363 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4364 | cause an exception.  Otherwise, the comparison is performed according to the
4365 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4366 *----------------------------------------------------------------------------*/
4367
4368 int float64_le_quiet(float64 a, float64 b, float_status *status)
4369 {
4370     flag aSign, bSign;
4371     uint64_t av, bv;
4372     a = float64_squash_input_denormal(a, status);
4373     b = float64_squash_input_denormal(b, status);
4374
4375     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4376          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4377        ) {
4378         if (float64_is_signaling_nan(a, status)
4379          || float64_is_signaling_nan(b, status)) {
4380             float_raise(float_flag_invalid, status);
4381         }
4382         return 0;
4383     }
4384     aSign = extractFloat64Sign( a );
4385     bSign = extractFloat64Sign( b );
4386     av = float64_val(a);
4387     bv = float64_val(b);
4388     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4389     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4390
4391 }
4392
4393 /*----------------------------------------------------------------------------
4394 | Returns 1 if the double-precision floating-point value `a' is less than
4395 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4396 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4397 | Standard for Binary Floating-Point Arithmetic.
4398 *----------------------------------------------------------------------------*/
4399
4400 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4401 {
4402     flag aSign, bSign;
4403     uint64_t av, bv;
4404     a = float64_squash_input_denormal(a, status);
4405     b = float64_squash_input_denormal(b, status);
4406
4407     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4408          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4409        ) {
4410         if (float64_is_signaling_nan(a, status)
4411          || float64_is_signaling_nan(b, status)) {
4412             float_raise(float_flag_invalid, status);
4413         }
4414         return 0;
4415     }
4416     aSign = extractFloat64Sign( a );
4417     bSign = extractFloat64Sign( b );
4418     av = float64_val(a);
4419     bv = float64_val(b);
4420     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4421     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4422
4423 }
4424
4425 /*----------------------------------------------------------------------------
4426 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4427 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4428 | comparison is performed according to the IEC/IEEE Standard for Binary
4429 | Floating-Point Arithmetic.
4430 *----------------------------------------------------------------------------*/
4431
4432 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4433 {
4434     a = float64_squash_input_denormal(a, status);
4435     b = float64_squash_input_denormal(b, status);
4436
4437     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4438          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4439        ) {
4440         if (float64_is_signaling_nan(a, status)
4441          || float64_is_signaling_nan(b, status)) {
4442             float_raise(float_flag_invalid, status);
4443         }
4444         return 1;
4445     }
4446     return 0;
4447 }
4448
4449 /*----------------------------------------------------------------------------
4450 | Returns the result of converting the extended double-precision floating-
4451 | point value `a' to the 32-bit two's complement integer format.  The
4452 | conversion is performed according to the IEC/IEEE Standard for Binary
4453 | Floating-Point Arithmetic---which means in particular that the conversion
4454 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4455 | largest positive integer is returned.  Otherwise, if the conversion
4456 | overflows, the largest integer with the same sign as `a' is returned.
4457 *----------------------------------------------------------------------------*/
4458
4459 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4460 {
4461     flag aSign;
4462     int32_t aExp, shiftCount;
4463     uint64_t aSig;
4464
4465     if (floatx80_invalid_encoding(a)) {
4466         float_raise(float_flag_invalid, status);
4467         return 1 << 31;
4468     }
4469     aSig = extractFloatx80Frac( a );
4470     aExp = extractFloatx80Exp( a );
4471     aSign = extractFloatx80Sign( a );
4472     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4473     shiftCount = 0x4037 - aExp;
4474     if ( shiftCount <= 0 ) shiftCount = 1;
4475     shift64RightJamming( aSig, shiftCount, &aSig );
4476     return roundAndPackInt32(aSign, aSig, status);
4477
4478 }
4479
4480 /*----------------------------------------------------------------------------
4481 | Returns the result of converting the extended double-precision floating-
4482 | point value `a' to the 32-bit two's complement integer format.  The
4483 | conversion is performed according to the IEC/IEEE Standard for Binary
4484 | Floating-Point Arithmetic, except that the conversion is always rounded
4485 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4486 | Otherwise, if the conversion overflows, the largest integer with the same
4487 | sign as `a' is returned.
4488 *----------------------------------------------------------------------------*/
4489
4490 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4491 {
4492     flag aSign;
4493     int32_t aExp, shiftCount;
4494     uint64_t aSig, savedASig;
4495     int32_t z;
4496
4497     if (floatx80_invalid_encoding(a)) {
4498         float_raise(float_flag_invalid, status);
4499         return 1 << 31;
4500     }
4501     aSig = extractFloatx80Frac( a );
4502     aExp = extractFloatx80Exp( a );
4503     aSign = extractFloatx80Sign( a );
4504     if ( 0x401E < aExp ) {
4505         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4506         goto invalid;
4507     }
4508     else if ( aExp < 0x3FFF ) {
4509         if (aExp || aSig) {
4510             status->float_exception_flags |= float_flag_inexact;
4511         }
4512         return 0;
4513     }
4514     shiftCount = 0x403E - aExp;
4515     savedASig = aSig;
4516     aSig >>= shiftCount;
4517     z = aSig;
4518     if ( aSign ) z = - z;
4519     if ( ( z < 0 ) ^ aSign ) {
4520  invalid:
4521         float_raise(float_flag_invalid, status);
4522         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4523     }
4524     if ( ( aSig<<shiftCount ) != savedASig ) {
4525         status->float_exception_flags |= float_flag_inexact;
4526     }
4527     return z;
4528
4529 }
4530
4531 /*----------------------------------------------------------------------------
4532 | Returns the result of converting the extended double-precision floating-
4533 | point value `a' to the 64-bit two's complement integer format.  The
4534 | conversion is performed according to the IEC/IEEE Standard for Binary
4535 | Floating-Point Arithmetic---which means in particular that the conversion
4536 | is rounded according to the current rounding mode.  If `a' is a NaN,
4537 | the largest positive integer is returned.  Otherwise, if the conversion
4538 | overflows, the largest integer with the same sign as `a' is returned.
4539 *----------------------------------------------------------------------------*/
4540
4541 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4542 {
4543     flag aSign;
4544     int32_t aExp, shiftCount;
4545     uint64_t aSig, aSigExtra;
4546
4547     if (floatx80_invalid_encoding(a)) {
4548         float_raise(float_flag_invalid, status);
4549         return 1ULL << 63;
4550     }
4551     aSig = extractFloatx80Frac( a );
4552     aExp = extractFloatx80Exp( a );
4553     aSign = extractFloatx80Sign( a );
4554     shiftCount = 0x403E - aExp;
4555     if ( shiftCount <= 0 ) {
4556         if ( shiftCount ) {
4557             float_raise(float_flag_invalid, status);
4558             if (!aSign || floatx80_is_any_nan(a)) {
4559                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4560             }
4561             return (int64_t) LIT64( 0x8000000000000000 );
4562         }
4563         aSigExtra = 0;
4564     }
4565     else {
4566         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4567     }
4568     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4569
4570 }
4571
4572 /*----------------------------------------------------------------------------
4573 | Returns the result of converting the extended double-precision floating-
4574 | point value `a' to the 64-bit two's complement integer format.  The
4575 | conversion is performed according to the IEC/IEEE Standard for Binary
4576 | Floating-Point Arithmetic, except that the conversion is always rounded
4577 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4578 | Otherwise, if the conversion overflows, the largest integer with the same
4579 | sign as `a' is returned.
4580 *----------------------------------------------------------------------------*/
4581
4582 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4583 {
4584     flag aSign;
4585     int32_t aExp, shiftCount;
4586     uint64_t aSig;
4587     int64_t z;
4588
4589     if (floatx80_invalid_encoding(a)) {
4590         float_raise(float_flag_invalid, status);
4591         return 1ULL << 63;
4592     }
4593     aSig = extractFloatx80Frac( a );
4594     aExp = extractFloatx80Exp( a );
4595     aSign = extractFloatx80Sign( a );
4596     shiftCount = aExp - 0x403E;
4597     if ( 0 <= shiftCount ) {
4598         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4599         if ( ( a.high != 0xC03E ) || aSig ) {
4600             float_raise(float_flag_invalid, status);
4601             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4602                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4603             }
4604         }
4605         return (int64_t) LIT64( 0x8000000000000000 );
4606     }
4607     else if ( aExp < 0x3FFF ) {
4608         if (aExp | aSig) {
4609             status->float_exception_flags |= float_flag_inexact;
4610         }
4611         return 0;
4612     }
4613     z = aSig>>( - shiftCount );
4614     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4615         status->float_exception_flags |= float_flag_inexact;
4616     }
4617     if ( aSign ) z = - z;
4618     return z;
4619
4620 }
4621
4622 /*----------------------------------------------------------------------------
4623 | Returns the result of converting the extended double-precision floating-
4624 | point value `a' to the single-precision floating-point format.  The
4625 | conversion is performed according to the IEC/IEEE Standard for Binary
4626 | Floating-Point Arithmetic.
4627 *----------------------------------------------------------------------------*/
4628
4629 float32 floatx80_to_float32(floatx80 a, float_status *status)
4630 {
4631     flag aSign;
4632     int32_t aExp;
4633     uint64_t aSig;
4634
4635     if (floatx80_invalid_encoding(a)) {
4636         float_raise(float_flag_invalid, status);
4637         return float32_default_nan(status);
4638     }
4639     aSig = extractFloatx80Frac( a );
4640     aExp = extractFloatx80Exp( a );
4641     aSign = extractFloatx80Sign( a );
4642     if ( aExp == 0x7FFF ) {
4643         if ( (uint64_t) ( aSig<<1 ) ) {
4644             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4645         }
4646         return packFloat32( aSign, 0xFF, 0 );
4647     }
4648     shift64RightJamming( aSig, 33, &aSig );
4649     if ( aExp || aSig ) aExp -= 0x3F81;
4650     return roundAndPackFloat32(aSign, aExp, aSig, status);
4651
4652 }
4653
4654 /*----------------------------------------------------------------------------
4655 | Returns the result of converting the extended double-precision floating-
4656 | point value `a' to the double-precision floating-point format.  The
4657 | conversion is performed according to the IEC/IEEE Standard for Binary
4658 | Floating-Point Arithmetic.
4659 *----------------------------------------------------------------------------*/
4660
4661 float64 floatx80_to_float64(floatx80 a, float_status *status)
4662 {
4663     flag aSign;
4664     int32_t aExp;
4665     uint64_t aSig, zSig;
4666
4667     if (floatx80_invalid_encoding(a)) {
4668         float_raise(float_flag_invalid, status);
4669         return float64_default_nan(status);
4670     }
4671     aSig = extractFloatx80Frac( a );
4672     aExp = extractFloatx80Exp( a );
4673     aSign = extractFloatx80Sign( a );
4674     if ( aExp == 0x7FFF ) {
4675         if ( (uint64_t) ( aSig<<1 ) ) {
4676             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4677         }
4678         return packFloat64( aSign, 0x7FF, 0 );
4679     }
4680     shift64RightJamming( aSig, 1, &zSig );
4681     if ( aExp || aSig ) aExp -= 0x3C01;
4682     return roundAndPackFloat64(aSign, aExp, zSig, status);
4683
4684 }
4685
4686 /*----------------------------------------------------------------------------
4687 | Returns the result of converting the extended double-precision floating-
4688 | point value `a' to the quadruple-precision floating-point format.  The
4689 | conversion is performed according to the IEC/IEEE Standard for Binary
4690 | Floating-Point Arithmetic.
4691 *----------------------------------------------------------------------------*/
4692
4693 float128 floatx80_to_float128(floatx80 a, float_status *status)
4694 {
4695     flag aSign;
4696     int aExp;
4697     uint64_t aSig, zSig0, zSig1;
4698
4699     if (floatx80_invalid_encoding(a)) {
4700         float_raise(float_flag_invalid, status);
4701         return float128_default_nan(status);
4702     }
4703     aSig = extractFloatx80Frac( a );
4704     aExp = extractFloatx80Exp( a );
4705     aSign = extractFloatx80Sign( a );
4706     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4707         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4708     }
4709     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4710     return packFloat128( aSign, aExp, zSig0, zSig1 );
4711
4712 }
4713
4714 /*----------------------------------------------------------------------------
4715 | Rounds the extended double-precision floating-point value `a'
4716 | to the precision provided by floatx80_rounding_precision and returns the
4717 | result as an extended double-precision floating-point value.
4718 | The operation is performed according to the IEC/IEEE Standard for Binary
4719 | Floating-Point Arithmetic.
4720 *----------------------------------------------------------------------------*/
4721
4722 floatx80 floatx80_round(floatx80 a, float_status *status)
4723 {
4724     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4725                                 extractFloatx80Sign(a),
4726                                 extractFloatx80Exp(a),
4727                                 extractFloatx80Frac(a), 0, status);
4728 }
4729
4730 /*----------------------------------------------------------------------------
4731 | Rounds the extended double-precision floating-point value `a' to an integer,
4732 | and returns the result as an extended quadruple-precision floating-point
4733 | value.  The operation is performed according to the IEC/IEEE Standard for
4734 | Binary Floating-Point Arithmetic.
4735 *----------------------------------------------------------------------------*/
4736
4737 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4738 {
4739     flag aSign;
4740     int32_t aExp;
4741     uint64_t lastBitMask, roundBitsMask;
4742     floatx80 z;
4743
4744     if (floatx80_invalid_encoding(a)) {
4745         float_raise(float_flag_invalid, status);
4746         return floatx80_default_nan(status);
4747     }
4748     aExp = extractFloatx80Exp( a );
4749     if ( 0x403E <= aExp ) {
4750         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4751             return propagateFloatx80NaN(a, a, status);
4752         }
4753         return a;
4754     }
4755     if ( aExp < 0x3FFF ) {
4756         if (    ( aExp == 0 )
4757              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4758             return a;
4759         }
4760         status->float_exception_flags |= float_flag_inexact;
4761         aSign = extractFloatx80Sign( a );
4762         switch (status->float_rounding_mode) {
4763          case float_round_nearest_even:
4764             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4765                ) {
4766                 return
4767                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4768             }
4769             break;
4770         case float_round_ties_away:
4771             if (aExp == 0x3FFE) {
4772                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4773             }
4774             break;
4775          case float_round_down:
4776             return
4777                   aSign ?
4778                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4779                 : packFloatx80( 0, 0, 0 );
4780          case float_round_up:
4781             return
4782                   aSign ? packFloatx80( 1, 0, 0 )
4783                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4784         }
4785         return packFloatx80( aSign, 0, 0 );
4786     }
4787     lastBitMask = 1;
4788     lastBitMask <<= 0x403E - aExp;
4789     roundBitsMask = lastBitMask - 1;
4790     z = a;
4791     switch (status->float_rounding_mode) {
4792     case float_round_nearest_even:
4793         z.low += lastBitMask>>1;
4794         if ((z.low & roundBitsMask) == 0) {
4795             z.low &= ~lastBitMask;
4796         }
4797         break;
4798     case float_round_ties_away:
4799         z.low += lastBitMask >> 1;
4800         break;
4801     case float_round_to_zero:
4802         break;
4803     case float_round_up:
4804         if (!extractFloatx80Sign(z)) {
4805             z.low += roundBitsMask;
4806         }
4807         break;
4808     case float_round_down:
4809         if (extractFloatx80Sign(z)) {
4810             z.low += roundBitsMask;
4811         }
4812         break;
4813     default:
4814         abort();
4815     }
4816     z.low &= ~ roundBitsMask;
4817     if ( z.low == 0 ) {
4818         ++z.high;
4819         z.low = LIT64( 0x8000000000000000 );
4820     }
4821     if (z.low != a.low) {
4822         status->float_exception_flags |= float_flag_inexact;
4823     }
4824     return z;
4825
4826 }
4827
4828 /*----------------------------------------------------------------------------
4829 | Returns the result of adding the absolute values of the extended double-
4830 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4831 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4832 | The addition is performed according to the IEC/IEEE Standard for Binary
4833 | Floating-Point Arithmetic.
4834 *----------------------------------------------------------------------------*/
4835
4836 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4837                                 float_status *status)
4838 {
4839     int32_t aExp, bExp, zExp;
4840     uint64_t aSig, bSig, zSig0, zSig1;
4841     int32_t expDiff;
4842
4843     aSig = extractFloatx80Frac( a );
4844     aExp = extractFloatx80Exp( a );
4845     bSig = extractFloatx80Frac( b );
4846     bExp = extractFloatx80Exp( b );
4847     expDiff = aExp - bExp;
4848     if ( 0 < expDiff ) {
4849         if ( aExp == 0x7FFF ) {
4850             if ((uint64_t)(aSig << 1)) {
4851                 return propagateFloatx80NaN(a, b, status);
4852             }
4853             return a;
4854         }
4855         if ( bExp == 0 ) --expDiff;
4856         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4857         zExp = aExp;
4858     }
4859     else if ( expDiff < 0 ) {
4860         if ( bExp == 0x7FFF ) {
4861             if ((uint64_t)(bSig << 1)) {
4862                 return propagateFloatx80NaN(a, b, status);
4863             }
4864             return packFloatx80(zSign,
4865                                 floatx80_infinity_high,
4866                                 floatx80_infinity_low);
4867         }
4868         if ( aExp == 0 ) ++expDiff;
4869         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4870         zExp = bExp;
4871     }
4872     else {
4873         if ( aExp == 0x7FFF ) {
4874             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4875                 return propagateFloatx80NaN(a, b, status);
4876             }
4877             return a;
4878         }
4879         zSig1 = 0;
4880         zSig0 = aSig + bSig;
4881         if ( aExp == 0 ) {
4882             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4883             goto roundAndPack;
4884         }
4885         zExp = aExp;
4886         goto shiftRight1;
4887     }
4888     zSig0 = aSig + bSig;
4889     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4890  shiftRight1:
4891     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4892     zSig0 |= LIT64( 0x8000000000000000 );
4893     ++zExp;
4894  roundAndPack:
4895     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4896                                 zSign, zExp, zSig0, zSig1, status);
4897 }
4898
4899 /*----------------------------------------------------------------------------
4900 | Returns the result of subtracting the absolute values of the extended
4901 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4902 | difference is negated before being returned.  `zSign' is ignored if the
4903 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4904 | Standard for Binary Floating-Point Arithmetic.
4905 *----------------------------------------------------------------------------*/
4906
4907 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4908                                 float_status *status)
4909 {
4910     int32_t aExp, bExp, zExp;
4911     uint64_t aSig, bSig, zSig0, zSig1;
4912     int32_t expDiff;
4913
4914     aSig = extractFloatx80Frac( a );
4915     aExp = extractFloatx80Exp( a );
4916     bSig = extractFloatx80Frac( b );
4917     bExp = extractFloatx80Exp( b );
4918     expDiff = aExp - bExp;
4919     if ( 0 < expDiff ) goto aExpBigger;
4920     if ( expDiff < 0 ) goto bExpBigger;
4921     if ( aExp == 0x7FFF ) {
4922         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4923             return propagateFloatx80NaN(a, b, status);
4924         }
4925         float_raise(float_flag_invalid, status);
4926         return floatx80_default_nan(status);
4927     }
4928     if ( aExp == 0 ) {
4929         aExp = 1;
4930         bExp = 1;
4931     }
4932     zSig1 = 0;
4933     if ( bSig < aSig ) goto aBigger;
4934     if ( aSig < bSig ) goto bBigger;
4935     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4936  bExpBigger:
4937     if ( bExp == 0x7FFF ) {
4938         if ((uint64_t)(bSig << 1)) {
4939             return propagateFloatx80NaN(a, b, status);
4940         }
4941         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4942                             floatx80_infinity_low);
4943     }
4944     if ( aExp == 0 ) ++expDiff;
4945     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4946  bBigger:
4947     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4948     zExp = bExp;
4949     zSign ^= 1;
4950     goto normalizeRoundAndPack;
4951  aExpBigger:
4952     if ( aExp == 0x7FFF ) {
4953         if ((uint64_t)(aSig << 1)) {
4954             return propagateFloatx80NaN(a, b, status);
4955         }
4956         return a;
4957     }
4958     if ( bExp == 0 ) --expDiff;
4959     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4960  aBigger:
4961     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4962     zExp = aExp;
4963  normalizeRoundAndPack:
4964     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4965                                          zSign, zExp, zSig0, zSig1, status);
4966 }
4967
4968 /*----------------------------------------------------------------------------
4969 | Returns the result of adding the extended double-precision floating-point
4970 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4971 | Standard for Binary Floating-Point Arithmetic.
4972 *----------------------------------------------------------------------------*/
4973
4974 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4975 {
4976     flag aSign, bSign;
4977
4978     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4979         float_raise(float_flag_invalid, status);
4980         return floatx80_default_nan(status);
4981     }
4982     aSign = extractFloatx80Sign( a );
4983     bSign = extractFloatx80Sign( b );
4984     if ( aSign == bSign ) {
4985         return addFloatx80Sigs(a, b, aSign, status);
4986     }
4987     else {
4988         return subFloatx80Sigs(a, b, aSign, status);
4989     }
4990
4991 }
4992
4993 /*----------------------------------------------------------------------------
4994 | Returns the result of subtracting the extended double-precision floating-
4995 | point values `a' and `b'.  The operation is performed according to the
4996 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4997 *----------------------------------------------------------------------------*/
4998
4999 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5000 {
5001     flag aSign, bSign;
5002
5003     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5004         float_raise(float_flag_invalid, status);
5005         return floatx80_default_nan(status);
5006     }
5007     aSign = extractFloatx80Sign( a );
5008     bSign = extractFloatx80Sign( b );
5009     if ( aSign == bSign ) {
5010         return subFloatx80Sigs(a, b, aSign, status);
5011     }
5012     else {
5013         return addFloatx80Sigs(a, b, aSign, status);
5014     }
5015
5016 }
5017
5018 /*----------------------------------------------------------------------------
5019 | Returns the result of multiplying the extended double-precision floating-
5020 | point values `a' and `b'.  The operation is performed according to the
5021 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5022 *----------------------------------------------------------------------------*/
5023
5024 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5025 {
5026     flag aSign, bSign, zSign;
5027     int32_t aExp, bExp, zExp;
5028     uint64_t aSig, bSig, zSig0, zSig1;
5029
5030     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5031         float_raise(float_flag_invalid, status);
5032         return floatx80_default_nan(status);
5033     }
5034     aSig = extractFloatx80Frac( a );
5035     aExp = extractFloatx80Exp( a );
5036     aSign = extractFloatx80Sign( a );
5037     bSig = extractFloatx80Frac( b );
5038     bExp = extractFloatx80Exp( b );
5039     bSign = extractFloatx80Sign( b );
5040     zSign = aSign ^ bSign;
5041     if ( aExp == 0x7FFF ) {
5042         if (    (uint64_t) ( aSig<<1 )
5043              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5044             return propagateFloatx80NaN(a, b, status);
5045         }
5046         if ( ( bExp | bSig ) == 0 ) goto invalid;
5047         return packFloatx80(zSign, floatx80_infinity_high,
5048                                    floatx80_infinity_low);
5049     }
5050     if ( bExp == 0x7FFF ) {
5051         if ((uint64_t)(bSig << 1)) {
5052             return propagateFloatx80NaN(a, b, status);
5053         }
5054         if ( ( aExp | aSig ) == 0 ) {
5055  invalid:
5056             float_raise(float_flag_invalid, status);
5057             return floatx80_default_nan(status);
5058         }
5059         return packFloatx80(zSign, floatx80_infinity_high,
5060                                    floatx80_infinity_low);
5061     }
5062     if ( aExp == 0 ) {
5063         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5064         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5065     }
5066     if ( bExp == 0 ) {
5067         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5068         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5069     }
5070     zExp = aExp + bExp - 0x3FFE;
5071     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5072     if ( 0 < (int64_t) zSig0 ) {
5073         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5074         --zExp;
5075     }
5076     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5077                                 zSign, zExp, zSig0, zSig1, status);
5078 }
5079
5080 /*----------------------------------------------------------------------------
5081 | Returns the result of dividing the extended double-precision floating-point
5082 | value `a' by the corresponding value `b'.  The operation is performed
5083 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5084 *----------------------------------------------------------------------------*/
5085
5086 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5087 {
5088     flag aSign, bSign, zSign;
5089     int32_t aExp, bExp, zExp;
5090     uint64_t aSig, bSig, zSig0, zSig1;
5091     uint64_t rem0, rem1, rem2, term0, term1, term2;
5092
5093     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5094         float_raise(float_flag_invalid, status);
5095         return floatx80_default_nan(status);
5096     }
5097     aSig = extractFloatx80Frac( a );
5098     aExp = extractFloatx80Exp( a );
5099     aSign = extractFloatx80Sign( a );
5100     bSig = extractFloatx80Frac( b );
5101     bExp = extractFloatx80Exp( b );
5102     bSign = extractFloatx80Sign( b );
5103     zSign = aSign ^ bSign;
5104     if ( aExp == 0x7FFF ) {
5105         if ((uint64_t)(aSig << 1)) {
5106             return propagateFloatx80NaN(a, b, status);
5107         }
5108         if ( bExp == 0x7FFF ) {
5109             if ((uint64_t)(bSig << 1)) {
5110                 return propagateFloatx80NaN(a, b, status);
5111             }
5112             goto invalid;
5113         }
5114         return packFloatx80(zSign, floatx80_infinity_high,
5115                                    floatx80_infinity_low);
5116     }
5117     if ( bExp == 0x7FFF ) {
5118         if ((uint64_t)(bSig << 1)) {
5119             return propagateFloatx80NaN(a, b, status);
5120         }
5121         return packFloatx80( zSign, 0, 0 );
5122     }
5123     if ( bExp == 0 ) {
5124         if ( bSig == 0 ) {
5125             if ( ( aExp | aSig ) == 0 ) {
5126  invalid:
5127                 float_raise(float_flag_invalid, status);
5128                 return floatx80_default_nan(status);
5129             }
5130             float_raise(float_flag_divbyzero, status);
5131             return packFloatx80(zSign, floatx80_infinity_high,
5132                                        floatx80_infinity_low);
5133         }
5134         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5135     }
5136     if ( aExp == 0 ) {
5137         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5138         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5139     }
5140     zExp = aExp - bExp + 0x3FFE;
5141     rem1 = 0;
5142     if ( bSig <= aSig ) {
5143         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5144         ++zExp;
5145     }
5146     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5147     mul64To128( bSig, zSig0, &term0, &term1 );
5148     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5149     while ( (int64_t) rem0 < 0 ) {
5150         --zSig0;
5151         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5152     }
5153     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5154     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5155         mul64To128( bSig, zSig1, &term1, &term2 );
5156         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5157         while ( (int64_t) rem1 < 0 ) {
5158             --zSig1;
5159             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5160         }
5161         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5162     }
5163     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5164                                 zSign, zExp, zSig0, zSig1, status);
5165 }
5166
5167 /*----------------------------------------------------------------------------
5168 | Returns the remainder of the extended double-precision floating-point value
5169 | `a' with respect to the corresponding value `b'.  The operation is performed
5170 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5172
5173 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5174 {
5175     flag aSign, zSign;
5176     int32_t aExp, bExp, expDiff;
5177     uint64_t aSig0, aSig1, bSig;
5178     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5179
5180     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5181         float_raise(float_flag_invalid, status);
5182         return floatx80_default_nan(status);
5183     }
5184     aSig0 = extractFloatx80Frac( a );
5185     aExp = extractFloatx80Exp( a );
5186     aSign = extractFloatx80Sign( a );
5187     bSig = extractFloatx80Frac( b );
5188     bExp = extractFloatx80Exp( b );
5189     if ( aExp == 0x7FFF ) {
5190         if (    (uint64_t) ( aSig0<<1 )
5191              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5192             return propagateFloatx80NaN(a, b, status);
5193         }
5194         goto invalid;
5195     }
5196     if ( bExp == 0x7FFF ) {
5197         if ((uint64_t)(bSig << 1)) {
5198             return propagateFloatx80NaN(a, b, status);
5199         }
5200         return a;
5201     }
5202     if ( bExp == 0 ) {
5203         if ( bSig == 0 ) {
5204  invalid:
5205             float_raise(float_flag_invalid, status);
5206             return floatx80_default_nan(status);
5207         }
5208         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5209     }
5210     if ( aExp == 0 ) {
5211         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5212         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5213     }
5214     bSig |= LIT64( 0x8000000000000000 );
5215     zSign = aSign;
5216     expDiff = aExp - bExp;
5217     aSig1 = 0;
5218     if ( expDiff < 0 ) {
5219         if ( expDiff < -1 ) return a;
5220         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5221         expDiff = 0;
5222     }
5223     q = ( bSig <= aSig0 );
5224     if ( q ) aSig0 -= bSig;
5225     expDiff -= 64;
5226     while ( 0 < expDiff ) {
5227         q = estimateDiv128To64( aSig0, aSig1, bSig );
5228         q = ( 2 < q ) ? q - 2 : 0;
5229         mul64To128( bSig, q, &term0, &term1 );
5230         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5231         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5232         expDiff -= 62;
5233     }
5234     expDiff += 64;
5235     if ( 0 < expDiff ) {
5236         q = estimateDiv128To64( aSig0, aSig1, bSig );
5237         q = ( 2 < q ) ? q - 2 : 0;
5238         q >>= 64 - expDiff;
5239         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5240         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5241         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5242         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5243             ++q;
5244             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5245         }
5246     }
5247     else {
5248         term1 = 0;
5249         term0 = bSig;
5250     }
5251     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5252     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5253          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5254               && ( q & 1 ) )
5255        ) {
5256         aSig0 = alternateASig0;
5257         aSig1 = alternateASig1;
5258         zSign = ! zSign;
5259     }
5260     return
5261         normalizeRoundAndPackFloatx80(
5262             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5263
5264 }
5265
5266 /*----------------------------------------------------------------------------
5267 | Returns the square root of the extended double-precision floating-point
5268 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5269 | for Binary Floating-Point Arithmetic.
5270 *----------------------------------------------------------------------------*/
5271
5272 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5273 {
5274     flag aSign;
5275     int32_t aExp, zExp;
5276     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5277     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5278
5279     if (floatx80_invalid_encoding(a)) {
5280         float_raise(float_flag_invalid, status);
5281         return floatx80_default_nan(status);
5282     }
5283     aSig0 = extractFloatx80Frac( a );
5284     aExp = extractFloatx80Exp( a );
5285     aSign = extractFloatx80Sign( a );
5286     if ( aExp == 0x7FFF ) {
5287         if ((uint64_t)(aSig0 << 1)) {
5288             return propagateFloatx80NaN(a, a, status);
5289         }
5290         if ( ! aSign ) return a;
5291         goto invalid;
5292     }
5293     if ( aSign ) {
5294         if ( ( aExp | aSig0 ) == 0 ) return a;
5295  invalid:
5296         float_raise(float_flag_invalid, status);
5297         return floatx80_default_nan(status);
5298     }
5299     if ( aExp == 0 ) {
5300         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5301         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5302     }
5303     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5304     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5305     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5306     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5307     doubleZSig0 = zSig0<<1;
5308     mul64To128( zSig0, zSig0, &term0, &term1 );
5309     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5310     while ( (int64_t) rem0 < 0 ) {
5311         --zSig0;
5312         doubleZSig0 -= 2;
5313         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5314     }
5315     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5316     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5317         if ( zSig1 == 0 ) zSig1 = 1;
5318         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5319         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5320         mul64To128( zSig1, zSig1, &term2, &term3 );
5321         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5322         while ( (int64_t) rem1 < 0 ) {
5323             --zSig1;
5324             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5325             term3 |= 1;
5326             term2 |= doubleZSig0;
5327             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5328         }
5329         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5330     }
5331     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5332     zSig0 |= doubleZSig0;
5333     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5334                                 0, zExp, zSig0, zSig1, status);
5335 }
5336
5337 /*----------------------------------------------------------------------------
5338 | Returns 1 if the extended double-precision floating-point value `a' is equal
5339 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5340 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5341 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5342 *----------------------------------------------------------------------------*/
5343
5344 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5345 {
5346
5347     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5348         || (extractFloatx80Exp(a) == 0x7FFF
5349             && (uint64_t) (extractFloatx80Frac(a) << 1))
5350         || (extractFloatx80Exp(b) == 0x7FFF
5351             && (uint64_t) (extractFloatx80Frac(b) << 1))
5352        ) {
5353         float_raise(float_flag_invalid, status);
5354         return 0;
5355     }
5356     return
5357            ( a.low == b.low )
5358         && (    ( a.high == b.high )
5359              || (    ( a.low == 0 )
5360                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5361            );
5362
5363 }
5364
5365 /*----------------------------------------------------------------------------
5366 | Returns 1 if the extended double-precision floating-point value `a' is
5367 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5368 | invalid exception is raised if either operand is a NaN.  The comparison is
5369 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5370 | Arithmetic.
5371 *----------------------------------------------------------------------------*/
5372
5373 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5374 {
5375     flag aSign, bSign;
5376
5377     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5378         || (extractFloatx80Exp(a) == 0x7FFF
5379             && (uint64_t) (extractFloatx80Frac(a) << 1))
5380         || (extractFloatx80Exp(b) == 0x7FFF
5381             && (uint64_t) (extractFloatx80Frac(b) << 1))
5382        ) {
5383         float_raise(float_flag_invalid, status);
5384         return 0;
5385     }
5386     aSign = extractFloatx80Sign( a );
5387     bSign = extractFloatx80Sign( b );
5388     if ( aSign != bSign ) {
5389         return
5390                aSign
5391             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5392                  == 0 );
5393     }
5394     return
5395           aSign ? le128( b.high, b.low, a.high, a.low )
5396         : le128( a.high, a.low, b.high, b.low );
5397
5398 }
5399
5400 /*----------------------------------------------------------------------------
5401 | Returns 1 if the extended double-precision floating-point value `a' is
5402 | less than the corresponding value `b', and 0 otherwise.  The invalid
5403 | exception is raised if either operand is a NaN.  The comparison is performed
5404 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5405 *----------------------------------------------------------------------------*/
5406
5407 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5408 {
5409     flag aSign, bSign;
5410
5411     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5412         || (extractFloatx80Exp(a) == 0x7FFF
5413             && (uint64_t) (extractFloatx80Frac(a) << 1))
5414         || (extractFloatx80Exp(b) == 0x7FFF
5415             && (uint64_t) (extractFloatx80Frac(b) << 1))
5416        ) {
5417         float_raise(float_flag_invalid, status);
5418         return 0;
5419     }
5420     aSign = extractFloatx80Sign( a );
5421     bSign = extractFloatx80Sign( b );
5422     if ( aSign != bSign ) {
5423         return
5424                aSign
5425             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5426                  != 0 );
5427     }
5428     return
5429           aSign ? lt128( b.high, b.low, a.high, a.low )
5430         : lt128( a.high, a.low, b.high, b.low );
5431
5432 }
5433
5434 /*----------------------------------------------------------------------------
5435 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5436 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5437 | either operand is a NaN.   The comparison is performed according to the
5438 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5439 *----------------------------------------------------------------------------*/
5440 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5441 {
5442     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5443         || (extractFloatx80Exp(a) == 0x7FFF
5444             && (uint64_t) (extractFloatx80Frac(a) << 1))
5445         || (extractFloatx80Exp(b) == 0x7FFF
5446             && (uint64_t) (extractFloatx80Frac(b) << 1))
5447        ) {
5448         float_raise(float_flag_invalid, status);
5449         return 1;
5450     }
5451     return 0;
5452 }
5453
5454 /*----------------------------------------------------------------------------
5455 | Returns 1 if the extended double-precision floating-point value `a' is
5456 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5457 | cause an exception.  The comparison is performed according to the IEC/IEEE
5458 | Standard for Binary Floating-Point Arithmetic.
5459 *----------------------------------------------------------------------------*/
5460
5461 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5462 {
5463
5464     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5465         float_raise(float_flag_invalid, status);
5466         return 0;
5467     }
5468     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5469               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5470          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5471               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5472        ) {
5473         if (floatx80_is_signaling_nan(a, status)
5474          || floatx80_is_signaling_nan(b, status)) {
5475             float_raise(float_flag_invalid, status);
5476         }
5477         return 0;
5478     }
5479     return
5480            ( a.low == b.low )
5481         && (    ( a.high == b.high )
5482              || (    ( a.low == 0 )
5483                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5484            );
5485
5486 }
5487
5488 /*----------------------------------------------------------------------------
5489 | Returns 1 if the extended double-precision floating-point value `a' is less
5490 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5491 | do not cause an exception.  Otherwise, the comparison is performed according
5492 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5493 *----------------------------------------------------------------------------*/
5494
5495 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5496 {
5497     flag aSign, bSign;
5498
5499     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5500         float_raise(float_flag_invalid, status);
5501         return 0;
5502     }
5503     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5504               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5505          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5506               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5507        ) {
5508         if (floatx80_is_signaling_nan(a, status)
5509          || floatx80_is_signaling_nan(b, status)) {
5510             float_raise(float_flag_invalid, status);
5511         }
5512         return 0;
5513     }
5514     aSign = extractFloatx80Sign( a );
5515     bSign = extractFloatx80Sign( b );
5516     if ( aSign != bSign ) {
5517         return
5518                aSign
5519             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5520                  == 0 );
5521     }
5522     return
5523           aSign ? le128( b.high, b.low, a.high, a.low )
5524         : le128( a.high, a.low, b.high, b.low );
5525
5526 }
5527
5528 /*----------------------------------------------------------------------------
5529 | Returns 1 if the extended double-precision floating-point value `a' is less
5530 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5531 | an exception.  Otherwise, the comparison is performed according to the
5532 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5533 *----------------------------------------------------------------------------*/
5534
5535 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5536 {
5537     flag aSign, bSign;
5538
5539     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5540         float_raise(float_flag_invalid, status);
5541         return 0;
5542     }
5543     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5544               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5545          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5546               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5547        ) {
5548         if (floatx80_is_signaling_nan(a, status)
5549          || floatx80_is_signaling_nan(b, status)) {
5550             float_raise(float_flag_invalid, status);
5551         }
5552         return 0;
5553     }
5554     aSign = extractFloatx80Sign( a );
5555     bSign = extractFloatx80Sign( b );
5556     if ( aSign != bSign ) {
5557         return
5558                aSign
5559             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5560                  != 0 );
5561     }
5562     return
5563           aSign ? lt128( b.high, b.low, a.high, a.low )
5564         : lt128( a.high, a.low, b.high, b.low );
5565
5566 }
5567
5568 /*----------------------------------------------------------------------------
5569 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5570 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5571 | The comparison is performed according to the IEC/IEEE Standard for Binary
5572 | Floating-Point Arithmetic.
5573 *----------------------------------------------------------------------------*/
5574 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5575 {
5576     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5577         float_raise(float_flag_invalid, status);
5578         return 1;
5579     }
5580     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5581               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5582          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5583               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5584        ) {
5585         if (floatx80_is_signaling_nan(a, status)
5586          || floatx80_is_signaling_nan(b, status)) {
5587             float_raise(float_flag_invalid, status);
5588         }
5589         return 1;
5590     }
5591     return 0;
5592 }
5593
5594 /*----------------------------------------------------------------------------
5595 | Returns the result of converting the quadruple-precision floating-point
5596 | value `a' to the 32-bit two's complement integer format.  The conversion
5597 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5598 | Arithmetic---which means in particular that the conversion is rounded
5599 | according to the current rounding mode.  If `a' is a NaN, the largest
5600 | positive integer is returned.  Otherwise, if the conversion overflows, the
5601 | largest integer with the same sign as `a' is returned.
5602 *----------------------------------------------------------------------------*/
5603
5604 int32_t float128_to_int32(float128 a, float_status *status)
5605 {
5606     flag aSign;
5607     int32_t aExp, shiftCount;
5608     uint64_t aSig0, aSig1;
5609
5610     aSig1 = extractFloat128Frac1( a );
5611     aSig0 = extractFloat128Frac0( a );
5612     aExp = extractFloat128Exp( a );
5613     aSign = extractFloat128Sign( a );
5614     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5615     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5616     aSig0 |= ( aSig1 != 0 );
5617     shiftCount = 0x4028 - aExp;
5618     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5619     return roundAndPackInt32(aSign, aSig0, status);
5620
5621 }
5622
5623 /*----------------------------------------------------------------------------
5624 | Returns the result of converting the quadruple-precision floating-point
5625 | value `a' to the 32-bit two's complement integer format.  The conversion
5626 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5627 | Arithmetic, except that the conversion is always rounded toward zero.  If
5628 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5629 | conversion overflows, the largest integer with the same sign as `a' is
5630 | returned.
5631 *----------------------------------------------------------------------------*/
5632
5633 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5634 {
5635     flag aSign;
5636     int32_t aExp, shiftCount;
5637     uint64_t aSig0, aSig1, savedASig;
5638     int32_t z;
5639
5640     aSig1 = extractFloat128Frac1( a );
5641     aSig0 = extractFloat128Frac0( a );
5642     aExp = extractFloat128Exp( a );
5643     aSign = extractFloat128Sign( a );
5644     aSig0 |= ( aSig1 != 0 );
5645     if ( 0x401E < aExp ) {
5646         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5647         goto invalid;
5648     }
5649     else if ( aExp < 0x3FFF ) {
5650         if (aExp || aSig0) {
5651             status->float_exception_flags |= float_flag_inexact;
5652         }
5653         return 0;
5654     }
5655     aSig0 |= LIT64( 0x0001000000000000 );
5656     shiftCount = 0x402F - aExp;
5657     savedASig = aSig0;
5658     aSig0 >>= shiftCount;
5659     z = aSig0;
5660     if ( aSign ) z = - z;
5661     if ( ( z < 0 ) ^ aSign ) {
5662  invalid:
5663         float_raise(float_flag_invalid, status);
5664         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5665     }
5666     if ( ( aSig0<<shiftCount ) != savedASig ) {
5667         status->float_exception_flags |= float_flag_inexact;
5668     }
5669     return z;
5670
5671 }
5672
5673 /*----------------------------------------------------------------------------
5674 | Returns the result of converting the quadruple-precision floating-point
5675 | value `a' to the 64-bit two's complement integer format.  The conversion
5676 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5677 | Arithmetic---which means in particular that the conversion is rounded
5678 | according to the current rounding mode.  If `a' is a NaN, the largest
5679 | positive integer is returned.  Otherwise, if the conversion overflows, the
5680 | largest integer with the same sign as `a' is returned.
5681 *----------------------------------------------------------------------------*/
5682
5683 int64_t float128_to_int64(float128 a, float_status *status)
5684 {
5685     flag aSign;
5686     int32_t aExp, shiftCount;
5687     uint64_t aSig0, aSig1;
5688
5689     aSig1 = extractFloat128Frac1( a );
5690     aSig0 = extractFloat128Frac0( a );
5691     aExp = extractFloat128Exp( a );
5692     aSign = extractFloat128Sign( a );
5693     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5694     shiftCount = 0x402F - aExp;
5695     if ( shiftCount <= 0 ) {
5696         if ( 0x403E < aExp ) {
5697             float_raise(float_flag_invalid, status);
5698             if (    ! aSign
5699                  || (    ( aExp == 0x7FFF )
5700                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5701                     )
5702                ) {
5703                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5704             }
5705             return (int64_t) LIT64( 0x8000000000000000 );
5706         }
5707         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5708     }
5709     else {
5710         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5711     }
5712     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5713
5714 }
5715
5716 /*----------------------------------------------------------------------------
5717 | Returns the result of converting the quadruple-precision floating-point
5718 | value `a' to the 64-bit two's complement integer format.  The conversion
5719 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5720 | Arithmetic, except that the conversion is always rounded toward zero.
5721 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5722 | the conversion overflows, the largest integer with the same sign as `a' is
5723 | returned.
5724 *----------------------------------------------------------------------------*/
5725
5726 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5727 {
5728     flag aSign;
5729     int32_t aExp, shiftCount;
5730     uint64_t aSig0, aSig1;
5731     int64_t z;
5732
5733     aSig1 = extractFloat128Frac1( a );
5734     aSig0 = extractFloat128Frac0( a );
5735     aExp = extractFloat128Exp( a );
5736     aSign = extractFloat128Sign( a );
5737     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5738     shiftCount = aExp - 0x402F;
5739     if ( 0 < shiftCount ) {
5740         if ( 0x403E <= aExp ) {
5741             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5742             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5743                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5744                 if (aSig1) {
5745                     status->float_exception_flags |= float_flag_inexact;
5746                 }
5747             }
5748             else {
5749                 float_raise(float_flag_invalid, status);
5750                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5751                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5752                 }
5753             }
5754             return (int64_t) LIT64( 0x8000000000000000 );
5755         }
5756         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5757         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5758             status->float_exception_flags |= float_flag_inexact;
5759         }
5760     }
5761     else {
5762         if ( aExp < 0x3FFF ) {
5763             if ( aExp | aSig0 | aSig1 ) {
5764                 status->float_exception_flags |= float_flag_inexact;
5765             }
5766             return 0;
5767         }
5768         z = aSig0>>( - shiftCount );
5769         if (    aSig1
5770              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5771             status->float_exception_flags |= float_flag_inexact;
5772         }
5773     }
5774     if ( aSign ) z = - z;
5775     return z;
5776
5777 }
5778
5779 /*----------------------------------------------------------------------------
5780 | Returns the result of converting the quadruple-precision floating-point value
5781 | `a' to the 64-bit unsigned integer format.  The conversion is
5782 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5783 | Arithmetic---which means in particular that the conversion is rounded
5784 | according to the current rounding mode.  If `a' is a NaN, the largest
5785 | positive integer is returned.  If the conversion overflows, the
5786 | largest unsigned integer is returned.  If 'a' is negative, the value is
5787 | rounded and zero is returned; negative values that do not round to zero
5788 | will raise the inexact exception.
5789 *----------------------------------------------------------------------------*/
5790
5791 uint64_t float128_to_uint64(float128 a, float_status *status)
5792 {
5793     flag aSign;
5794     int aExp;
5795     int shiftCount;
5796     uint64_t aSig0, aSig1;
5797
5798     aSig0 = extractFloat128Frac0(a);
5799     aSig1 = extractFloat128Frac1(a);
5800     aExp = extractFloat128Exp(a);
5801     aSign = extractFloat128Sign(a);
5802     if (aSign && (aExp > 0x3FFE)) {
5803         float_raise(float_flag_invalid, status);
5804         if (float128_is_any_nan(a)) {
5805             return LIT64(0xFFFFFFFFFFFFFFFF);
5806         } else {
5807             return 0;
5808         }
5809     }
5810     if (aExp) {
5811         aSig0 |= LIT64(0x0001000000000000);
5812     }
5813     shiftCount = 0x402F - aExp;
5814     if (shiftCount <= 0) {
5815         if (0x403E < aExp) {
5816             float_raise(float_flag_invalid, status);
5817             return LIT64(0xFFFFFFFFFFFFFFFF);
5818         }
5819         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5820     } else {
5821         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5822     }
5823     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5824 }
5825
5826 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5827 {
5828     uint64_t v;
5829     signed char current_rounding_mode = status->float_rounding_mode;
5830
5831     set_float_rounding_mode(float_round_to_zero, status);
5832     v = float128_to_uint64(a, status);
5833     set_float_rounding_mode(current_rounding_mode, status);
5834
5835     return v;
5836 }
5837
5838 /*----------------------------------------------------------------------------
5839 | Returns the result of converting the quadruple-precision floating-point
5840 | value `a' to the 32-bit unsigned integer format.  The conversion
5841 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5842 | Arithmetic except that the conversion is always rounded toward zero.
5843 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5844 | if the conversion overflows, the largest unsigned integer is returned.
5845 | If 'a' is negative, the value is rounded and zero is returned; negative
5846 | values that do not round to zero will raise the inexact exception.
5847 *----------------------------------------------------------------------------*/
5848
5849 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5850 {
5851     uint64_t v;
5852     uint32_t res;
5853     int old_exc_flags = get_float_exception_flags(status);
5854
5855     v = float128_to_uint64_round_to_zero(a, status);
5856     if (v > 0xffffffff) {
5857         res = 0xffffffff;
5858     } else {
5859         return v;
5860     }
5861     set_float_exception_flags(old_exc_flags, status);
5862     float_raise(float_flag_invalid, status);
5863     return res;
5864 }
5865
5866 /*----------------------------------------------------------------------------
5867 | Returns the result of converting the quadruple-precision floating-point
5868 | value `a' to the single-precision floating-point format.  The conversion
5869 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5870 | Arithmetic.
5871 *----------------------------------------------------------------------------*/
5872
5873 float32 float128_to_float32(float128 a, float_status *status)
5874 {
5875     flag aSign;
5876     int32_t aExp;
5877     uint64_t aSig0, aSig1;
5878     uint32_t zSig;
5879
5880     aSig1 = extractFloat128Frac1( a );
5881     aSig0 = extractFloat128Frac0( a );
5882     aExp = extractFloat128Exp( a );
5883     aSign = extractFloat128Sign( a );
5884     if ( aExp == 0x7FFF ) {
5885         if ( aSig0 | aSig1 ) {
5886             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5887         }
5888         return packFloat32( aSign, 0xFF, 0 );
5889     }
5890     aSig0 |= ( aSig1 != 0 );
5891     shift64RightJamming( aSig0, 18, &aSig0 );
5892     zSig = aSig0;
5893     if ( aExp || zSig ) {
5894         zSig |= 0x40000000;
5895         aExp -= 0x3F81;
5896     }
5897     return roundAndPackFloat32(aSign, aExp, zSig, status);
5898
5899 }
5900
5901 /*----------------------------------------------------------------------------
5902 | Returns the result of converting the quadruple-precision floating-point
5903 | value `a' to the double-precision floating-point format.  The conversion
5904 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5905 | Arithmetic.
5906 *----------------------------------------------------------------------------*/
5907
5908 float64 float128_to_float64(float128 a, float_status *status)
5909 {
5910     flag aSign;
5911     int32_t aExp;
5912     uint64_t aSig0, aSig1;
5913
5914     aSig1 = extractFloat128Frac1( a );
5915     aSig0 = extractFloat128Frac0( a );
5916     aExp = extractFloat128Exp( a );
5917     aSign = extractFloat128Sign( a );
5918     if ( aExp == 0x7FFF ) {
5919         if ( aSig0 | aSig1 ) {
5920             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5921         }
5922         return packFloat64( aSign, 0x7FF, 0 );
5923     }
5924     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5925     aSig0 |= ( aSig1 != 0 );
5926     if ( aExp || aSig0 ) {
5927         aSig0 |= LIT64( 0x4000000000000000 );
5928         aExp -= 0x3C01;
5929     }
5930     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5931
5932 }
5933
5934 /*----------------------------------------------------------------------------
5935 | Returns the result of converting the quadruple-precision floating-point
5936 | value `a' to the extended double-precision floating-point format.  The
5937 | conversion is performed according to the IEC/IEEE Standard for Binary
5938 | Floating-Point Arithmetic.
5939 *----------------------------------------------------------------------------*/
5940
5941 floatx80 float128_to_floatx80(float128 a, float_status *status)
5942 {
5943     flag aSign;
5944     int32_t aExp;
5945     uint64_t aSig0, aSig1;
5946
5947     aSig1 = extractFloat128Frac1( a );
5948     aSig0 = extractFloat128Frac0( a );
5949     aExp = extractFloat128Exp( a );
5950     aSign = extractFloat128Sign( a );
5951     if ( aExp == 0x7FFF ) {
5952         if ( aSig0 | aSig1 ) {
5953             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5954         }
5955         return packFloatx80(aSign, floatx80_infinity_high,
5956                                    floatx80_infinity_low);
5957     }
5958     if ( aExp == 0 ) {
5959         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5960         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5961     }
5962     else {
5963         aSig0 |= LIT64( 0x0001000000000000 );
5964     }
5965     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5966     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5967
5968 }
5969
5970 /*----------------------------------------------------------------------------
5971 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5972 | returns the result as a quadruple-precision floating-point value.  The
5973 | operation is performed according to the IEC/IEEE Standard for Binary
5974 | Floating-Point Arithmetic.
5975 *----------------------------------------------------------------------------*/
5976
5977 float128 float128_round_to_int(float128 a, float_status *status)
5978 {
5979     flag aSign;
5980     int32_t aExp;
5981     uint64_t lastBitMask, roundBitsMask;
5982     float128 z;
5983
5984     aExp = extractFloat128Exp( a );
5985     if ( 0x402F <= aExp ) {
5986         if ( 0x406F <= aExp ) {
5987             if (    ( aExp == 0x7FFF )
5988                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5989                ) {
5990                 return propagateFloat128NaN(a, a, status);
5991             }
5992             return a;
5993         }
5994         lastBitMask = 1;
5995         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5996         roundBitsMask = lastBitMask - 1;
5997         z = a;
5998         switch (status->float_rounding_mode) {
5999         case float_round_nearest_even:
6000             if ( lastBitMask ) {
6001                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6002                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6003             }
6004             else {
6005                 if ( (int64_t) z.low < 0 ) {
6006                     ++z.high;
6007                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6008                 }
6009             }
6010             break;
6011         case float_round_ties_away:
6012             if (lastBitMask) {
6013                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6014             } else {
6015                 if ((int64_t) z.low < 0) {
6016                     ++z.high;
6017                 }
6018             }
6019             break;
6020         case float_round_to_zero:
6021             break;
6022         case float_round_up:
6023             if (!extractFloat128Sign(z)) {
6024                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6025             }
6026             break;
6027         case float_round_down:
6028             if (extractFloat128Sign(z)) {
6029                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6030             }
6031             break;
6032         default:
6033             abort();
6034         }
6035         z.low &= ~ roundBitsMask;
6036     }
6037     else {
6038         if ( aExp < 0x3FFF ) {
6039             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6040             status->float_exception_flags |= float_flag_inexact;
6041             aSign = extractFloat128Sign( a );
6042             switch (status->float_rounding_mode) {
6043              case float_round_nearest_even:
6044                 if (    ( aExp == 0x3FFE )
6045                      && (   extractFloat128Frac0( a )
6046                           | extractFloat128Frac1( a ) )
6047                    ) {
6048                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6049                 }
6050                 break;
6051             case float_round_ties_away:
6052                 if (aExp == 0x3FFE) {
6053                     return packFloat128(aSign, 0x3FFF, 0, 0);
6054                 }
6055                 break;
6056              case float_round_down:
6057                 return
6058                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6059                     : packFloat128( 0, 0, 0, 0 );
6060              case float_round_up:
6061                 return
6062                       aSign ? packFloat128( 1, 0, 0, 0 )
6063                     : packFloat128( 0, 0x3FFF, 0, 0 );
6064             }
6065             return packFloat128( aSign, 0, 0, 0 );
6066         }
6067         lastBitMask = 1;
6068         lastBitMask <<= 0x402F - aExp;
6069         roundBitsMask = lastBitMask - 1;
6070         z.low = 0;
6071         z.high = a.high;
6072         switch (status->float_rounding_mode) {
6073         case float_round_nearest_even:
6074             z.high += lastBitMask>>1;
6075             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6076                 z.high &= ~ lastBitMask;
6077             }
6078             break;
6079         case float_round_ties_away:
6080             z.high += lastBitMask>>1;
6081             break;
6082         case float_round_to_zero:
6083             break;
6084         case float_round_up:
6085             if (!extractFloat128Sign(z)) {
6086                 z.high |= ( a.low != 0 );
6087                 z.high += roundBitsMask;
6088             }
6089             break;
6090         case float_round_down:
6091             if (extractFloat128Sign(z)) {
6092                 z.high |= (a.low != 0);
6093                 z.high += roundBitsMask;
6094             }
6095             break;
6096         default:
6097             abort();
6098         }
6099         z.high &= ~ roundBitsMask;
6100     }
6101     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6102         status->float_exception_flags |= float_flag_inexact;
6103     }
6104     return z;
6105
6106 }
6107
6108 /*----------------------------------------------------------------------------
6109 | Returns the result of adding the absolute values of the quadruple-precision
6110 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6111 | before being returned.  `zSign' is ignored if the result is a NaN.
6112 | The addition is performed according to the IEC/IEEE Standard for Binary
6113 | Floating-Point Arithmetic.
6114 *----------------------------------------------------------------------------*/
6115
6116 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6117                                 float_status *status)
6118 {
6119     int32_t aExp, bExp, zExp;
6120     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6121     int32_t expDiff;
6122
6123     aSig1 = extractFloat128Frac1( a );
6124     aSig0 = extractFloat128Frac0( a );
6125     aExp = extractFloat128Exp( a );
6126     bSig1 = extractFloat128Frac1( b );
6127     bSig0 = extractFloat128Frac0( b );
6128     bExp = extractFloat128Exp( b );
6129     expDiff = aExp - bExp;
6130     if ( 0 < expDiff ) {
6131         if ( aExp == 0x7FFF ) {
6132             if (aSig0 | aSig1) {
6133                 return propagateFloat128NaN(a, b, status);
6134             }
6135             return a;
6136         }
6137         if ( bExp == 0 ) {
6138             --expDiff;
6139         }
6140         else {
6141             bSig0 |= LIT64( 0x0001000000000000 );
6142         }
6143         shift128ExtraRightJamming(
6144             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6145         zExp = aExp;
6146     }
6147     else if ( expDiff < 0 ) {
6148         if ( bExp == 0x7FFF ) {
6149             if (bSig0 | bSig1) {
6150                 return propagateFloat128NaN(a, b, status);
6151             }
6152             return packFloat128( zSign, 0x7FFF, 0, 0 );
6153         }
6154         if ( aExp == 0 ) {
6155             ++expDiff;
6156         }
6157         else {
6158             aSig0 |= LIT64( 0x0001000000000000 );
6159         }
6160         shift128ExtraRightJamming(
6161             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6162         zExp = bExp;
6163     }
6164     else {
6165         if ( aExp == 0x7FFF ) {
6166             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6167                 return propagateFloat128NaN(a, b, status);
6168             }
6169             return a;
6170         }
6171         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6172         if ( aExp == 0 ) {
6173             if (status->flush_to_zero) {
6174                 if (zSig0 | zSig1) {
6175                     float_raise(float_flag_output_denormal, status);
6176                 }
6177                 return packFloat128(zSign, 0, 0, 0);
6178             }
6179             return packFloat128( zSign, 0, zSig0, zSig1 );
6180         }
6181         zSig2 = 0;
6182         zSig0 |= LIT64( 0x0002000000000000 );
6183         zExp = aExp;
6184         goto shiftRight1;
6185     }
6186     aSig0 |= LIT64( 0x0001000000000000 );
6187     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6188     --zExp;
6189     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6190     ++zExp;
6191  shiftRight1:
6192     shift128ExtraRightJamming(
6193         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6194  roundAndPack:
6195     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6196
6197 }
6198
6199 /*----------------------------------------------------------------------------
6200 | Returns the result of subtracting the absolute values of the quadruple-
6201 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6202 | difference is negated before being returned.  `zSign' is ignored if the
6203 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6204 | Standard for Binary Floating-Point Arithmetic.
6205 *----------------------------------------------------------------------------*/
6206
6207 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6208                                 float_status *status)
6209 {
6210     int32_t aExp, bExp, zExp;
6211     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6212     int32_t expDiff;
6213
6214     aSig1 = extractFloat128Frac1( a );
6215     aSig0 = extractFloat128Frac0( a );
6216     aExp = extractFloat128Exp( a );
6217     bSig1 = extractFloat128Frac1( b );
6218     bSig0 = extractFloat128Frac0( b );
6219     bExp = extractFloat128Exp( b );
6220     expDiff = aExp - bExp;
6221     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6222     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6223     if ( 0 < expDiff ) goto aExpBigger;
6224     if ( expDiff < 0 ) goto bExpBigger;
6225     if ( aExp == 0x7FFF ) {
6226         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6227             return propagateFloat128NaN(a, b, status);
6228         }
6229         float_raise(float_flag_invalid, status);
6230         return float128_default_nan(status);
6231     }
6232     if ( aExp == 0 ) {
6233         aExp = 1;
6234         bExp = 1;
6235     }
6236     if ( bSig0 < aSig0 ) goto aBigger;
6237     if ( aSig0 < bSig0 ) goto bBigger;
6238     if ( bSig1 < aSig1 ) goto aBigger;
6239     if ( aSig1 < bSig1 ) goto bBigger;
6240     return packFloat128(status->float_rounding_mode == float_round_down,
6241                         0, 0, 0);
6242  bExpBigger:
6243     if ( bExp == 0x7FFF ) {
6244         if (bSig0 | bSig1) {
6245             return propagateFloat128NaN(a, b, status);
6246         }
6247         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6248     }
6249     if ( aExp == 0 ) {
6250         ++expDiff;
6251     }
6252     else {
6253         aSig0 |= LIT64( 0x4000000000000000 );
6254     }
6255     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6256     bSig0 |= LIT64( 0x4000000000000000 );
6257  bBigger:
6258     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6259     zExp = bExp;
6260     zSign ^= 1;
6261     goto normalizeRoundAndPack;
6262  aExpBigger:
6263     if ( aExp == 0x7FFF ) {
6264         if (aSig0 | aSig1) {
6265             return propagateFloat128NaN(a, b, status);
6266         }
6267         return a;
6268     }
6269     if ( bExp == 0 ) {
6270         --expDiff;
6271     }
6272     else {
6273         bSig0 |= LIT64( 0x4000000000000000 );
6274     }
6275     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6276     aSig0 |= LIT64( 0x4000000000000000 );
6277  aBigger:
6278     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6279     zExp = aExp;
6280  normalizeRoundAndPack:
6281     --zExp;
6282     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6283                                          status);
6284
6285 }
6286
6287 /*----------------------------------------------------------------------------
6288 | Returns the result of adding the quadruple-precision floating-point values
6289 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6290 | for Binary Floating-Point Arithmetic.
6291 *----------------------------------------------------------------------------*/
6292
6293 float128 float128_add(float128 a, float128 b, float_status *status)
6294 {
6295     flag aSign, bSign;
6296
6297     aSign = extractFloat128Sign( a );
6298     bSign = extractFloat128Sign( b );
6299     if ( aSign == bSign ) {
6300         return addFloat128Sigs(a, b, aSign, status);
6301     }
6302     else {
6303         return subFloat128Sigs(a, b, aSign, status);
6304     }
6305
6306 }
6307
6308 /*----------------------------------------------------------------------------
6309 | Returns the result of subtracting the quadruple-precision floating-point
6310 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6311 | Standard for Binary Floating-Point Arithmetic.
6312 *----------------------------------------------------------------------------*/
6313
6314 float128 float128_sub(float128 a, float128 b, float_status *status)
6315 {
6316     flag aSign, bSign;
6317
6318     aSign = extractFloat128Sign( a );
6319     bSign = extractFloat128Sign( b );
6320     if ( aSign == bSign ) {
6321         return subFloat128Sigs(a, b, aSign, status);
6322     }
6323     else {
6324         return addFloat128Sigs(a, b, aSign, status);
6325     }
6326
6327 }
6328
6329 /*----------------------------------------------------------------------------
6330 | Returns the result of multiplying the quadruple-precision floating-point
6331 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6332 | Standard for Binary Floating-Point Arithmetic.
6333 *----------------------------------------------------------------------------*/
6334
6335 float128 float128_mul(float128 a, float128 b, float_status *status)
6336 {
6337     flag aSign, bSign, zSign;
6338     int32_t aExp, bExp, zExp;
6339     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6340
6341     aSig1 = extractFloat128Frac1( a );
6342     aSig0 = extractFloat128Frac0( a );
6343     aExp = extractFloat128Exp( a );
6344     aSign = extractFloat128Sign( a );
6345     bSig1 = extractFloat128Frac1( b );
6346     bSig0 = extractFloat128Frac0( b );
6347     bExp = extractFloat128Exp( b );
6348     bSign = extractFloat128Sign( b );
6349     zSign = aSign ^ bSign;
6350     if ( aExp == 0x7FFF ) {
6351         if (    ( aSig0 | aSig1 )
6352              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6353             return propagateFloat128NaN(a, b, status);
6354         }
6355         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6356         return packFloat128( zSign, 0x7FFF, 0, 0 );
6357     }
6358     if ( bExp == 0x7FFF ) {
6359         if (bSig0 | bSig1) {
6360             return propagateFloat128NaN(a, b, status);
6361         }
6362         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6363  invalid:
6364             float_raise(float_flag_invalid, status);
6365             return float128_default_nan(status);
6366         }
6367         return packFloat128( zSign, 0x7FFF, 0, 0 );
6368     }
6369     if ( aExp == 0 ) {
6370         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6371         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6372     }
6373     if ( bExp == 0 ) {
6374         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6375         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6376     }
6377     zExp = aExp + bExp - 0x4000;
6378     aSig0 |= LIT64( 0x0001000000000000 );
6379     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6380     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6381     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6382     zSig2 |= ( zSig3 != 0 );
6383     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6384         shift128ExtraRightJamming(
6385             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6386         ++zExp;
6387     }
6388     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6389
6390 }
6391
6392 /*----------------------------------------------------------------------------
6393 | Returns the result of dividing the quadruple-precision floating-point value
6394 | `a' by the corresponding value `b'.  The operation is performed according to
6395 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6396 *----------------------------------------------------------------------------*/
6397
6398 float128 float128_div(float128 a, float128 b, float_status *status)
6399 {
6400     flag aSign, bSign, zSign;
6401     int32_t aExp, bExp, zExp;
6402     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6403     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6404
6405     aSig1 = extractFloat128Frac1( a );
6406     aSig0 = extractFloat128Frac0( a );
6407     aExp = extractFloat128Exp( a );
6408     aSign = extractFloat128Sign( a );
6409     bSig1 = extractFloat128Frac1( b );
6410     bSig0 = extractFloat128Frac0( b );
6411     bExp = extractFloat128Exp( b );
6412     bSign = extractFloat128Sign( b );
6413     zSign = aSign ^ bSign;
6414     if ( aExp == 0x7FFF ) {
6415         if (aSig0 | aSig1) {
6416             return propagateFloat128NaN(a, b, status);
6417         }
6418         if ( bExp == 0x7FFF ) {
6419             if (bSig0 | bSig1) {
6420                 return propagateFloat128NaN(a, b, status);
6421             }
6422             goto invalid;
6423         }
6424         return packFloat128( zSign, 0x7FFF, 0, 0 );
6425     }
6426     if ( bExp == 0x7FFF ) {
6427         if (bSig0 | bSig1) {
6428             return propagateFloat128NaN(a, b, status);
6429         }
6430         return packFloat128( zSign, 0, 0, 0 );
6431     }
6432     if ( bExp == 0 ) {
6433         if ( ( bSig0 | bSig1 ) == 0 ) {
6434             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6435  invalid:
6436                 float_raise(float_flag_invalid, status);
6437                 return float128_default_nan(status);
6438             }
6439             float_raise(float_flag_divbyzero, status);
6440             return packFloat128( zSign, 0x7FFF, 0, 0 );
6441         }
6442         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6443     }
6444     if ( aExp == 0 ) {
6445         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6446         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6447     }
6448     zExp = aExp - bExp + 0x3FFD;
6449     shortShift128Left(
6450         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6451     shortShift128Left(
6452         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6453     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6454         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6455         ++zExp;
6456     }
6457     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6458     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6459     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6460     while ( (int64_t) rem0 < 0 ) {
6461         --zSig0;
6462         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6463     }
6464     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6465     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6466         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6467         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6468         while ( (int64_t) rem1 < 0 ) {
6469             --zSig1;
6470             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6471         }
6472         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6473     }
6474     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6475     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6476
6477 }
6478
6479 /*----------------------------------------------------------------------------
6480 | Returns the remainder of the quadruple-precision floating-point value `a'
6481 | with respect to the corresponding value `b'.  The operation is performed
6482 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6483 *----------------------------------------------------------------------------*/
6484
6485 float128 float128_rem(float128 a, float128 b, float_status *status)
6486 {
6487     flag aSign, zSign;
6488     int32_t aExp, bExp, expDiff;
6489     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6490     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6491     int64_t sigMean0;
6492
6493     aSig1 = extractFloat128Frac1( a );
6494     aSig0 = extractFloat128Frac0( a );
6495     aExp = extractFloat128Exp( a );
6496     aSign = extractFloat128Sign( a );
6497     bSig1 = extractFloat128Frac1( b );
6498     bSig0 = extractFloat128Frac0( b );
6499     bExp = extractFloat128Exp( b );
6500     if ( aExp == 0x7FFF ) {
6501         if (    ( aSig0 | aSig1 )
6502              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6503             return propagateFloat128NaN(a, b, status);
6504         }
6505         goto invalid;
6506     }
6507     if ( bExp == 0x7FFF ) {
6508         if (bSig0 | bSig1) {
6509             return propagateFloat128NaN(a, b, status);
6510         }
6511         return a;
6512     }
6513     if ( bExp == 0 ) {
6514         if ( ( bSig0 | bSig1 ) == 0 ) {
6515  invalid:
6516             float_raise(float_flag_invalid, status);
6517             return float128_default_nan(status);
6518         }
6519         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6520     }
6521     if ( aExp == 0 ) {
6522         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6523         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6524     }
6525     expDiff = aExp - bExp;
6526     if ( expDiff < -1 ) return a;
6527     shortShift128Left(
6528         aSig0 | LIT64( 0x0001000000000000 ),
6529         aSig1,
6530         15 - ( expDiff < 0 ),
6531         &aSig0,
6532         &aSig1
6533     );
6534     shortShift128Left(
6535         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6536     q = le128( bSig0, bSig1, aSig0, aSig1 );
6537     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6538     expDiff -= 64;
6539     while ( 0 < expDiff ) {
6540         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6541         q = ( 4 < q ) ? q - 4 : 0;
6542         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6543         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6544         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6545         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6546         expDiff -= 61;
6547     }
6548     if ( -64 < expDiff ) {
6549         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6550         q = ( 4 < q ) ? q - 4 : 0;
6551         q >>= - expDiff;
6552         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6553         expDiff += 52;
6554         if ( expDiff < 0 ) {
6555             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6556         }
6557         else {
6558             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6559         }
6560         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6561         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6562     }
6563     else {
6564         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6565         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6566     }
6567     do {
6568         alternateASig0 = aSig0;
6569         alternateASig1 = aSig1;
6570         ++q;
6571         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6572     } while ( 0 <= (int64_t) aSig0 );
6573     add128(
6574         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6575     if (    ( sigMean0 < 0 )
6576          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6577         aSig0 = alternateASig0;
6578         aSig1 = alternateASig1;
6579     }
6580     zSign = ( (int64_t) aSig0 < 0 );
6581     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6582     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6583                                          status);
6584 }
6585
6586 /*----------------------------------------------------------------------------
6587 | Returns the square root of the quadruple-precision floating-point value `a'.
6588 | The operation is performed according to the IEC/IEEE Standard for Binary
6589 | Floating-Point Arithmetic.
6590 *----------------------------------------------------------------------------*/
6591
6592 float128 float128_sqrt(float128 a, float_status *status)
6593 {
6594     flag aSign;
6595     int32_t aExp, zExp;
6596     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6597     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6598
6599     aSig1 = extractFloat128Frac1( a );
6600     aSig0 = extractFloat128Frac0( a );
6601     aExp = extractFloat128Exp( a );
6602     aSign = extractFloat128Sign( a );
6603     if ( aExp == 0x7FFF ) {
6604         if (aSig0 | aSig1) {
6605             return propagateFloat128NaN(a, a, status);
6606         }
6607         if ( ! aSign ) return a;
6608         goto invalid;
6609     }
6610     if ( aSign ) {
6611         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6612  invalid:
6613         float_raise(float_flag_invalid, status);
6614         return float128_default_nan(status);
6615     }
6616     if ( aExp == 0 ) {
6617         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6618         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6619     }
6620     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6621     aSig0 |= LIT64( 0x0001000000000000 );
6622     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6623     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6624     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6625     doubleZSig0 = zSig0<<1;
6626     mul64To128( zSig0, zSig0, &term0, &term1 );
6627     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6628     while ( (int64_t) rem0 < 0 ) {
6629         --zSig0;
6630         doubleZSig0 -= 2;
6631         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6632     }
6633     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6634     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6635         if ( zSig1 == 0 ) zSig1 = 1;
6636         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6637         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6638         mul64To128( zSig1, zSig1, &term2, &term3 );
6639         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6640         while ( (int64_t) rem1 < 0 ) {
6641             --zSig1;
6642             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6643             term3 |= 1;
6644             term2 |= doubleZSig0;
6645             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6646         }
6647         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6648     }
6649     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6650     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6651
6652 }
6653
6654 /*----------------------------------------------------------------------------
6655 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6656 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6657 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6658 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6659 *----------------------------------------------------------------------------*/
6660
6661 int float128_eq(float128 a, float128 b, float_status *status)
6662 {
6663
6664     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6665               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6666          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6667               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6668        ) {
6669         float_raise(float_flag_invalid, status);
6670         return 0;
6671     }
6672     return
6673            ( a.low == b.low )
6674         && (    ( a.high == b.high )
6675              || (    ( a.low == 0 )
6676                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6677            );
6678
6679 }
6680
6681 /*----------------------------------------------------------------------------
6682 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6683 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6684 | exception is raised if either operand is a NaN.  The comparison is performed
6685 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6686 *----------------------------------------------------------------------------*/
6687
6688 int float128_le(float128 a, float128 b, float_status *status)
6689 {
6690     flag aSign, bSign;
6691
6692     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6693               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6694          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6695               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6696        ) {
6697         float_raise(float_flag_invalid, status);
6698         return 0;
6699     }
6700     aSign = extractFloat128Sign( a );
6701     bSign = extractFloat128Sign( b );
6702     if ( aSign != bSign ) {
6703         return
6704                aSign
6705             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6706                  == 0 );
6707     }
6708     return
6709           aSign ? le128( b.high, b.low, a.high, a.low )
6710         : le128( a.high, a.low, b.high, b.low );
6711
6712 }
6713
6714 /*----------------------------------------------------------------------------
6715 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6716 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6717 | raised if either operand is a NaN.  The comparison is performed according
6718 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6719 *----------------------------------------------------------------------------*/
6720
6721 int float128_lt(float128 a, float128 b, float_status *status)
6722 {
6723     flag aSign, bSign;
6724
6725     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6726               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6727          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6728               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6729        ) {
6730         float_raise(float_flag_invalid, status);
6731         return 0;
6732     }
6733     aSign = extractFloat128Sign( a );
6734     bSign = extractFloat128Sign( b );
6735     if ( aSign != bSign ) {
6736         return
6737                aSign
6738             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6739                  != 0 );
6740     }
6741     return
6742           aSign ? lt128( b.high, b.low, a.high, a.low )
6743         : lt128( a.high, a.low, b.high, b.low );
6744
6745 }
6746
6747 /*----------------------------------------------------------------------------
6748 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6749 | be compared, and 0 otherwise.  The invalid exception is raised if either
6750 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6751 | Standard for Binary Floating-Point Arithmetic.
6752 *----------------------------------------------------------------------------*/
6753
6754 int float128_unordered(float128 a, float128 b, float_status *status)
6755 {
6756     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6757               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6758          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6759               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6760        ) {
6761         float_raise(float_flag_invalid, status);
6762         return 1;
6763     }
6764     return 0;
6765 }
6766
6767 /*----------------------------------------------------------------------------
6768 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6769 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6770 | exception.  The comparison is performed according to the IEC/IEEE Standard
6771 | for Binary Floating-Point Arithmetic.
6772 *----------------------------------------------------------------------------*/
6773
6774 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6775 {
6776
6777     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6778               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6779          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6780               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6781        ) {
6782         if (float128_is_signaling_nan(a, status)
6783          || float128_is_signaling_nan(b, status)) {
6784             float_raise(float_flag_invalid, status);
6785         }
6786         return 0;
6787     }
6788     return
6789            ( a.low == b.low )
6790         && (    ( a.high == b.high )
6791              || (    ( a.low == 0 )
6792                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6793            );
6794
6795 }
6796
6797 /*----------------------------------------------------------------------------
6798 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6799 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6800 | cause an exception.  Otherwise, the comparison is performed according to the
6801 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6802 *----------------------------------------------------------------------------*/
6803
6804 int float128_le_quiet(float128 a, float128 b, float_status *status)
6805 {
6806     flag aSign, bSign;
6807
6808     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6809               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6810          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6811               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6812        ) {
6813         if (float128_is_signaling_nan(a, status)
6814          || float128_is_signaling_nan(b, status)) {
6815             float_raise(float_flag_invalid, status);
6816         }
6817         return 0;
6818     }
6819     aSign = extractFloat128Sign( a );
6820     bSign = extractFloat128Sign( b );
6821     if ( aSign != bSign ) {
6822         return
6823                aSign
6824             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6825                  == 0 );
6826     }
6827     return
6828           aSign ? le128( b.high, b.low, a.high, a.low )
6829         : le128( a.high, a.low, b.high, b.low );
6830
6831 }
6832
6833 /*----------------------------------------------------------------------------
6834 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6835 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6836 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6837 | Standard for Binary Floating-Point Arithmetic.
6838 *----------------------------------------------------------------------------*/
6839
6840 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6841 {
6842     flag aSign, bSign;
6843
6844     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6845               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6846          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6847               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6848        ) {
6849         if (float128_is_signaling_nan(a, status)
6850          || float128_is_signaling_nan(b, status)) {
6851             float_raise(float_flag_invalid, status);
6852         }
6853         return 0;
6854     }
6855     aSign = extractFloat128Sign( a );
6856     bSign = extractFloat128Sign( b );
6857     if ( aSign != bSign ) {
6858         return
6859                aSign
6860             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6861                  != 0 );
6862     }
6863     return
6864           aSign ? lt128( b.high, b.low, a.high, a.low )
6865         : lt128( a.high, a.low, b.high, b.low );
6866
6867 }
6868
6869 /*----------------------------------------------------------------------------
6870 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6871 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6872 | comparison is performed according to the IEC/IEEE Standard for Binary
6873 | Floating-Point Arithmetic.
6874 *----------------------------------------------------------------------------*/
6875
6876 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6877 {
6878     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6879               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6880          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6881               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6882        ) {
6883         if (float128_is_signaling_nan(a, status)
6884          || float128_is_signaling_nan(b, status)) {
6885             float_raise(float_flag_invalid, status);
6886         }
6887         return 1;
6888     }
6889     return 0;
6890 }
6891
6892 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6893                                             int is_quiet, float_status *status)
6894 {
6895     flag aSign, bSign;
6896
6897     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6898         float_raise(float_flag_invalid, status);
6899         return float_relation_unordered;
6900     }
6901     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6902           ( extractFloatx80Frac( a )<<1 ) ) ||
6903         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6904           ( extractFloatx80Frac( b )<<1 ) )) {
6905         if (!is_quiet ||
6906             floatx80_is_signaling_nan(a, status) ||
6907             floatx80_is_signaling_nan(b, status)) {
6908             float_raise(float_flag_invalid, status);
6909         }
6910         return float_relation_unordered;
6911     }
6912     aSign = extractFloatx80Sign( a );
6913     bSign = extractFloatx80Sign( b );
6914     if ( aSign != bSign ) {
6915
6916         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6917              ( ( a.low | b.low ) == 0 ) ) {
6918             /* zero case */
6919             return float_relation_equal;
6920         } else {
6921             return 1 - (2 * aSign);
6922         }
6923     } else {
6924         if (a.low == b.low && a.high == b.high) {
6925             return float_relation_equal;
6926         } else {
6927             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6928         }
6929     }
6930 }
6931
6932 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6933 {
6934     return floatx80_compare_internal(a, b, 0, status);
6935 }
6936
6937 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6938 {
6939     return floatx80_compare_internal(a, b, 1, status);
6940 }
6941
6942 static inline int float128_compare_internal(float128 a, float128 b,
6943                                             int is_quiet, float_status *status)
6944 {
6945     flag aSign, bSign;
6946
6947     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6948           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6949         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6950           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6951         if (!is_quiet ||
6952             float128_is_signaling_nan(a, status) ||
6953             float128_is_signaling_nan(b, status)) {
6954             float_raise(float_flag_invalid, status);
6955         }
6956         return float_relation_unordered;
6957     }
6958     aSign = extractFloat128Sign( a );
6959     bSign = extractFloat128Sign( b );
6960     if ( aSign != bSign ) {
6961         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6962             /* zero case */
6963             return float_relation_equal;
6964         } else {
6965             return 1 - (2 * aSign);
6966         }
6967     } else {
6968         if (a.low == b.low && a.high == b.high) {
6969             return float_relation_equal;
6970         } else {
6971             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6972         }
6973     }
6974 }
6975
6976 int float128_compare(float128 a, float128 b, float_status *status)
6977 {
6978     return float128_compare_internal(a, b, 0, status);
6979 }
6980
6981 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6982 {
6983     return float128_compare_internal(a, b, 1, status);
6984 }
6985
6986 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6987 {
6988     flag aSign;
6989     int32_t aExp;
6990     uint64_t aSig;
6991
6992     if (floatx80_invalid_encoding(a)) {
6993         float_raise(float_flag_invalid, status);
6994         return floatx80_default_nan(status);
6995     }
6996     aSig = extractFloatx80Frac( a );
6997     aExp = extractFloatx80Exp( a );
6998     aSign = extractFloatx80Sign( a );
6999
7000     if ( aExp == 0x7FFF ) {
7001         if ( aSig<<1 ) {
7002             return propagateFloatx80NaN(a, a, status);
7003         }
7004         return a;
7005     }
7006
7007     if (aExp == 0) {
7008         if (aSig == 0) {
7009             return a;
7010         }
7011         aExp++;
7012     }
7013
7014     if (n > 0x10000) {
7015         n = 0x10000;
7016     } else if (n < -0x10000) {
7017         n = -0x10000;
7018     }
7019
7020     aExp += n;
7021     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7022                                          aSign, aExp, aSig, 0, status);
7023 }
7024
7025 float128 float128_scalbn(float128 a, int n, float_status *status)
7026 {
7027     flag aSign;
7028     int32_t aExp;
7029     uint64_t aSig0, aSig1;
7030
7031     aSig1 = extractFloat128Frac1( a );
7032     aSig0 = extractFloat128Frac0( a );
7033     aExp = extractFloat128Exp( a );
7034     aSign = extractFloat128Sign( a );
7035     if ( aExp == 0x7FFF ) {
7036         if ( aSig0 | aSig1 ) {
7037             return propagateFloat128NaN(a, a, status);
7038         }
7039         return a;
7040     }
7041     if (aExp != 0) {
7042         aSig0 |= LIT64( 0x0001000000000000 );
7043     } else if (aSig0 == 0 && aSig1 == 0) {
7044         return a;
7045     } else {
7046         aExp++;
7047     }
7048
7049     if (n > 0x10000) {
7050         n = 0x10000;
7051     } else if (n < -0x10000) {
7052         n = -0x10000;
7053     }
7054
7055     aExp += n - 1;
7056     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7057                                          , status);
7058
7059 }