fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include "qemu/bitops.h"
  87 #include "fpu/softfloat.h"
  88
  89 /* We only need stdlib for abort() */
  90
  91 /*----------------------------------------------------------------------------
  92 | Primitive arithmetic functions, including multi-word arithmetic, and
  93 | division and square root approximations.  (Can be specialized to target if
  94 | desired.)
  95 *----------------------------------------------------------------------------*/
  96 #include "fpu/softfloat-macros.h"
  97
  98 /*----------------------------------------------------------------------------
  99 | Returns the fraction bits of the half-precision floating-point value `a'.
 100 *----------------------------------------------------------------------------*/
 101
 102 static inline uint32_t extractFloat16Frac(float16 a)
 103 {
 104     return float16_val(a) & 0x3ff;
 105 }
 106
 107 /*----------------------------------------------------------------------------
 108 | Returns the exponent bits of the half-precision floating-point value `a'.
 109 *----------------------------------------------------------------------------*/
 110
 111 static inline int extractFloat16Exp(float16 a)
 112 {
 113     return (float16_val(a) >> 10) & 0x1f;
 114 }
 115
 116 /*----------------------------------------------------------------------------
 117 | Returns the fraction bits of the single-precision floating-point value `a'.
 118 *----------------------------------------------------------------------------*/
 119
 120 static inline uint32_t extractFloat32Frac(float32 a)
 121 {
 122     return float32_val(a) & 0x007FFFFF;
 123 }
 124
 125 /*----------------------------------------------------------------------------
 126 | Returns the exponent bits of the single-precision floating-point value `a'.
 127 *----------------------------------------------------------------------------*/
 128
 129 static inline int extractFloat32Exp(float32 a)
 130 {
 131     return (float32_val(a) >> 23) & 0xFF;
 132 }
 133
 134 /*----------------------------------------------------------------------------
 135 | Returns the sign bit of the single-precision floating-point value `a'.
 136 *----------------------------------------------------------------------------*/
 137
 138 static inline flag extractFloat32Sign(float32 a)
 139 {
 140     return float32_val(a) >> 31;
 141 }
 142
 143 /*----------------------------------------------------------------------------
 144 | Returns the fraction bits of the double-precision floating-point value `a'.
 145 *----------------------------------------------------------------------------*/
 146
 147 static inline uint64_t extractFloat64Frac(float64 a)
 148 {
 149     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 150 }
 151
 152 /*----------------------------------------------------------------------------
 153 | Returns the exponent bits of the double-precision floating-point value `a'.
 154 *----------------------------------------------------------------------------*/
 155
 156 static inline int extractFloat64Exp(float64 a)
 157 {
 158     return (float64_val(a) >> 52) & 0x7FF;
 159 }
 160
 161 /*----------------------------------------------------------------------------
 162 | Returns the sign bit of the double-precision floating-point value `a'.
 163 *----------------------------------------------------------------------------*/
 164
 165 static inline flag extractFloat64Sign(float64 a)
 166 {
 167     return float64_val(a) >> 63;
 168 }
 169
 170 /*
 171  * Classify a floating point number. Everything above float_class_qnan
 172  * is a NaN so cls >= float_class_qnan is any NaN.
 173  */
 174
 175 typedef enum __attribute__ ((__packed__)) {
 176     float_class_unclassified,
 177     float_class_zero,
 178     float_class_normal,
 179     float_class_inf,
 180     float_class_qnan,  /* all NaNs from here */
 181     float_class_snan,
 182 } FloatClass;
 183
 184 /* Simple helpers for checking if, or what kind of, NaN we have */
 185 static inline __attribute__((unused)) bool is_nan(FloatClass c)
 186 {
 187     return unlikely(c >= float_class_qnan);
 188 }
 189
 190 static inline __attribute__((unused)) bool is_snan(FloatClass c)
 191 {
 192     return c == float_class_snan;
 193 }
 194
 195 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 196 {
 197     return c == float_class_qnan;
 198 }
 199
 200 /*
 201  * Structure holding all of the decomposed parts of a float. The
 202  * exponent is unbiased and the fraction is normalized. All
 203  * calculations are done with a 64 bit fraction and then rounded as
 204  * appropriate for the final format.
 205  *
 206  * Thanks to the packed FloatClass a decent compiler should be able to
 207  * fit the whole structure into registers and avoid using the stack
 208  * for parameter passing.
 209  */
 210
 211 typedef struct {
 212     uint64_t frac;
 213     int32_t  exp;
 214     FloatClass cls;
 215     bool sign;
 216 } FloatParts;
 217
 218 #define DECOMPOSED_BINARY_POINT    (64 - 2)
 219 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 220 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 221
 222 /* Structure holding all of the relevant parameters for a format.
 223  *   exp_size: the size of the exponent field
 224  *   exp_bias: the offset applied to the exponent field
 225  *   exp_max: the maximum normalised exponent
 226  *   frac_size: the size of the fraction field
 227  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 228  * The following are computed based the size of fraction
 229  *   frac_lsb: least significant bit of fraction
 230  *   frac_lsbm1: the bit below the least significant bit (for rounding)
 231  *   round_mask/roundeven_mask: masks used for rounding
 232  * The following optional modifiers are available:
 233  *   arm_althp: handle ARM Alternative Half Precision
 234  */
 235 typedef struct {
 236     int exp_size;
 237     int exp_bias;
 238     int exp_max;
 239     int frac_size;
 240     int frac_shift;
 241     uint64_t frac_lsb;
 242     uint64_t frac_lsbm1;
 243     uint64_t round_mask;
 244     uint64_t roundeven_mask;
 245     bool arm_althp;
 246 } FloatFmt;
 247
 248 /* Expand fields based on the size of exponent and fraction */
 249 #define FLOAT_PARAMS(E, F)                                           \
 250     .exp_size       = E,                                             \
 251     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 252     .exp_max        = (1 << E) - 1,                                  \
 253     .frac_size      = F,                                             \
 254     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 255     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 256     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 257     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 258     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 259
 260 static const FloatFmt float16_params = {
 261     FLOAT_PARAMS(5, 10)
 262 };
 263
 264 static const FloatFmt float16_params_ahp = {
 265     FLOAT_PARAMS(5, 10),
 266     .arm_althp = true
 267 };
 268
 269 static const FloatFmt float32_params = {
 270     FLOAT_PARAMS(8, 23)
 271 };
 272
 273 static const FloatFmt float64_params = {
 274     FLOAT_PARAMS(11, 52)
 275 };
 276
 277 /* Unpack a float to parts, but do not canonicalize.  */
 278 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 279 {
 280     const int sign_pos = fmt.frac_size + fmt.exp_size;
 281
 282     return (FloatParts) {
 283         .cls = float_class_unclassified,
 284         .sign = extract64(raw, sign_pos, 1),
 285         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 286         .frac = extract64(raw, 0, fmt.frac_size),
 287     };
 288 }
 289
 290 static inline FloatParts float16_unpack_raw(float16 f)
 291 {
 292     return unpack_raw(float16_params, f);
 293 }
 294
 295 static inline FloatParts float32_unpack_raw(float32 f)
 296 {
 297     return unpack_raw(float32_params, f);
 298 }
 299
 300 static inline FloatParts float64_unpack_raw(float64 f)
 301 {
 302     return unpack_raw(float64_params, f);
 303 }
 304
 305 /* Pack a float from parts, but do not canonicalize.  */
 306 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 307 {
 308     const int sign_pos = fmt.frac_size + fmt.exp_size;
 309     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 310     return deposit64(ret, sign_pos, 1, p.sign);
 311 }
 312
 313 static inline float16 float16_pack_raw(FloatParts p)
 314 {
 315     return make_float16(pack_raw(float16_params, p));
 316 }
 317
 318 static inline float32 float32_pack_raw(FloatParts p)
 319 {
 320     return make_float32(pack_raw(float32_params, p));
 321 }
 322
 323 static inline float64 float64_pack_raw(FloatParts p)
 324 {
 325     return make_float64(pack_raw(float64_params, p));
 326 }
 327
 328 /*----------------------------------------------------------------------------
 329 | Functions and definitions to determine:  (1) whether tininess for underflow
 330 | is detected before or after rounding by default, (2) what (if anything)
 331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 332 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 333 | are propagated from function inputs to output.  These details are target-
 334 | specific.
 335 *----------------------------------------------------------------------------*/
 336 #include "softfloat-specialize.h"
 337
 338 /* Canonicalize EXP and FRAC, setting CLS.  */
 339 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
 340                                float_status *status)
 341 {
 342     if (part.exp == parm->exp_max && !parm->arm_althp) {
 343         if (part.frac == 0) {
 344             part.cls = float_class_inf;
 345         } else {
 346             part.frac <<= parm->frac_shift;
 347             part.cls = (parts_is_snan_frac(part.frac, status)
 348                         ? float_class_snan : float_class_qnan);
 349         }
 350     } else if (part.exp == 0) {
 351         if (likely(part.frac == 0)) {
 352             part.cls = float_class_zero;
 353         } else if (status->flush_inputs_to_zero) {
 354             float_raise(float_flag_input_denormal, status);
 355             part.cls = float_class_zero;
 356             part.frac = 0;
 357         } else {
 358             int shift = clz64(part.frac) - 1;
 359             part.cls = float_class_normal;
 360             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 361             part.frac <<= shift;
 362         }
 363     } else {
 364         part.cls = float_class_normal;
 365         part.exp -= parm->exp_bias;
 366         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 367     }
 368     return part;
 369 }
 370
 371 /* Round and uncanonicalize a floating-point number by parts. There
 372  * are FRAC_SHIFT bits that may require rounding at the bottom of the
 373  * fraction; these bits will be removed. The exponent will be biased
 374  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 375  */
 376
 377 static FloatParts round_canonical(FloatParts p, float_status *s,
 378                                   const FloatFmt *parm)
 379 {
 380     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 381     const uint64_t round_mask = parm->round_mask;
 382     const uint64_t roundeven_mask = parm->roundeven_mask;
 383     const int exp_max = parm->exp_max;
 384     const int frac_shift = parm->frac_shift;
 385     uint64_t frac, inc;
 386     int exp, flags = 0;
 387     bool overflow_norm;
 388
 389     frac = p.frac;
 390     exp = p.exp;
 391
 392     switch (p.cls) {
 393     case float_class_normal:
 394         switch (s->float_rounding_mode) {
 395         case float_round_nearest_even:
 396             overflow_norm = false;
 397             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 398             break;
 399         case float_round_ties_away:
 400             overflow_norm = false;
 401             inc = frac_lsbm1;
 402             break;
 403         case float_round_to_zero:
 404             overflow_norm = true;
 405             inc = 0;
 406             break;
 407         case float_round_up:
 408             inc = p.sign ? 0 : round_mask;
 409             overflow_norm = p.sign;
 410             break;
 411         case float_round_down:
 412             inc = p.sign ? round_mask : 0;
 413             overflow_norm = !p.sign;
 414             break;
 415         default:
 416             g_assert_not_reached();
 417         }
 418
 419         exp += parm->exp_bias;
 420         if (likely(exp > 0)) {
 421             if (frac & round_mask) {
 422                 flags |= float_flag_inexact;
 423                 frac += inc;
 424                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
 425                     frac >>= 1;
 426                     exp++;
 427                 }
 428             }
 429             frac >>= frac_shift;
 430
 431             if (parm->arm_althp) {
 432                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
 433                 if (unlikely(exp > exp_max)) {
 434                     /* Overflow.  Return the maximum normal.  */
 435                     flags = float_flag_invalid;
 436                     exp = exp_max;
 437                     frac = -1;
 438                 }
 439             } else if (unlikely(exp >= exp_max)) {
 440                 flags |= float_flag_overflow | float_flag_inexact;
 441                 if (overflow_norm) {
 442                     exp = exp_max - 1;
 443                     frac = -1;
 444                 } else {
 445                     p.cls = float_class_inf;
 446                     goto do_inf;
 447                 }
 448             }
 449         } else if (s->flush_to_zero) {
 450             flags |= float_flag_output_denormal;
 451             p.cls = float_class_zero;
 452             goto do_zero;
 453         } else {
 454             bool is_tiny = (s->float_detect_tininess
 455                             == float_tininess_before_rounding)
 456                         || (exp < 0)
 457                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 458
 459             shift64RightJamming(frac, 1 - exp, &frac);
 460             if (frac & round_mask) {
 461                 /* Need to recompute round-to-even.  */
 462                 if (s->float_rounding_mode == float_round_nearest_even) {
 463                     inc = ((frac & roundeven_mask) != frac_lsbm1
 464                            ? frac_lsbm1 : 0);
 465                 }
 466                 flags |= float_flag_inexact;
 467                 frac += inc;
 468             }
 469
 470             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 471             frac >>= frac_shift;
 472
 473             if (is_tiny && (flags & float_flag_inexact)) {
 474                 flags |= float_flag_underflow;
 475             }
 476             if (exp == 0 && frac == 0) {
 477                 p.cls = float_class_zero;
 478             }
 479         }
 480         break;
 481
 482     case float_class_zero:
 483     do_zero:
 484         exp = 0;
 485         frac = 0;
 486         break;
 487
 488     case float_class_inf:
 489     do_inf:
 490         assert(!parm->arm_althp);
 491         exp = exp_max;
 492         frac = 0;
 493         break;
 494
 495     case float_class_qnan:
 496     case float_class_snan:
 497         assert(!parm->arm_althp);
 498         exp = exp_max;
 499         frac >>= parm->frac_shift;
 500         break;
 501
 502     default:
 503         g_assert_not_reached();
 504     }
 505
 506     float_raise(flags, s);
 507     p.exp = exp;
 508     p.frac = frac;
 509     return p;
 510 }
 511
 512 /* Explicit FloatFmt version */
 513 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
 514                                             const FloatFmt *params)
 515 {
 516     return canonicalize(float16_unpack_raw(f), params, s);
 517 }
 518
 519 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 520 {
 521     return float16a_unpack_canonical(f, s, &float16_params);
 522 }
 523
 524 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
 525                                              const FloatFmt *params)
 526 {
 527     return float16_pack_raw(round_canonical(p, s, params));
 528 }
 529
 530 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 531 {
 532     return float16a_round_pack_canonical(p, s, &float16_params);
 533 }
 534
 535 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 536 {
 537     return canonicalize(float32_unpack_raw(f), &float32_params, s);
 538 }
 539
 540 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 541 {
 542     return float32_pack_raw(round_canonical(p, s, &float32_params));
 543 }
 544
 545 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 546 {
 547     return canonicalize(float64_unpack_raw(f), &float64_params, s);
 548 }
 549
 550 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 551 {
 552     return float64_pack_raw(round_canonical(p, s, &float64_params));
 553 }
 554
 555 static FloatParts return_nan(FloatParts a, float_status *s)
 556 {
 557     switch (a.cls) {
 558     case float_class_snan:
 559         s->float_exception_flags |= float_flag_invalid;
 560         a = parts_silence_nan(a, s);
 561         /* fall through */
 562     case float_class_qnan:
 563         if (s->default_nan_mode) {
 564             return parts_default_nan(s);
 565         }
 566         break;
 567
 568     default:
 569         g_assert_not_reached();
 570     }
 571     return a;
 572 }
 573
 574 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 575 {
 576     if (is_snan(a.cls) || is_snan(b.cls)) {
 577         s->float_exception_flags |= float_flag_invalid;
 578     }
 579
 580     if (s->default_nan_mode) {
 581         return parts_default_nan(s);
 582     } else {
 583         if (pickNaN(a.cls, b.cls,
 584                     a.frac > b.frac ||
 585                     (a.frac == b.frac && a.sign < b.sign))) {
 586             a = b;
 587         }
 588         if (is_snan(a.cls)) {
 589             return parts_silence_nan(a, s);
 590         }
 591     }
 592     return a;
 593 }
 594
 595 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 596                                   bool inf_zero, float_status *s)
 597 {
 598     int which;
 599
 600     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 601         s->float_exception_flags |= float_flag_invalid;
 602     }
 603
 604     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
 605
 606     if (s->default_nan_mode) {
 607         /* Note that this check is after pickNaNMulAdd so that function
 608          * has an opportunity to set the Invalid flag.
 609          */
 610         which = 3;
 611     }
 612
 613     switch (which) {
 614     case 0:
 615         break;
 616     case 1:
 617         a = b;
 618         break;
 619     case 2:
 620         a = c;
 621         break;
 622     case 3:
 623         return parts_default_nan(s);
 624     default:
 625         g_assert_not_reached();
 626     }
 627
 628     if (is_snan(a.cls)) {
 629         return parts_silence_nan(a, s);
 630     }
 631     return a;
 632 }
 633
 634 /*
 635  * Returns the result of adding or subtracting the values of the
 636  * floating-point values `a' and `b'. The operation is performed
 637  * according to the IEC/IEEE Standard for Binary Floating-Point
 638  * Arithmetic.
 639  */
 640
 641 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 642                                 float_status *s)
 643 {
 644     bool a_sign = a.sign;
 645     bool b_sign = b.sign ^ subtract;
 646
 647     if (a_sign != b_sign) {
 648         /* Subtraction */
 649
 650         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 651             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 652                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 653                 a.frac = a.frac - b.frac;
 654             } else {
 655                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 656                 a.frac = b.frac - a.frac;
 657                 a.exp = b.exp;
 658                 a_sign ^= 1;
 659             }
 660
 661             if (a.frac == 0) {
 662                 a.cls = float_class_zero;
 663                 a.sign = s->float_rounding_mode == float_round_down;
 664             } else {
 665                 int shift = clz64(a.frac) - 1;
 666                 a.frac = a.frac << shift;
 667                 a.exp = a.exp - shift;
 668                 a.sign = a_sign;
 669             }
 670             return a;
 671         }
 672         if (is_nan(a.cls) || is_nan(b.cls)) {
 673             return pick_nan(a, b, s);
 674         }
 675         if (a.cls == float_class_inf) {
 676             if (b.cls == float_class_inf) {
 677                 float_raise(float_flag_invalid, s);
 678                 return parts_default_nan(s);
 679             }
 680             return a;
 681         }
 682         if (a.cls == float_class_zero && b.cls == float_class_zero) {
 683             a.sign = s->float_rounding_mode == float_round_down;
 684             return a;
 685         }
 686         if (a.cls == float_class_zero || b.cls == float_class_inf) {
 687             b.sign = a_sign ^ 1;
 688             return b;
 689         }
 690         if (b.cls == float_class_zero) {
 691             return a;
 692         }
 693     } else {
 694         /* Addition */
 695         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 696             if (a.exp > b.exp) {
 697                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 698             } else if (a.exp < b.exp) {
 699                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 700                 a.exp = b.exp;
 701             }
 702             a.frac += b.frac;
 703             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
 704                 shift64RightJamming(a.frac, 1, &a.frac);
 705                 a.exp += 1;
 706             }
 707             return a;
 708         }
 709         if (is_nan(a.cls) || is_nan(b.cls)) {
 710             return pick_nan(a, b, s);
 711         }
 712         if (a.cls == float_class_inf || b.cls == float_class_zero) {
 713             return a;
 714         }
 715         if (b.cls == float_class_inf || a.cls == float_class_zero) {
 716             b.sign = b_sign;
 717             return b;
 718         }
 719     }
 720     g_assert_not_reached();
 721 }
 722
 723 /*
 724  * Returns the result of adding or subtracting the floating-point
 725  * values `a' and `b'. The operation is performed according to the
 726  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 727  */
 728
 729 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
 730                                               float_status *status)
 731 {
 732     FloatParts pa = float16_unpack_canonical(a, status);
 733     FloatParts pb = float16_unpack_canonical(b, status);
 734     FloatParts pr = addsub_floats(pa, pb, false, status);
 735
 736     return float16_round_pack_canonical(pr, status);
 737 }
 738
 739 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
 740                                              float_status *status)
 741 {
 742     FloatParts pa = float32_unpack_canonical(a, status);
 743     FloatParts pb = float32_unpack_canonical(b, status);
 744     FloatParts pr = addsub_floats(pa, pb, false, status);
 745
 746     return float32_round_pack_canonical(pr, status);
 747 }
 748
 749 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
 750                                              float_status *status)
 751 {
 752     FloatParts pa = float64_unpack_canonical(a, status);
 753     FloatParts pb = float64_unpack_canonical(b, status);
 754     FloatParts pr = addsub_floats(pa, pb, false, status);
 755
 756     return float64_round_pack_canonical(pr, status);
 757 }
 758
 759 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
 760                                              float_status *status)
 761 {
 762     FloatParts pa = float16_unpack_canonical(a, status);
 763     FloatParts pb = float16_unpack_canonical(b, status);
 764     FloatParts pr = addsub_floats(pa, pb, true, status);
 765
 766     return float16_round_pack_canonical(pr, status);
 767 }
 768
 769 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
 770                                              float_status *status)
 771 {
 772     FloatParts pa = float32_unpack_canonical(a, status);
 773     FloatParts pb = float32_unpack_canonical(b, status);
 774     FloatParts pr = addsub_floats(pa, pb, true, status);
 775
 776     return float32_round_pack_canonical(pr, status);
 777 }
 778
 779 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
 780                                              float_status *status)
 781 {
 782     FloatParts pa = float64_unpack_canonical(a, status);
 783     FloatParts pb = float64_unpack_canonical(b, status);
 784     FloatParts pr = addsub_floats(pa, pb, true, status);
 785
 786     return float64_round_pack_canonical(pr, status);
 787 }
 788
 789 /*
 790  * Returns the result of multiplying the floating-point values `a' and
 791  * `b'. The operation is performed according to the IEC/IEEE Standard
 792  * for Binary Floating-Point Arithmetic.
 793  */
 794
 795 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
 796 {
 797     bool sign = a.sign ^ b.sign;
 798
 799     if (a.cls == float_class_normal && b.cls == float_class_normal) {
 800         uint64_t hi, lo;
 801         int exp = a.exp + b.exp;
 802
 803         mul64To128(a.frac, b.frac, &hi, &lo);
 804         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 805         if (lo & DECOMPOSED_OVERFLOW_BIT) {
 806             shift64RightJamming(lo, 1, &lo);
 807             exp += 1;
 808         }
 809
 810         /* Re-use a */
 811         a.exp = exp;
 812         a.sign = sign;
 813         a.frac = lo;
 814         return a;
 815     }
 816     /* handle all the NaN cases */
 817     if (is_nan(a.cls) || is_nan(b.cls)) {
 818         return pick_nan(a, b, s);
 819     }
 820     /* Inf * Zero == NaN */
 821     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
 822         (a.cls == float_class_zero && b.cls == float_class_inf)) {
 823         s->float_exception_flags |= float_flag_invalid;
 824         return parts_default_nan(s);
 825     }
 826     /* Multiply by 0 or Inf */
 827     if (a.cls == float_class_inf || a.cls == float_class_zero) {
 828         a.sign = sign;
 829         return a;
 830     }
 831     if (b.cls == float_class_inf || b.cls == float_class_zero) {
 832         b.sign = sign;
 833         return b;
 834     }
 835     g_assert_not_reached();
 836 }
 837
 838 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
 839                                              float_status *status)
 840 {
 841     FloatParts pa = float16_unpack_canonical(a, status);
 842     FloatParts pb = float16_unpack_canonical(b, status);
 843     FloatParts pr = mul_floats(pa, pb, status);
 844
 845     return float16_round_pack_canonical(pr, status);
 846 }
 847
 848 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
 849                                              float_status *status)
 850 {
 851     FloatParts pa = float32_unpack_canonical(a, status);
 852     FloatParts pb = float32_unpack_canonical(b, status);
 853     FloatParts pr = mul_floats(pa, pb, status);
 854
 855     return float32_round_pack_canonical(pr, status);
 856 }
 857
 858 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
 859                                              float_status *status)
 860 {
 861     FloatParts pa = float64_unpack_canonical(a, status);
 862     FloatParts pb = float64_unpack_canonical(b, status);
 863     FloatParts pr = mul_floats(pa, pb, status);
 864
 865     return float64_round_pack_canonical(pr, status);
 866 }
 867
 868 /*
 869  * Returns the result of multiplying the floating-point values `a' and
 870  * `b' then adding 'c', with no intermediate rounding step after the
 871  * multiplication. The operation is performed according to the
 872  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 873  * The flags argument allows the caller to select negation of the
 874  * addend, the intermediate product, or the final result. (The
 875  * difference between this and having the caller do a separate
 876  * negation is that negating externally will flip the sign bit on
 877  * NaNs.)
 878  */
 879
 880 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
 881                                 int flags, float_status *s)
 882 {
 883     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
 884                     ((1 << float_class_inf) | (1 << float_class_zero));
 885     bool p_sign;
 886     bool sign_flip = flags & float_muladd_negate_result;
 887     FloatClass p_class;
 888     uint64_t hi, lo;
 889     int p_exp;
 890
 891     /* It is implementation-defined whether the cases of (0,inf,qnan)
 892      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 893      * they return if they do), so we have to hand this information
 894      * off to the target-specific pick-a-NaN routine.
 895      */
 896     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
 897         return pick_nan_muladd(a, b, c, inf_zero, s);
 898     }
 899
 900     if (inf_zero) {
 901         s->float_exception_flags |= float_flag_invalid;
 902         return parts_default_nan(s);
 903     }
 904
 905     if (flags & float_muladd_negate_c) {
 906         c.sign ^= 1;
 907     }
 908
 909     p_sign = a.sign ^ b.sign;
 910
 911     if (flags & float_muladd_negate_product) {
 912         p_sign ^= 1;
 913     }
 914
 915     if (a.cls == float_class_inf || b.cls == float_class_inf) {
 916         p_class = float_class_inf;
 917     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
 918         p_class = float_class_zero;
 919     } else {
 920         p_class = float_class_normal;
 921     }
 922
 923     if (c.cls == float_class_inf) {
 924         if (p_class == float_class_inf && p_sign != c.sign) {
 925             s->float_exception_flags |= float_flag_invalid;
 926             return parts_default_nan(s);
 927         } else {
 928             a.cls = float_class_inf;
 929             a.sign = c.sign ^ sign_flip;
 930             return a;
 931         }
 932     }
 933
 934     if (p_class == float_class_inf) {
 935         a.cls = float_class_inf;
 936         a.sign = p_sign ^ sign_flip;
 937         return a;
 938     }
 939
 940     if (p_class == float_class_zero) {
 941         if (c.cls == float_class_zero) {
 942             if (p_sign != c.sign) {
 943                 p_sign = s->float_rounding_mode == float_round_down;
 944             }
 945             c.sign = p_sign;
 946         } else if (flags & float_muladd_halve_result) {
 947             c.exp -= 1;
 948         }
 949         c.sign ^= sign_flip;
 950         return c;
 951     }
 952
 953     /* a & b should be normals now... */
 954     assert(a.cls == float_class_normal &&
 955            b.cls == float_class_normal);
 956
 957     p_exp = a.exp + b.exp;
 958
 959     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
 960      * result.
 961      */
 962     mul64To128(a.frac, b.frac, &hi, &lo);
 963     /* binary point now at bit 124 */
 964
 965     /* check for overflow */
 966     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
 967         shift128RightJamming(hi, lo, 1, &hi, &lo);
 968         p_exp += 1;
 969     }
 970
 971     /* + add/sub */
 972     if (c.cls == float_class_zero) {
 973         /* move binary point back to 62 */
 974         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 975     } else {
 976         int exp_diff = p_exp - c.exp;
 977         if (p_sign == c.sign) {
 978             /* Addition */
 979             if (exp_diff <= 0) {
 980                 shift128RightJamming(hi, lo,
 981                                      DECOMPOSED_BINARY_POINT - exp_diff,
 982                                      &hi, &lo);
 983                 lo += c.frac;
 984                 p_exp = c.exp;
 985             } else {
 986                 uint64_t c_hi, c_lo;
 987                 /* shift c to the same binary point as the product (124) */
 988                 c_hi = c.frac >> 2;
 989                 c_lo = 0;
 990                 shift128RightJamming(c_hi, c_lo,
 991                                      exp_diff,
 992                                      &c_hi, &c_lo);
 993                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
 994                 /* move binary point back to 62 */
 995                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 996             }
 997
 998             if (lo & DECOMPOSED_OVERFLOW_BIT) {
 999                 shift64RightJamming(lo, 1, &lo);
1000                 p_exp += 1;
1001             }
1002
1003         } else {
1004             /* Subtraction */
1005             uint64_t c_hi, c_lo;
1006             /* make C binary point match product at bit 124 */
1007             c_hi = c.frac >> 2;
1008             c_lo = 0;
1009
1010             if (exp_diff <= 0) {
1011                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1012                 if (exp_diff == 0
1013                     &&
1014                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1015                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1016                 } else {
1017                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1018                     p_sign ^= 1;
1019                     p_exp = c.exp;
1020                 }
1021             } else {
1022                 shift128RightJamming(c_hi, c_lo,
1023                                      exp_diff,
1024                                      &c_hi, &c_lo);
1025                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1026             }
1027
1028             if (hi == 0 && lo == 0) {
1029                 a.cls = float_class_zero;
1030                 a.sign = s->float_rounding_mode == float_round_down;
1031                 a.sign ^= sign_flip;
1032                 return a;
1033             } else {
1034                 int shift;
1035                 if (hi != 0) {
1036                     shift = clz64(hi);
1037                 } else {
1038                     shift = clz64(lo) + 64;
1039                 }
1040                 /* Normalizing to a binary point of 124 is the
1041                    correct adjust for the exponent.  However since we're
1042                    shifting, we might as well put the binary point back
1043                    at 62 where we really want it.  Therefore shift as
1044                    if we're leaving 1 bit at the top of the word, but
1045                    adjust the exponent as if we're leaving 3 bits.  */
1046                 shift -= 1;
1047                 if (shift >= 64) {
1048                     lo = lo << (shift - 64);
1049                 } else {
1050                     hi = (hi << shift) | (lo >> (64 - shift));
1051                     lo = hi | ((lo << shift) != 0);
1052                 }
1053                 p_exp -= shift - 2;
1054             }
1055         }
1056     }
1057
1058     if (flags & float_muladd_halve_result) {
1059         p_exp -= 1;
1060     }
1061
1062     /* finally prepare our result */
1063     a.cls = float_class_normal;
1064     a.sign = p_sign ^ sign_flip;
1065     a.exp = p_exp;
1066     a.frac = lo;
1067
1068     return a;
1069 }
1070
1071 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1072                                                 int flags, float_status *status)
1073 {
1074     FloatParts pa = float16_unpack_canonical(a, status);
1075     FloatParts pb = float16_unpack_canonical(b, status);
1076     FloatParts pc = float16_unpack_canonical(c, status);
1077     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1078
1079     return float16_round_pack_canonical(pr, status);
1080 }
1081
1082 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1083                                                 int flags, float_status *status)
1084 {
1085     FloatParts pa = float32_unpack_canonical(a, status);
1086     FloatParts pb = float32_unpack_canonical(b, status);
1087     FloatParts pc = float32_unpack_canonical(c, status);
1088     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1089
1090     return float32_round_pack_canonical(pr, status);
1091 }
1092
1093 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1094                                                 int flags, float_status *status)
1095 {
1096     FloatParts pa = float64_unpack_canonical(a, status);
1097     FloatParts pb = float64_unpack_canonical(b, status);
1098     FloatParts pc = float64_unpack_canonical(c, status);
1099     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1100
1101     return float64_round_pack_canonical(pr, status);
1102 }
1103
1104 /*
1105  * Returns the result of dividing the floating-point value `a' by the
1106  * corresponding value `b'. The operation is performed according to
1107  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1108  */
1109
1110 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1111 {
1112     bool sign = a.sign ^ b.sign;
1113
1114     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1115         uint64_t temp_lo, temp_hi;
1116         int exp = a.exp - b.exp;
1117         if (a.frac < b.frac) {
1118             exp -= 1;
1119             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1120                               &temp_hi, &temp_lo);
1121         } else {
1122             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1123                               &temp_hi, &temp_lo);
1124         }
1125         /* LSB of quot is set if inexact which roundandpack will use
1126          * to set flags. Yet again we re-use a for the result */
1127         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1128         a.sign = sign;
1129         a.exp = exp;
1130         return a;
1131     }
1132     /* handle all the NaN cases */
1133     if (is_nan(a.cls) || is_nan(b.cls)) {
1134         return pick_nan(a, b, s);
1135     }
1136     /* 0/0 or Inf/Inf */
1137     if (a.cls == b.cls
1138         &&
1139         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1140         s->float_exception_flags |= float_flag_invalid;
1141         return parts_default_nan(s);
1142     }
1143     /* Inf / x or 0 / x */
1144     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1145         a.sign = sign;
1146         return a;
1147     }
1148     /* Div 0 => Inf */
1149     if (b.cls == float_class_zero) {
1150         s->float_exception_flags |= float_flag_divbyzero;
1151         a.cls = float_class_inf;
1152         a.sign = sign;
1153         return a;
1154     }
1155     /* Div by Inf */
1156     if (b.cls == float_class_inf) {
1157         a.cls = float_class_zero;
1158         a.sign = sign;
1159         return a;
1160     }
1161     g_assert_not_reached();
1162 }
1163
1164 float16 float16_div(float16 a, float16 b, float_status *status)
1165 {
1166     FloatParts pa = float16_unpack_canonical(a, status);
1167     FloatParts pb = float16_unpack_canonical(b, status);
1168     FloatParts pr = div_floats(pa, pb, status);
1169
1170     return float16_round_pack_canonical(pr, status);
1171 }
1172
1173 float32 float32_div(float32 a, float32 b, float_status *status)
1174 {
1175     FloatParts pa = float32_unpack_canonical(a, status);
1176     FloatParts pb = float32_unpack_canonical(b, status);
1177     FloatParts pr = div_floats(pa, pb, status);
1178
1179     return float32_round_pack_canonical(pr, status);
1180 }
1181
1182 float64 float64_div(float64 a, float64 b, float_status *status)
1183 {
1184     FloatParts pa = float64_unpack_canonical(a, status);
1185     FloatParts pb = float64_unpack_canonical(b, status);
1186     FloatParts pr = div_floats(pa, pb, status);
1187
1188     return float64_round_pack_canonical(pr, status);
1189 }
1190
1191 /*
1192  * Float to Float conversions
1193  *
1194  * Returns the result of converting one float format to another. The
1195  * conversion is performed according to the IEC/IEEE Standard for
1196  * Binary Floating-Point Arithmetic.
1197  *
1198  * The float_to_float helper only needs to take care of raising
1199  * invalid exceptions and handling the conversion on NaNs.
1200  */
1201
1202 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1203                                  float_status *s)
1204 {
1205     if (dstf->arm_althp) {
1206         switch (a.cls) {
1207         case float_class_qnan:
1208         case float_class_snan:
1209             /* There is no NaN in the destination format.  Raise Invalid
1210              * and return a zero with the sign of the input NaN.
1211              */
1212             s->float_exception_flags |= float_flag_invalid;
1213             a.cls = float_class_zero;
1214             a.frac = 0;
1215             a.exp = 0;
1216             break;
1217
1218         case float_class_inf:
1219             /* There is no Inf in the destination format.  Raise Invalid
1220              * and return the maximum normal with the correct sign.
1221              */
1222             s->float_exception_flags |= float_flag_invalid;
1223             a.cls = float_class_normal;
1224             a.exp = dstf->exp_max;
1225             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1226             break;
1227
1228         default:
1229             break;
1230         }
1231     } else if (is_nan(a.cls)) {
1232         if (is_snan(a.cls)) {
1233             s->float_exception_flags |= float_flag_invalid;
1234             a = parts_silence_nan(a, s);
1235         }
1236         if (s->default_nan_mode) {
1237             return parts_default_nan(s);
1238         }
1239     }
1240     return a;
1241 }
1242
1243 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1244 {
1245     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1246     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1247     FloatParts pr = float_to_float(p, &float32_params, s);
1248     return float32_round_pack_canonical(pr, s);
1249 }
1250
1251 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1252 {
1253     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1254     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1255     FloatParts pr = float_to_float(p, &float64_params, s);
1256     return float64_round_pack_canonical(pr, s);
1257 }
1258
1259 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1260 {
1261     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1262     FloatParts p = float32_unpack_canonical(a, s);
1263     FloatParts pr = float_to_float(p, fmt16, s);
1264     return float16a_round_pack_canonical(pr, s, fmt16);
1265 }
1266
1267 float64 float32_to_float64(float32 a, float_status *s)
1268 {
1269     FloatParts p = float32_unpack_canonical(a, s);
1270     FloatParts pr = float_to_float(p, &float64_params, s);
1271     return float64_round_pack_canonical(pr, s);
1272 }
1273
1274 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1275 {
1276     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1277     FloatParts p = float64_unpack_canonical(a, s);
1278     FloatParts pr = float_to_float(p, fmt16, s);
1279     return float16a_round_pack_canonical(pr, s, fmt16);
1280 }
1281
1282 float32 float64_to_float32(float64 a, float_status *s)
1283 {
1284     FloatParts p = float64_unpack_canonical(a, s);
1285     FloatParts pr = float_to_float(p, &float32_params, s);
1286     return float32_round_pack_canonical(pr, s);
1287 }
1288
1289 /*
1290  * Rounds the floating-point value `a' to an integer, and returns the
1291  * result as a floating-point value. The operation is performed
1292  * according to the IEC/IEEE Standard for Binary Floating-Point
1293  * Arithmetic.
1294  */
1295
1296 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1297 {
1298     if (is_nan(a.cls)) {
1299         return return_nan(a, s);
1300     }
1301
1302     switch (a.cls) {
1303     case float_class_zero:
1304     case float_class_inf:
1305     case float_class_qnan:
1306         /* already "integral" */
1307         break;
1308     case float_class_normal:
1309         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1310             /* already integral */
1311             break;
1312         }
1313         if (a.exp < 0) {
1314             bool one;
1315             /* all fractional */
1316             s->float_exception_flags |= float_flag_inexact;
1317             switch (rounding_mode) {
1318             case float_round_nearest_even:
1319                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1320                 break;
1321             case float_round_ties_away:
1322                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1323                 break;
1324             case float_round_to_zero:
1325                 one = false;
1326                 break;
1327             case float_round_up:
1328                 one = !a.sign;
1329                 break;
1330             case float_round_down:
1331                 one = a.sign;
1332                 break;
1333             default:
1334                 g_assert_not_reached();
1335             }
1336
1337             if (one) {
1338                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1339                 a.exp = 0;
1340             } else {
1341                 a.cls = float_class_zero;
1342             }
1343         } else {
1344             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1345             uint64_t frac_lsbm1 = frac_lsb >> 1;
1346             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1347             uint64_t rnd_mask = rnd_even_mask >> 1;
1348             uint64_t inc;
1349
1350             switch (rounding_mode) {
1351             case float_round_nearest_even:
1352                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1353                 break;
1354             case float_round_ties_away:
1355                 inc = frac_lsbm1;
1356                 break;
1357             case float_round_to_zero:
1358                 inc = 0;
1359                 break;
1360             case float_round_up:
1361                 inc = a.sign ? 0 : rnd_mask;
1362                 break;
1363             case float_round_down:
1364                 inc = a.sign ? rnd_mask : 0;
1365                 break;
1366             default:
1367                 g_assert_not_reached();
1368             }
1369
1370             if (a.frac & rnd_mask) {
1371                 s->float_exception_flags |= float_flag_inexact;
1372                 a.frac += inc;
1373                 a.frac &= ~rnd_mask;
1374                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1375                     a.frac >>= 1;
1376                     a.exp++;
1377                 }
1378             }
1379         }
1380         break;
1381     default:
1382         g_assert_not_reached();
1383     }
1384     return a;
1385 }
1386
1387 float16 float16_round_to_int(float16 a, float_status *s)
1388 {
1389     FloatParts pa = float16_unpack_canonical(a, s);
1390     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1391     return float16_round_pack_canonical(pr, s);
1392 }
1393
1394 float32 float32_round_to_int(float32 a, float_status *s)
1395 {
1396     FloatParts pa = float32_unpack_canonical(a, s);
1397     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1398     return float32_round_pack_canonical(pr, s);
1399 }
1400
1401 float64 float64_round_to_int(float64 a, float_status *s)
1402 {
1403     FloatParts pa = float64_unpack_canonical(a, s);
1404     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1405     return float64_round_pack_canonical(pr, s);
1406 }
1407
1408 float64 float64_trunc_to_int(float64 a, float_status *s)
1409 {
1410     FloatParts pa = float64_unpack_canonical(a, s);
1411     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1412     return float64_round_pack_canonical(pr, s);
1413 }
1414
1415 /*
1416  * Returns the result of converting the floating-point value `a' to
1417  * the two's complement integer format. The conversion is performed
1418  * according to the IEC/IEEE Standard for Binary Floating-Point
1419  * Arithmetic---which means in particular that the conversion is
1420  * rounded according to the current rounding mode. If `a' is a NaN,
1421  * the largest positive integer is returned. Otherwise, if the
1422  * conversion overflows, the largest integer with the same sign as `a'
1423  * is returned.
1424 */
1425
1426 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1427                                      int64_t min, int64_t max,
1428                                      float_status *s)
1429 {
1430     uint64_t r;
1431     int orig_flags = get_float_exception_flags(s);
1432     FloatParts p = round_to_int(in, rmode, s);
1433
1434     switch (p.cls) {
1435     case float_class_snan:
1436     case float_class_qnan:
1437         s->float_exception_flags = orig_flags | float_flag_invalid;
1438         return max;
1439     case float_class_inf:
1440         s->float_exception_flags = orig_flags | float_flag_invalid;
1441         return p.sign ? min : max;
1442     case float_class_zero:
1443         return 0;
1444     case float_class_normal:
1445         if (p.exp < DECOMPOSED_BINARY_POINT) {
1446             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1447         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1448             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1449         } else {
1450             r = UINT64_MAX;
1451         }
1452         if (p.sign) {
1453             if (r <= -(uint64_t) min) {
1454                 return -r;
1455             } else {
1456                 s->float_exception_flags = orig_flags | float_flag_invalid;
1457                 return min;
1458             }
1459         } else {
1460             if (r <= max) {
1461                 return r;
1462             } else {
1463                 s->float_exception_flags = orig_flags | float_flag_invalid;
1464                 return max;
1465             }
1466         }
1467     default:
1468         g_assert_not_reached();
1469     }
1470 }
1471
1472 #define FLOAT_TO_INT(fsz, isz)                                          \
1473 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1474                                                 float_status *s)        \
1475 {                                                                       \
1476     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1477     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1478                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1479                                  s);                                    \
1480 }                                                                       \
1481                                                                         \
1482 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1483  (float ## fsz a, float_status *s)                                      \
1484 {                                                                       \
1485     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1486     return round_to_int_and_pack(p, float_round_to_zero,                \
1487                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1488                                  s);                                    \
1489 }
1490
1491 FLOAT_TO_INT(16, 16)
1492 FLOAT_TO_INT(16, 32)
1493 FLOAT_TO_INT(16, 64)
1494
1495 FLOAT_TO_INT(32, 16)
1496 FLOAT_TO_INT(32, 32)
1497 FLOAT_TO_INT(32, 64)
1498
1499 FLOAT_TO_INT(64, 16)
1500 FLOAT_TO_INT(64, 32)
1501 FLOAT_TO_INT(64, 64)
1502
1503 #undef FLOAT_TO_INT
1504
1505 /*
1506  *  Returns the result of converting the floating-point value `a' to
1507  *  the unsigned integer format. The conversion is performed according
1508  *  to the IEC/IEEE Standard for Binary Floating-Point
1509  *  Arithmetic---which means in particular that the conversion is
1510  *  rounded according to the current rounding mode. If `a' is a NaN,
1511  *  the largest unsigned integer is returned. Otherwise, if the
1512  *  conversion overflows, the largest unsigned integer is returned. If
1513  *  the 'a' is negative, the result is rounded and zero is returned;
1514  *  values that do not round to zero will raise the inexact exception
1515  *  flag.
1516  */
1517
1518 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1519                                        float_status *s)
1520 {
1521     int orig_flags = get_float_exception_flags(s);
1522     FloatParts p = round_to_int(in, rmode, s);
1523
1524     switch (p.cls) {
1525     case float_class_snan:
1526     case float_class_qnan:
1527         s->float_exception_flags = orig_flags | float_flag_invalid;
1528         return max;
1529     case float_class_inf:
1530         s->float_exception_flags = orig_flags | float_flag_invalid;
1531         return p.sign ? 0 : max;
1532     case float_class_zero:
1533         return 0;
1534     case float_class_normal:
1535     {
1536         uint64_t r;
1537         if (p.sign) {
1538             s->float_exception_flags = orig_flags | float_flag_invalid;
1539             return 0;
1540         }
1541
1542         if (p.exp < DECOMPOSED_BINARY_POINT) {
1543             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1544         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1545             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1546         } else {
1547             s->float_exception_flags = orig_flags | float_flag_invalid;
1548             return max;
1549         }
1550
1551         /* For uint64 this will never trip, but if p.exp is too large
1552          * to shift a decomposed fraction we shall have exited via the
1553          * 3rd leg above.
1554          */
1555         if (r > max) {
1556             s->float_exception_flags = orig_flags | float_flag_invalid;
1557             return max;
1558         } else {
1559             return r;
1560         }
1561     }
1562     default:
1563         g_assert_not_reached();
1564     }
1565 }
1566
1567 #define FLOAT_TO_UINT(fsz, isz) \
1568 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1569                                                   float_status *s)      \
1570 {                                                                       \
1571     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1572     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1573                                  UINT ## isz ## _MAX, s);               \
1574 }                                                                       \
1575                                                                         \
1576 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1577  (float ## fsz a, float_status *s)                                      \
1578 {                                                                       \
1579     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1580     return round_to_uint_and_pack(p, float_round_to_zero,               \
1581                                   UINT ## isz ## _MAX, s);              \
1582 }
1583
1584 FLOAT_TO_UINT(16, 16)
1585 FLOAT_TO_UINT(16, 32)
1586 FLOAT_TO_UINT(16, 64)
1587
1588 FLOAT_TO_UINT(32, 16)
1589 FLOAT_TO_UINT(32, 32)
1590 FLOAT_TO_UINT(32, 64)
1591
1592 FLOAT_TO_UINT(64, 16)
1593 FLOAT_TO_UINT(64, 32)
1594 FLOAT_TO_UINT(64, 64)
1595
1596 #undef FLOAT_TO_UINT
1597
1598 /*
1599  * Integer to float conversions
1600  *
1601  * Returns the result of converting the two's complement integer `a'
1602  * to the floating-point format. The conversion is performed according
1603  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1604  */
1605
1606 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
1607 {
1608     FloatParts r = { .sign = false };
1609
1610     if (a == 0) {
1611         r.cls = float_class_zero;
1612     } else {
1613         uint64_t f = a;
1614         int shift;
1615
1616         r.cls = float_class_normal;
1617         if (a < 0) {
1618             f = -f;
1619             r.sign = true;
1620         }
1621         shift = clz64(f) - 1;
1622         scale = MIN(MAX(scale, -0x10000), 0x10000);
1623
1624         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1625         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
1626     }
1627
1628     return r;
1629 }
1630
1631 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
1632 {
1633     FloatParts pa = int_to_float(a, scale, status);
1634     return float16_round_pack_canonical(pa, status);
1635 }
1636
1637 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
1638 {
1639     return int64_to_float16_scalbn(a, scale, status);
1640 }
1641
1642 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
1643 {
1644     return int64_to_float16_scalbn(a, scale, status);
1645 }
1646
1647 float16 int64_to_float16(int64_t a, float_status *status)
1648 {
1649     return int64_to_float16_scalbn(a, 0, status);
1650 }
1651
1652 float16 int32_to_float16(int32_t a, float_status *status)
1653 {
1654     return int64_to_float16_scalbn(a, 0, status);
1655 }
1656
1657 float16 int16_to_float16(int16_t a, float_status *status)
1658 {
1659     return int64_to_float16_scalbn(a, 0, status);
1660 }
1661
1662 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
1663 {
1664     FloatParts pa = int_to_float(a, scale, status);
1665     return float32_round_pack_canonical(pa, status);
1666 }
1667
1668 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
1669 {
1670     return int64_to_float32_scalbn(a, scale, status);
1671 }
1672
1673 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
1674 {
1675     return int64_to_float32_scalbn(a, scale, status);
1676 }
1677
1678 float32 int64_to_float32(int64_t a, float_status *status)
1679 {
1680     return int64_to_float32_scalbn(a, 0, status);
1681 }
1682
1683 float32 int32_to_float32(int32_t a, float_status *status)
1684 {
1685     return int64_to_float32_scalbn(a, 0, status);
1686 }
1687
1688 float32 int16_to_float32(int16_t a, float_status *status)
1689 {
1690     return int64_to_float32_scalbn(a, 0, status);
1691 }
1692
1693 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
1694 {
1695     FloatParts pa = int_to_float(a, scale, status);
1696     return float64_round_pack_canonical(pa, status);
1697 }
1698
1699 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
1700 {
1701     return int64_to_float64_scalbn(a, scale, status);
1702 }
1703
1704 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
1705 {
1706     return int64_to_float64_scalbn(a, scale, status);
1707 }
1708
1709 float64 int64_to_float64(int64_t a, float_status *status)
1710 {
1711     return int64_to_float64_scalbn(a, 0, status);
1712 }
1713
1714 float64 int32_to_float64(int32_t a, float_status *status)
1715 {
1716     return int64_to_float64_scalbn(a, 0, status);
1717 }
1718
1719 float64 int16_to_float64(int16_t a, float_status *status)
1720 {
1721     return int64_to_float64_scalbn(a, 0, status);
1722 }
1723
1724
1725 /*
1726  * Unsigned Integer to float conversions
1727  *
1728  * Returns the result of converting the unsigned integer `a' to the
1729  * floating-point format. The conversion is performed according to the
1730  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1731  */
1732
1733 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
1734 {
1735     FloatParts r = { .sign = false };
1736
1737     if (a == 0) {
1738         r.cls = float_class_zero;
1739     } else {
1740         scale = MIN(MAX(scale, -0x10000), 0x10000);
1741         r.cls = float_class_normal;
1742         if ((int64_t)a < 0) {
1743             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
1744             shift64RightJamming(a, 1, &a);
1745             r.frac = a;
1746         } else {
1747             int shift = clz64(a) - 1;
1748             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1749             r.frac = a << shift;
1750         }
1751     }
1752
1753     return r;
1754 }
1755
1756 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
1757 {
1758     FloatParts pa = uint_to_float(a, scale, status);
1759     return float16_round_pack_canonical(pa, status);
1760 }
1761
1762 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
1763 {
1764     return uint64_to_float16_scalbn(a, scale, status);
1765 }
1766
1767 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
1768 {
1769     return uint64_to_float16_scalbn(a, scale, status);
1770 }
1771
1772 float16 uint64_to_float16(uint64_t a, float_status *status)
1773 {
1774     return uint64_to_float16_scalbn(a, 0, status);
1775 }
1776
1777 float16 uint32_to_float16(uint32_t a, float_status *status)
1778 {
1779     return uint64_to_float16_scalbn(a, 0, status);
1780 }
1781
1782 float16 uint16_to_float16(uint16_t a, float_status *status)
1783 {
1784     return uint64_to_float16_scalbn(a, 0, status);
1785 }
1786
1787 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
1788 {
1789     FloatParts pa = uint_to_float(a, scale, status);
1790     return float32_round_pack_canonical(pa, status);
1791 }
1792
1793 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
1794 {
1795     return uint64_to_float32_scalbn(a, scale, status);
1796 }
1797
1798 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
1799 {
1800     return uint64_to_float32_scalbn(a, scale, status);
1801 }
1802
1803 float32 uint64_to_float32(uint64_t a, float_status *status)
1804 {
1805     return uint64_to_float32_scalbn(a, 0, status);
1806 }
1807
1808 float32 uint32_to_float32(uint32_t a, float_status *status)
1809 {
1810     return uint64_to_float32_scalbn(a, 0, status);
1811 }
1812
1813 float32 uint16_to_float32(uint16_t a, float_status *status)
1814 {
1815     return uint64_to_float32_scalbn(a, 0, status);
1816 }
1817
1818 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
1819 {
1820     FloatParts pa = uint_to_float(a, scale, status);
1821     return float64_round_pack_canonical(pa, status);
1822 }
1823
1824 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
1825 {
1826     return uint64_to_float64_scalbn(a, scale, status);
1827 }
1828
1829 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
1830 {
1831     return uint64_to_float64_scalbn(a, scale, status);
1832 }
1833
1834 float64 uint64_to_float64(uint64_t a, float_status *status)
1835 {
1836     return uint64_to_float64_scalbn(a, 0, status);
1837 }
1838
1839 float64 uint32_to_float64(uint32_t a, float_status *status)
1840 {
1841     return uint64_to_float64_scalbn(a, 0, status);
1842 }
1843
1844 float64 uint16_to_float64(uint16_t a, float_status *status)
1845 {
1846     return uint64_to_float64_scalbn(a, 0, status);
1847 }
1848
1849 /* Float Min/Max */
1850 /* min() and max() functions. These can't be implemented as
1851  * 'compare and pick one input' because that would mishandle
1852  * NaNs and +0 vs -0.
1853  *
1854  * minnum() and maxnum() functions. These are similar to the min()
1855  * and max() functions but if one of the arguments is a QNaN and
1856  * the other is numerical then the numerical argument is returned.
1857  * SNaNs will get quietened before being returned.
1858  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1859  * and maxNum() operations. min() and max() are the typical min/max
1860  * semantics provided by many CPUs which predate that specification.
1861  *
1862  * minnummag() and maxnummag() functions correspond to minNumMag()
1863  * and minNumMag() from the IEEE-754 2008.
1864  */
1865 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1866                                 bool ieee, bool ismag, float_status *s)
1867 {
1868     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1869         if (ieee) {
1870             /* Takes two floating-point values `a' and `b', one of
1871              * which is a NaN, and returns the appropriate NaN
1872              * result. If either `a' or `b' is a signaling NaN,
1873              * the invalid exception is raised.
1874              */
1875             if (is_snan(a.cls) || is_snan(b.cls)) {
1876                 return pick_nan(a, b, s);
1877             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1878                 return b;
1879             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1880                 return a;
1881             }
1882         }
1883         return pick_nan(a, b, s);
1884     } else {
1885         int a_exp, b_exp;
1886
1887         switch (a.cls) {
1888         case float_class_normal:
1889             a_exp = a.exp;
1890             break;
1891         case float_class_inf:
1892             a_exp = INT_MAX;
1893             break;
1894         case float_class_zero:
1895             a_exp = INT_MIN;
1896             break;
1897         default:
1898             g_assert_not_reached();
1899             break;
1900         }
1901         switch (b.cls) {
1902         case float_class_normal:
1903             b_exp = b.exp;
1904             break;
1905         case float_class_inf:
1906             b_exp = INT_MAX;
1907             break;
1908         case float_class_zero:
1909             b_exp = INT_MIN;
1910             break;
1911         default:
1912             g_assert_not_reached();
1913             break;
1914         }
1915
1916         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1917             bool a_less = a_exp < b_exp;
1918             if (a_exp == b_exp) {
1919                 a_less = a.frac < b.frac;
1920             }
1921             return a_less ^ ismin ? b : a;
1922         }
1923
1924         if (a.sign == b.sign) {
1925             bool a_less = a_exp < b_exp;
1926             if (a_exp == b_exp) {
1927                 a_less = a.frac < b.frac;
1928             }
1929             return a.sign ^ a_less ^ ismin ? b : a;
1930         } else {
1931             return a.sign ^ ismin ? b : a;
1932         }
1933     }
1934 }
1935
1936 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
1937 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1938                                      float_status *s)                   \
1939 {                                                                       \
1940     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1941     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1942     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1943                                                                         \
1944     return float ## sz ## _round_pack_canonical(pr, s);                 \
1945 }
1946
1947 MINMAX(16, min, true, false, false)
1948 MINMAX(16, minnum, true, true, false)
1949 MINMAX(16, minnummag, true, true, true)
1950 MINMAX(16, max, false, false, false)
1951 MINMAX(16, maxnum, false, true, false)
1952 MINMAX(16, maxnummag, false, true, true)
1953
1954 MINMAX(32, min, true, false, false)
1955 MINMAX(32, minnum, true, true, false)
1956 MINMAX(32, minnummag, true, true, true)
1957 MINMAX(32, max, false, false, false)
1958 MINMAX(32, maxnum, false, true, false)
1959 MINMAX(32, maxnummag, false, true, true)
1960
1961 MINMAX(64, min, true, false, false)
1962 MINMAX(64, minnum, true, true, false)
1963 MINMAX(64, minnummag, true, true, true)
1964 MINMAX(64, max, false, false, false)
1965 MINMAX(64, maxnum, false, true, false)
1966 MINMAX(64, maxnummag, false, true, true)
1967
1968 #undef MINMAX
1969
1970 /* Floating point compare */
1971 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1972                           float_status *s)
1973 {
1974     if (is_nan(a.cls) || is_nan(b.cls)) {
1975         if (!is_quiet ||
1976             a.cls == float_class_snan ||
1977             b.cls == float_class_snan) {
1978             s->float_exception_flags |= float_flag_invalid;
1979         }
1980         return float_relation_unordered;
1981     }
1982
1983     if (a.cls == float_class_zero) {
1984         if (b.cls == float_class_zero) {
1985             return float_relation_equal;
1986         }
1987         return b.sign ? float_relation_greater : float_relation_less;
1988     } else if (b.cls == float_class_zero) {
1989         return a.sign ? float_relation_less : float_relation_greater;
1990     }
1991
1992     /* The only really important thing about infinity is its sign. If
1993      * both are infinities the sign marks the smallest of the two.
1994      */
1995     if (a.cls == float_class_inf) {
1996         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1997             return float_relation_equal;
1998         }
1999         return a.sign ? float_relation_less : float_relation_greater;
2000     } else if (b.cls == float_class_inf) {
2001         return b.sign ? float_relation_greater : float_relation_less;
2002     }
2003
2004     if (a.sign != b.sign) {
2005         return a.sign ? float_relation_less : float_relation_greater;
2006     }
2007
2008     if (a.exp == b.exp) {
2009         if (a.frac == b.frac) {
2010             return float_relation_equal;
2011         }
2012         if (a.sign) {
2013             return a.frac > b.frac ?
2014                 float_relation_less : float_relation_greater;
2015         } else {
2016             return a.frac > b.frac ?
2017                 float_relation_greater : float_relation_less;
2018         }
2019     } else {
2020         if (a.sign) {
2021             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2022         } else {
2023             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2024         }
2025     }
2026 }
2027
2028 #define COMPARE(sz)                                                     \
2029 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
2030                             float_status *s)                            \
2031 {                                                                       \
2032     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2033     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2034     return compare_floats(pa, pb, false, s);                            \
2035 }                                                                       \
2036 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
2037                                   float_status *s)                      \
2038 {                                                                       \
2039     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2040     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2041     return compare_floats(pa, pb, true, s);                             \
2042 }
2043
2044 COMPARE(16)
2045 COMPARE(32)
2046 COMPARE(64)
2047
2048 #undef COMPARE
2049
2050 /* Multiply A by 2 raised to the power N.  */
2051 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2052 {
2053     if (unlikely(is_nan(a.cls))) {
2054         return return_nan(a, s);
2055     }
2056     if (a.cls == float_class_normal) {
2057         /* The largest float type (even though not supported by FloatParts)
2058          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
2059          * still allows rounding to infinity, without allowing overflow
2060          * within the int32_t that backs FloatParts.exp.
2061          */
2062         n = MIN(MAX(n, -0x10000), 0x10000);
2063         a.exp += n;
2064     }
2065     return a;
2066 }
2067
2068 float16 float16_scalbn(float16 a, int n, float_status *status)
2069 {
2070     FloatParts pa = float16_unpack_canonical(a, status);
2071     FloatParts pr = scalbn_decomposed(pa, n, status);
2072     return float16_round_pack_canonical(pr, status);
2073 }
2074
2075 float32 float32_scalbn(float32 a, int n, float_status *status)
2076 {
2077     FloatParts pa = float32_unpack_canonical(a, status);
2078     FloatParts pr = scalbn_decomposed(pa, n, status);
2079     return float32_round_pack_canonical(pr, status);
2080 }
2081
2082 float64 float64_scalbn(float64 a, int n, float_status *status)
2083 {
2084     FloatParts pa = float64_unpack_canonical(a, status);
2085     FloatParts pr = scalbn_decomposed(pa, n, status);
2086     return float64_round_pack_canonical(pr, status);
2087 }
2088
2089 /*
2090  * Square Root
2091  *
2092  * The old softfloat code did an approximation step before zeroing in
2093  * on the final result. However for simpleness we just compute the
2094  * square root by iterating down from the implicit bit to enough extra
2095  * bits to ensure we get a correctly rounded result.
2096  *
2097  * This does mean however the calculation is slower than before,
2098  * especially for 64 bit floats.
2099  */
2100
2101 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2102 {
2103     uint64_t a_frac, r_frac, s_frac;
2104     int bit, last_bit;
2105
2106     if (is_nan(a.cls)) {
2107         return return_nan(a, s);
2108     }
2109     if (a.cls == float_class_zero) {
2110         return a;  /* sqrt(+-0) = +-0 */
2111     }
2112     if (a.sign) {
2113         s->float_exception_flags |= float_flag_invalid;
2114         return parts_default_nan(s);
2115     }
2116     if (a.cls == float_class_inf) {
2117         return a;  /* sqrt(+inf) = +inf */
2118     }
2119
2120     assert(a.cls == float_class_normal);
2121
2122     /* We need two overflow bits at the top. Adding room for that is a
2123      * right shift. If the exponent is odd, we can discard the low bit
2124      * by multiplying the fraction by 2; that's a left shift. Combine
2125      * those and we shift right if the exponent is even.
2126      */
2127     a_frac = a.frac;
2128     if (!(a.exp & 1)) {
2129         a_frac >>= 1;
2130     }
2131     a.exp >>= 1;
2132
2133     /* Bit-by-bit computation of sqrt.  */
2134     r_frac = 0;
2135     s_frac = 0;
2136
2137     /* Iterate from implicit bit down to the 3 extra bits to compute a
2138      * properly rounded result. Remember we've inserted one more bit
2139      * at the top, so these positions are one less.
2140      */
2141     bit = DECOMPOSED_BINARY_POINT - 1;
2142     last_bit = MAX(p->frac_shift - 4, 0);
2143     do {
2144         uint64_t q = 1ULL << bit;
2145         uint64_t t_frac = s_frac + q;
2146         if (t_frac <= a_frac) {
2147             s_frac = t_frac + q;
2148             a_frac -= t_frac;
2149             r_frac += q;
2150         }
2151         a_frac <<= 1;
2152     } while (--bit >= last_bit);
2153
2154     /* Undo the right shift done above. If there is any remaining
2155      * fraction, the result is inexact. Set the sticky bit.
2156      */
2157     a.frac = (r_frac << 1) + (a_frac != 0);
2158
2159     return a;
2160 }
2161
2162 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
2163 {
2164     FloatParts pa = float16_unpack_canonical(a, status);
2165     FloatParts pr = sqrt_float(pa, status, &float16_params);
2166     return float16_round_pack_canonical(pr, status);
2167 }
2168
2169 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2170 {
2171     FloatParts pa = float32_unpack_canonical(a, status);
2172     FloatParts pr = sqrt_float(pa, status, &float32_params);
2173     return float32_round_pack_canonical(pr, status);
2174 }
2175
2176 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2177 {
2178     FloatParts pa = float64_unpack_canonical(a, status);
2179     FloatParts pr = sqrt_float(pa, status, &float64_params);
2180     return float64_round_pack_canonical(pr, status);
2181 }
2182
2183 /*----------------------------------------------------------------------------
2184 | The pattern for a default generated NaN.
2185 *----------------------------------------------------------------------------*/
2186
2187 float16 float16_default_nan(float_status *status)
2188 {
2189     FloatParts p = parts_default_nan(status);
2190     p.frac >>= float16_params.frac_shift;
2191     return float16_pack_raw(p);
2192 }
2193
2194 float32 float32_default_nan(float_status *status)
2195 {
2196     FloatParts p = parts_default_nan(status);
2197     p.frac >>= float32_params.frac_shift;
2198     return float32_pack_raw(p);
2199 }
2200
2201 float64 float64_default_nan(float_status *status)
2202 {
2203     FloatParts p = parts_default_nan(status);
2204     p.frac >>= float64_params.frac_shift;
2205     return float64_pack_raw(p);
2206 }
2207
2208 float128 float128_default_nan(float_status *status)
2209 {
2210     FloatParts p = parts_default_nan(status);
2211     float128 r;
2212
2213     /* Extrapolate from the choices made by parts_default_nan to fill
2214      * in the quad-floating format.  If the low bit is set, assume we
2215      * want to set all non-snan bits.
2216      */
2217     r.low = -(p.frac & 1);
2218     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2219     r.high |= LIT64(0x7FFF000000000000);
2220     r.high |= (uint64_t)p.sign << 63;
2221
2222     return r;
2223 }
2224
2225 /*----------------------------------------------------------------------------
2226 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2227 *----------------------------------------------------------------------------*/
2228
2229 float16 float16_silence_nan(float16 a, float_status *status)
2230 {
2231     FloatParts p = float16_unpack_raw(a);
2232     p.frac <<= float16_params.frac_shift;
2233     p = parts_silence_nan(p, status);
2234     p.frac >>= float16_params.frac_shift;
2235     return float16_pack_raw(p);
2236 }
2237
2238 float32 float32_silence_nan(float32 a, float_status *status)
2239 {
2240     FloatParts p = float32_unpack_raw(a);
2241     p.frac <<= float32_params.frac_shift;
2242     p = parts_silence_nan(p, status);
2243     p.frac >>= float32_params.frac_shift;
2244     return float32_pack_raw(p);
2245 }
2246
2247 float64 float64_silence_nan(float64 a, float_status *status)
2248 {
2249     FloatParts p = float64_unpack_raw(a);
2250     p.frac <<= float64_params.frac_shift;
2251     p = parts_silence_nan(p, status);
2252     p.frac >>= float64_params.frac_shift;
2253     return float64_pack_raw(p);
2254 }
2255
2256 /*----------------------------------------------------------------------------
2257 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2258 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2259 | input.  If `zSign' is 1, the input is negated before being converted to an
2260 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2261 | is simply rounded to an integer, with the inexact exception raised if the
2262 | input cannot be represented exactly as an integer.  However, if the fixed-
2263 | point input is too large, the invalid exception is raised and the largest
2264 | positive or negative integer is returned.
2265 *----------------------------------------------------------------------------*/
2266
2267 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2268 {
2269     int8_t roundingMode;
2270     flag roundNearestEven;
2271     int8_t roundIncrement, roundBits;
2272     int32_t z;
2273
2274     roundingMode = status->float_rounding_mode;
2275     roundNearestEven = ( roundingMode == float_round_nearest_even );
2276     switch (roundingMode) {
2277     case float_round_nearest_even:
2278     case float_round_ties_away:
2279         roundIncrement = 0x40;
2280         break;
2281     case float_round_to_zero:
2282         roundIncrement = 0;
2283         break;
2284     case float_round_up:
2285         roundIncrement = zSign ? 0 : 0x7f;
2286         break;
2287     case float_round_down:
2288         roundIncrement = zSign ? 0x7f : 0;
2289         break;
2290     default:
2291         abort();
2292     }
2293     roundBits = absZ & 0x7F;
2294     absZ = ( absZ + roundIncrement )>>7;
2295     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2296     z = absZ;
2297     if ( zSign ) z = - z;
2298     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2299         float_raise(float_flag_invalid, status);
2300         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2301     }
2302     if (roundBits) {
2303         status->float_exception_flags |= float_flag_inexact;
2304     }
2305     return z;
2306
2307 }
2308
2309 /*----------------------------------------------------------------------------
2310 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2311 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2312 | and returns the properly rounded 64-bit integer corresponding to the input.
2313 | If `zSign' is 1, the input is negated before being converted to an integer.
2314 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2315 | the inexact exception raised if the input cannot be represented exactly as
2316 | an integer.  However, if the fixed-point input is too large, the invalid
2317 | exception is raised and the largest positive or negative integer is
2318 | returned.
2319 *----------------------------------------------------------------------------*/
2320
2321 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2322                                float_status *status)
2323 {
2324     int8_t roundingMode;
2325     flag roundNearestEven, increment;
2326     int64_t z;
2327
2328     roundingMode = status->float_rounding_mode;
2329     roundNearestEven = ( roundingMode == float_round_nearest_even );
2330     switch (roundingMode) {
2331     case float_round_nearest_even:
2332     case float_round_ties_away:
2333         increment = ((int64_t) absZ1 < 0);
2334         break;
2335     case float_round_to_zero:
2336         increment = 0;
2337         break;
2338     case float_round_up:
2339         increment = !zSign && absZ1;
2340         break;
2341     case float_round_down:
2342         increment = zSign && absZ1;
2343         break;
2344     default:
2345         abort();
2346     }
2347     if ( increment ) {
2348         ++absZ0;
2349         if ( absZ0 == 0 ) goto overflow;
2350         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2351     }
2352     z = absZ0;
2353     if ( zSign ) z = - z;
2354     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2355  overflow:
2356         float_raise(float_flag_invalid, status);
2357         return
2358               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2359             : LIT64( 0x7FFFFFFFFFFFFFFF );
2360     }
2361     if (absZ1) {
2362         status->float_exception_flags |= float_flag_inexact;
2363     }
2364     return z;
2365
2366 }
2367
2368 /*----------------------------------------------------------------------------
2369 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2370 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2371 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2372 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2373 | with the inexact exception raised if the input cannot be represented exactly
2374 | as an integer.  However, if the fixed-point input is too large, the invalid
2375 | exception is raised and the largest unsigned integer is returned.
2376 *----------------------------------------------------------------------------*/
2377
2378 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2379                                 uint64_t absZ1, float_status *status)
2380 {
2381     int8_t roundingMode;
2382     flag roundNearestEven, increment;
2383
2384     roundingMode = status->float_rounding_mode;
2385     roundNearestEven = (roundingMode == float_round_nearest_even);
2386     switch (roundingMode) {
2387     case float_round_nearest_even:
2388     case float_round_ties_away:
2389         increment = ((int64_t)absZ1 < 0);
2390         break;
2391     case float_round_to_zero:
2392         increment = 0;
2393         break;
2394     case float_round_up:
2395         increment = !zSign && absZ1;
2396         break;
2397     case float_round_down:
2398         increment = zSign && absZ1;
2399         break;
2400     default:
2401         abort();
2402     }
2403     if (increment) {
2404         ++absZ0;
2405         if (absZ0 == 0) {
2406             float_raise(float_flag_invalid, status);
2407             return LIT64(0xFFFFFFFFFFFFFFFF);
2408         }
2409         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2410     }
2411
2412     if (zSign && absZ0) {
2413         float_raise(float_flag_invalid, status);
2414         return 0;
2415     }
2416
2417     if (absZ1) {
2418         status->float_exception_flags |= float_flag_inexact;
2419     }
2420     return absZ0;
2421 }
2422
2423 /*----------------------------------------------------------------------------
2424 | If `a' is denormal and we are in flush-to-zero mode then set the
2425 | input-denormal exception and return zero. Otherwise just return the value.
2426 *----------------------------------------------------------------------------*/
2427 float32 float32_squash_input_denormal(float32 a, float_status *status)
2428 {
2429     if (status->flush_inputs_to_zero) {
2430         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2431             float_raise(float_flag_input_denormal, status);
2432             return make_float32(float32_val(a) & 0x80000000);
2433         }
2434     }
2435     return a;
2436 }
2437
2438 /*----------------------------------------------------------------------------
2439 | Normalizes the subnormal single-precision floating-point value represented
2440 | by the denormalized significand `aSig'.  The normalized exponent and
2441 | significand are stored at the locations pointed to by `zExpPtr' and
2442 | `zSigPtr', respectively.
2443 *----------------------------------------------------------------------------*/
2444
2445 static void
2446  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2447 {
2448     int8_t shiftCount;
2449
2450     shiftCount = countLeadingZeros32( aSig ) - 8;
2451     *zSigPtr = aSig<<shiftCount;
2452     *zExpPtr = 1 - shiftCount;
2453
2454 }
2455
2456 /*----------------------------------------------------------------------------
2457 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2458 | and significand `zSig', and returns the proper single-precision floating-
2459 | point value corresponding to the abstract input.  Ordinarily, the abstract
2460 | value is simply rounded and packed into the single-precision format, with
2461 | the inexact exception raised if the abstract input cannot be represented
2462 | exactly.  However, if the abstract value is too large, the overflow and
2463 | inexact exceptions are raised and an infinity or maximal finite value is
2464 | returned.  If the abstract value is too small, the input value is rounded to
2465 | a subnormal number, and the underflow and inexact exceptions are raised if
2466 | the abstract input cannot be represented exactly as a subnormal single-
2467 | precision floating-point number.
2468 |     The input significand `zSig' has its binary point between bits 30
2469 | and 29, which is 7 bits to the left of the usual location.  This shifted
2470 | significand must be normalized or smaller.  If `zSig' is not normalized,
2471 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2472 | and it must not require rounding.  In the usual case that `zSig' is
2473 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2474 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2475 | Binary Floating-Point Arithmetic.
2476 *----------------------------------------------------------------------------*/
2477
2478 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2479                                    float_status *status)
2480 {
2481     int8_t roundingMode;
2482     flag roundNearestEven;
2483     int8_t roundIncrement, roundBits;
2484     flag isTiny;
2485
2486     roundingMode = status->float_rounding_mode;
2487     roundNearestEven = ( roundingMode == float_round_nearest_even );
2488     switch (roundingMode) {
2489     case float_round_nearest_even:
2490     case float_round_ties_away:
2491         roundIncrement = 0x40;
2492         break;
2493     case float_round_to_zero:
2494         roundIncrement = 0;
2495         break;
2496     case float_round_up:
2497         roundIncrement = zSign ? 0 : 0x7f;
2498         break;
2499     case float_round_down:
2500         roundIncrement = zSign ? 0x7f : 0;
2501         break;
2502     default:
2503         abort();
2504         break;
2505     }
2506     roundBits = zSig & 0x7F;
2507     if ( 0xFD <= (uint16_t) zExp ) {
2508         if (    ( 0xFD < zExp )
2509              || (    ( zExp == 0xFD )
2510                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2511            ) {
2512             float_raise(float_flag_overflow | float_flag_inexact, status);
2513             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2514         }
2515         if ( zExp < 0 ) {
2516             if (status->flush_to_zero) {
2517                 float_raise(float_flag_output_denormal, status);
2518                 return packFloat32(zSign, 0, 0);
2519             }
2520             isTiny =
2521                 (status->float_detect_tininess
2522                  == float_tininess_before_rounding)
2523                 || ( zExp < -1 )
2524                 || ( zSig + roundIncrement < 0x80000000 );
2525             shift32RightJamming( zSig, - zExp, &zSig );
2526             zExp = 0;
2527             roundBits = zSig & 0x7F;
2528             if (isTiny && roundBits) {
2529                 float_raise(float_flag_underflow, status);
2530             }
2531         }
2532     }
2533     if (roundBits) {
2534         status->float_exception_flags |= float_flag_inexact;
2535     }
2536     zSig = ( zSig + roundIncrement )>>7;
2537     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2538     if ( zSig == 0 ) zExp = 0;
2539     return packFloat32( zSign, zExp, zSig );
2540
2541 }
2542
2543 /*----------------------------------------------------------------------------
2544 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2545 | and significand `zSig', and returns the proper single-precision floating-
2546 | point value corresponding to the abstract input.  This routine is just like
2547 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2548 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2549 | floating-point exponent.
2550 *----------------------------------------------------------------------------*/
2551
2552 static float32
2553  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2554                               float_status *status)
2555 {
2556     int8_t shiftCount;
2557
2558     shiftCount = countLeadingZeros32( zSig ) - 1;
2559     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2560                                status);
2561
2562 }
2563
2564 /*----------------------------------------------------------------------------
2565 | If `a' is denormal and we are in flush-to-zero mode then set the
2566 | input-denormal exception and return zero. Otherwise just return the value.
2567 *----------------------------------------------------------------------------*/
2568 float64 float64_squash_input_denormal(float64 a, float_status *status)
2569 {
2570     if (status->flush_inputs_to_zero) {
2571         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2572             float_raise(float_flag_input_denormal, status);
2573             return make_float64(float64_val(a) & (1ULL << 63));
2574         }
2575     }
2576     return a;
2577 }
2578
2579 /*----------------------------------------------------------------------------
2580 | Normalizes the subnormal double-precision floating-point value represented
2581 | by the denormalized significand `aSig'.  The normalized exponent and
2582 | significand are stored at the locations pointed to by `zExpPtr' and
2583 | `zSigPtr', respectively.
2584 *----------------------------------------------------------------------------*/
2585
2586 static void
2587  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2588 {
2589     int8_t shiftCount;
2590
2591     shiftCount = countLeadingZeros64( aSig ) - 11;
2592     *zSigPtr = aSig<<shiftCount;
2593     *zExpPtr = 1 - shiftCount;
2594
2595 }
2596
2597 /*----------------------------------------------------------------------------
2598 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2599 | double-precision floating-point value, returning the result.  After being
2600 | shifted into the proper positions, the three fields are simply added
2601 | together to form the result.  This means that any integer portion of `zSig'
2602 | will be added into the exponent.  Since a properly normalized significand
2603 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2604 | than the desired result exponent whenever `zSig' is a complete, normalized
2605 | significand.
2606 *----------------------------------------------------------------------------*/
2607
2608 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2609 {
2610
2611     return make_float64(
2612         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2613
2614 }
2615
2616 /*----------------------------------------------------------------------------
2617 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2618 | and significand `zSig', and returns the proper double-precision floating-
2619 | point value corresponding to the abstract input.  Ordinarily, the abstract
2620 | value is simply rounded and packed into the double-precision format, with
2621 | the inexact exception raised if the abstract input cannot be represented
2622 | exactly.  However, if the abstract value is too large, the overflow and
2623 | inexact exceptions are raised and an infinity or maximal finite value is
2624 | returned.  If the abstract value is too small, the input value is rounded to
2625 | a subnormal number, and the underflow and inexact exceptions are raised if
2626 | the abstract input cannot be represented exactly as a subnormal double-
2627 | precision floating-point number.
2628 |     The input significand `zSig' has its binary point between bits 62
2629 | and 61, which is 10 bits to the left of the usual location.  This shifted
2630 | significand must be normalized or smaller.  If `zSig' is not normalized,
2631 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2632 | and it must not require rounding.  In the usual case that `zSig' is
2633 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2634 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2635 | Binary Floating-Point Arithmetic.
2636 *----------------------------------------------------------------------------*/
2637
2638 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2639                                    float_status *status)
2640 {
2641     int8_t roundingMode;
2642     flag roundNearestEven;
2643     int roundIncrement, roundBits;
2644     flag isTiny;
2645
2646     roundingMode = status->float_rounding_mode;
2647     roundNearestEven = ( roundingMode == float_round_nearest_even );
2648     switch (roundingMode) {
2649     case float_round_nearest_even:
2650     case float_round_ties_away:
2651         roundIncrement = 0x200;
2652         break;
2653     case float_round_to_zero:
2654         roundIncrement = 0;
2655         break;
2656     case float_round_up:
2657         roundIncrement = zSign ? 0 : 0x3ff;
2658         break;
2659     case float_round_down:
2660         roundIncrement = zSign ? 0x3ff : 0;
2661         break;
2662     case float_round_to_odd:
2663         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2664         break;
2665     default:
2666         abort();
2667     }
2668     roundBits = zSig & 0x3FF;
2669     if ( 0x7FD <= (uint16_t) zExp ) {
2670         if (    ( 0x7FD < zExp )
2671              || (    ( zExp == 0x7FD )
2672                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2673            ) {
2674             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2675                                    roundIncrement != 0;
2676             float_raise(float_flag_overflow | float_flag_inexact, status);
2677             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2678         }
2679         if ( zExp < 0 ) {
2680             if (status->flush_to_zero) {
2681                 float_raise(float_flag_output_denormal, status);
2682                 return packFloat64(zSign, 0, 0);
2683             }
2684             isTiny =
2685                    (status->float_detect_tininess
2686                     == float_tininess_before_rounding)
2687                 || ( zExp < -1 )
2688                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2689             shift64RightJamming( zSig, - zExp, &zSig );
2690             zExp = 0;
2691             roundBits = zSig & 0x3FF;
2692             if (isTiny && roundBits) {
2693                 float_raise(float_flag_underflow, status);
2694             }
2695             if (roundingMode == float_round_to_odd) {
2696                 /*
2697                  * For round-to-odd case, the roundIncrement depends on
2698                  * zSig which just changed.
2699                  */
2700                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2701             }
2702         }
2703     }
2704     if (roundBits) {
2705         status->float_exception_flags |= float_flag_inexact;
2706     }
2707     zSig = ( zSig + roundIncrement )>>10;
2708     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2709     if ( zSig == 0 ) zExp = 0;
2710     return packFloat64( zSign, zExp, zSig );
2711
2712 }
2713
2714 /*----------------------------------------------------------------------------
2715 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2716 | and significand `zSig', and returns the proper double-precision floating-
2717 | point value corresponding to the abstract input.  This routine is just like
2718 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2719 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2720 | floating-point exponent.
2721 *----------------------------------------------------------------------------*/
2722
2723 static float64
2724  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2725                               float_status *status)
2726 {
2727     int8_t shiftCount;
2728
2729     shiftCount = countLeadingZeros64( zSig ) - 1;
2730     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2731                                status);
2732
2733 }
2734
2735 /*----------------------------------------------------------------------------
2736 | Normalizes the subnormal extended double-precision floating-point value
2737 | represented by the denormalized significand `aSig'.  The normalized exponent
2738 | and significand are stored at the locations pointed to by `zExpPtr' and
2739 | `zSigPtr', respectively.
2740 *----------------------------------------------------------------------------*/
2741
2742 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2743                                 uint64_t *zSigPtr)
2744 {
2745     int8_t shiftCount;
2746
2747     shiftCount = countLeadingZeros64( aSig );
2748     *zSigPtr = aSig<<shiftCount;
2749     *zExpPtr = 1 - shiftCount;
2750 }
2751
2752 /*----------------------------------------------------------------------------
2753 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2754 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2755 | and returns the proper extended double-precision floating-point value
2756 | corresponding to the abstract input.  Ordinarily, the abstract value is
2757 | rounded and packed into the extended double-precision format, with the
2758 | inexact exception raised if the abstract input cannot be represented
2759 | exactly.  However, if the abstract value is too large, the overflow and
2760 | inexact exceptions are raised and an infinity or maximal finite value is
2761 | returned.  If the abstract value is too small, the input value is rounded to
2762 | a subnormal number, and the underflow and inexact exceptions are raised if
2763 | the abstract input cannot be represented exactly as a subnormal extended
2764 | double-precision floating-point number.
2765 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2766 | number of bits as single or double precision, respectively.  Otherwise, the
2767 | result is rounded to the full precision of the extended double-precision
2768 | format.
2769 |     The input significand must be normalized or smaller.  If the input
2770 | significand is not normalized, `zExp' must be 0; in that case, the result
2771 | returned is a subnormal number, and it must not require rounding.  The
2772 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2773 | Floating-Point Arithmetic.
2774 *----------------------------------------------------------------------------*/
2775
2776 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2777                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2778                               float_status *status)
2779 {
2780     int8_t roundingMode;
2781     flag roundNearestEven, increment, isTiny;
2782     int64_t roundIncrement, roundMask, roundBits;
2783
2784     roundingMode = status->float_rounding_mode;
2785     roundNearestEven = ( roundingMode == float_round_nearest_even );
2786     if ( roundingPrecision == 80 ) goto precision80;
2787     if ( roundingPrecision == 64 ) {
2788         roundIncrement = LIT64( 0x0000000000000400 );
2789         roundMask = LIT64( 0x00000000000007FF );
2790     }
2791     else if ( roundingPrecision == 32 ) {
2792         roundIncrement = LIT64( 0x0000008000000000 );
2793         roundMask = LIT64( 0x000000FFFFFFFFFF );
2794     }
2795     else {
2796         goto precision80;
2797     }
2798     zSig0 |= ( zSig1 != 0 );
2799     switch (roundingMode) {
2800     case float_round_nearest_even:
2801     case float_round_ties_away:
2802         break;
2803     case float_round_to_zero:
2804         roundIncrement = 0;
2805         break;
2806     case float_round_up:
2807         roundIncrement = zSign ? 0 : roundMask;
2808         break;
2809     case float_round_down:
2810         roundIncrement = zSign ? roundMask : 0;
2811         break;
2812     default:
2813         abort();
2814     }
2815     roundBits = zSig0 & roundMask;
2816     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2817         if (    ( 0x7FFE < zExp )
2818              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2819            ) {
2820             goto overflow;
2821         }
2822         if ( zExp <= 0 ) {
2823             if (status->flush_to_zero) {
2824                 float_raise(float_flag_output_denormal, status);
2825                 return packFloatx80(zSign, 0, 0);
2826             }
2827             isTiny =
2828                    (status->float_detect_tininess
2829                     == float_tininess_before_rounding)
2830                 || ( zExp < 0 )
2831                 || ( zSig0 <= zSig0 + roundIncrement );
2832             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2833             zExp = 0;
2834             roundBits = zSig0 & roundMask;
2835             if (isTiny && roundBits) {
2836                 float_raise(float_flag_underflow, status);
2837             }
2838             if (roundBits) {
2839                 status->float_exception_flags |= float_flag_inexact;
2840             }
2841             zSig0 += roundIncrement;
2842             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2843             roundIncrement = roundMask + 1;
2844             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2845                 roundMask |= roundIncrement;
2846             }
2847             zSig0 &= ~ roundMask;
2848             return packFloatx80( zSign, zExp, zSig0 );
2849         }
2850     }
2851     if (roundBits) {
2852         status->float_exception_flags |= float_flag_inexact;
2853     }
2854     zSig0 += roundIncrement;
2855     if ( zSig0 < roundIncrement ) {
2856         ++zExp;
2857         zSig0 = LIT64( 0x8000000000000000 );
2858     }
2859     roundIncrement = roundMask + 1;
2860     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2861         roundMask |= roundIncrement;
2862     }
2863     zSig0 &= ~ roundMask;
2864     if ( zSig0 == 0 ) zExp = 0;
2865     return packFloatx80( zSign, zExp, zSig0 );
2866  precision80:
2867     switch (roundingMode) {
2868     case float_round_nearest_even:
2869     case float_round_ties_away:
2870         increment = ((int64_t)zSig1 < 0);
2871         break;
2872     case float_round_to_zero:
2873         increment = 0;
2874         break;
2875     case float_round_up:
2876         increment = !zSign && zSig1;
2877         break;
2878     case float_round_down:
2879         increment = zSign && zSig1;
2880         break;
2881     default:
2882         abort();
2883     }
2884     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2885         if (    ( 0x7FFE < zExp )
2886              || (    ( zExp == 0x7FFE )
2887                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2888                   && increment
2889                 )
2890            ) {
2891             roundMask = 0;
2892  overflow:
2893             float_raise(float_flag_overflow | float_flag_inexact, status);
2894             if (    ( roundingMode == float_round_to_zero )
2895                  || ( zSign && ( roundingMode == float_round_up ) )
2896                  || ( ! zSign && ( roundingMode == float_round_down ) )
2897                ) {
2898                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2899             }
2900             return packFloatx80(zSign,
2901                                 floatx80_infinity_high,
2902                                 floatx80_infinity_low);
2903         }
2904         if ( zExp <= 0 ) {
2905             isTiny =
2906                    (status->float_detect_tininess
2907                     == float_tininess_before_rounding)
2908                 || ( zExp < 0 )
2909                 || ! increment
2910                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2911             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2912             zExp = 0;
2913             if (isTiny && zSig1) {
2914                 float_raise(float_flag_underflow, status);
2915             }
2916             if (zSig1) {
2917                 status->float_exception_flags |= float_flag_inexact;
2918             }
2919             switch (roundingMode) {
2920             case float_round_nearest_even:
2921             case float_round_ties_away:
2922                 increment = ((int64_t)zSig1 < 0);
2923                 break;
2924             case float_round_to_zero:
2925                 increment = 0;
2926                 break;
2927             case float_round_up:
2928                 increment = !zSign && zSig1;
2929                 break;
2930             case float_round_down:
2931                 increment = zSign && zSig1;
2932                 break;
2933             default:
2934                 abort();
2935             }
2936             if ( increment ) {
2937                 ++zSig0;
2938                 zSig0 &=
2939                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2940                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2941             }
2942             return packFloatx80( zSign, zExp, zSig0 );
2943         }
2944     }
2945     if (zSig1) {
2946         status->float_exception_flags |= float_flag_inexact;
2947     }
2948     if ( increment ) {
2949         ++zSig0;
2950         if ( zSig0 == 0 ) {
2951             ++zExp;
2952             zSig0 = LIT64( 0x8000000000000000 );
2953         }
2954         else {
2955             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2956         }
2957     }
2958     else {
2959         if ( zSig0 == 0 ) zExp = 0;
2960     }
2961     return packFloatx80( zSign, zExp, zSig0 );
2962
2963 }
2964
2965 /*----------------------------------------------------------------------------
2966 | Takes an abstract floating-point value having sign `zSign', exponent
2967 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2968 | and returns the proper extended double-precision floating-point value
2969 | corresponding to the abstract input.  This routine is just like
2970 | `roundAndPackFloatx80' except that the input significand does not have to be
2971 | normalized.
2972 *----------------------------------------------------------------------------*/
2973
2974 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2975                                        flag zSign, int32_t zExp,
2976                                        uint64_t zSig0, uint64_t zSig1,
2977                                        float_status *status)
2978 {
2979     int8_t shiftCount;
2980
2981     if ( zSig0 == 0 ) {
2982         zSig0 = zSig1;
2983         zSig1 = 0;
2984         zExp -= 64;
2985     }
2986     shiftCount = countLeadingZeros64( zSig0 );
2987     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2988     zExp -= shiftCount;
2989     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2990                                 zSig0, zSig1, status);
2991
2992 }
2993
2994 /*----------------------------------------------------------------------------
2995 | Returns the least-significant 64 fraction bits of the quadruple-precision
2996 | floating-point value `a'.
2997 *----------------------------------------------------------------------------*/
2998
2999 static inline uint64_t extractFloat128Frac1( float128 a )
3000 {
3001
3002     return a.low;
3003
3004 }
3005
3006 /*----------------------------------------------------------------------------
3007 | Returns the most-significant 48 fraction bits of the quadruple-precision
3008 | floating-point value `a'.
3009 *----------------------------------------------------------------------------*/
3010
3011 static inline uint64_t extractFloat128Frac0( float128 a )
3012 {
3013
3014     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3015
3016 }
3017
3018 /*----------------------------------------------------------------------------
3019 | Returns the exponent bits of the quadruple-precision floating-point value
3020 | `a'.
3021 *----------------------------------------------------------------------------*/
3022
3023 static inline int32_t extractFloat128Exp( float128 a )
3024 {
3025
3026     return ( a.high>>48 ) & 0x7FFF;
3027
3028 }
3029
3030 /*----------------------------------------------------------------------------
3031 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3032 *----------------------------------------------------------------------------*/
3033
3034 static inline flag extractFloat128Sign( float128 a )
3035 {
3036
3037     return a.high>>63;
3038
3039 }
3040
3041 /*----------------------------------------------------------------------------
3042 | Normalizes the subnormal quadruple-precision floating-point value
3043 | represented by the denormalized significand formed by the concatenation of
3044 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
3045 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
3046 | significand are stored at the location pointed to by `zSig0Ptr', and the
3047 | least significant 64 bits of the normalized significand are stored at the
3048 | location pointed to by `zSig1Ptr'.
3049 *----------------------------------------------------------------------------*/
3050
3051 static void
3052  normalizeFloat128Subnormal(
3053      uint64_t aSig0,
3054      uint64_t aSig1,
3055      int32_t *zExpPtr,
3056      uint64_t *zSig0Ptr,
3057      uint64_t *zSig1Ptr
3058  )
3059 {
3060     int8_t shiftCount;
3061
3062     if ( aSig0 == 0 ) {
3063         shiftCount = countLeadingZeros64( aSig1 ) - 15;
3064         if ( shiftCount < 0 ) {
3065             *zSig0Ptr = aSig1>>( - shiftCount );
3066             *zSig1Ptr = aSig1<<( shiftCount & 63 );
3067         }
3068         else {
3069             *zSig0Ptr = aSig1<<shiftCount;
3070             *zSig1Ptr = 0;
3071         }
3072         *zExpPtr = - shiftCount - 63;
3073     }
3074     else {
3075         shiftCount = countLeadingZeros64( aSig0 ) - 15;
3076         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
3077         *zExpPtr = 1 - shiftCount;
3078     }
3079
3080 }
3081
3082 /*----------------------------------------------------------------------------
3083 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3084 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3085 | floating-point value, returning the result.  After being shifted into the
3086 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3087 | added together to form the most significant 32 bits of the result.  This
3088 | means that any integer portion of `zSig0' will be added into the exponent.
3089 | Since a properly normalized significand will have an integer portion equal
3090 | to 1, the `zExp' input should be 1 less than the desired result exponent
3091 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3092 | significand.
3093 *----------------------------------------------------------------------------*/
3094
3095 static inline float128
3096  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3097 {
3098     float128 z;
3099
3100     z.low = zSig1;
3101     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3102     return z;
3103
3104 }
3105
3106 /*----------------------------------------------------------------------------
3107 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3108 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3109 | and `zSig2', and returns the proper quadruple-precision floating-point value
3110 | corresponding to the abstract input.  Ordinarily, the abstract value is
3111 | simply rounded and packed into the quadruple-precision format, with the
3112 | inexact exception raised if the abstract input cannot be represented
3113 | exactly.  However, if the abstract value is too large, the overflow and
3114 | inexact exceptions are raised and an infinity or maximal finite value is
3115 | returned.  If the abstract value is too small, the input value is rounded to
3116 | a subnormal number, and the underflow and inexact exceptions are raised if
3117 | the abstract input cannot be represented exactly as a subnormal quadruple-
3118 | precision floating-point number.
3119 |     The input significand must be normalized or smaller.  If the input
3120 | significand is not normalized, `zExp' must be 0; in that case, the result
3121 | returned is a subnormal number, and it must not require rounding.  In the
3122 | usual case that the input significand is normalized, `zExp' must be 1 less
3123 | than the ``true'' floating-point exponent.  The handling of underflow and
3124 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3125 *----------------------------------------------------------------------------*/
3126
3127 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3128                                      uint64_t zSig0, uint64_t zSig1,
3129                                      uint64_t zSig2, float_status *status)
3130 {
3131     int8_t roundingMode;
3132     flag roundNearestEven, increment, isTiny;
3133
3134     roundingMode = status->float_rounding_mode;
3135     roundNearestEven = ( roundingMode == float_round_nearest_even );
3136     switch (roundingMode) {
3137     case float_round_nearest_even:
3138     case float_round_ties_away:
3139         increment = ((int64_t)zSig2 < 0);
3140         break;
3141     case float_round_to_zero:
3142         increment = 0;
3143         break;
3144     case float_round_up:
3145         increment = !zSign && zSig2;
3146         break;
3147     case float_round_down:
3148         increment = zSign && zSig2;
3149         break;
3150     case float_round_to_odd:
3151         increment = !(zSig1 & 0x1) && zSig2;
3152         break;
3153     default:
3154         abort();
3155     }
3156     if ( 0x7FFD <= (uint32_t) zExp ) {
3157         if (    ( 0x7FFD < zExp )
3158              || (    ( zExp == 0x7FFD )
3159                   && eq128(
3160                          LIT64( 0x0001FFFFFFFFFFFF ),
3161                          LIT64( 0xFFFFFFFFFFFFFFFF ),
3162                          zSig0,
3163                          zSig1
3164                      )
3165                   && increment
3166                 )
3167            ) {
3168             float_raise(float_flag_overflow | float_flag_inexact, status);
3169             if (    ( roundingMode == float_round_to_zero )
3170                  || ( zSign && ( roundingMode == float_round_up ) )
3171                  || ( ! zSign && ( roundingMode == float_round_down ) )
3172                  || (roundingMode == float_round_to_odd)
3173                ) {
3174                 return
3175                     packFloat128(
3176                         zSign,
3177                         0x7FFE,
3178                         LIT64( 0x0000FFFFFFFFFFFF ),
3179                         LIT64( 0xFFFFFFFFFFFFFFFF )
3180                     );
3181             }
3182             return packFloat128( zSign, 0x7FFF, 0, 0 );
3183         }
3184         if ( zExp < 0 ) {
3185             if (status->flush_to_zero) {
3186                 float_raise(float_flag_output_denormal, status);
3187                 return packFloat128(zSign, 0, 0, 0);
3188             }
3189             isTiny =
3190                    (status->float_detect_tininess
3191                     == float_tininess_before_rounding)
3192                 || ( zExp < -1 )
3193                 || ! increment
3194                 || lt128(
3195                        zSig0,
3196                        zSig1,
3197                        LIT64( 0x0001FFFFFFFFFFFF ),
3198                        LIT64( 0xFFFFFFFFFFFFFFFF )
3199                    );
3200             shift128ExtraRightJamming(
3201                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3202             zExp = 0;
3203             if (isTiny && zSig2) {
3204                 float_raise(float_flag_underflow, status);
3205             }
3206             switch (roundingMode) {
3207             case float_round_nearest_even:
3208             case float_round_ties_away:
3209                 increment = ((int64_t)zSig2 < 0);
3210                 break;
3211             case float_round_to_zero:
3212                 increment = 0;
3213                 break;
3214             case float_round_up:
3215                 increment = !zSign && zSig2;
3216                 break;
3217             case float_round_down:
3218                 increment = zSign && zSig2;
3219                 break;
3220             case float_round_to_odd:
3221                 increment = !(zSig1 & 0x1) && zSig2;
3222                 break;
3223             default:
3224                 abort();
3225             }
3226         }
3227     }
3228     if (zSig2) {
3229         status->float_exception_flags |= float_flag_inexact;
3230     }
3231     if ( increment ) {
3232         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3233         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3234     }
3235     else {
3236         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3237     }
3238     return packFloat128( zSign, zExp, zSig0, zSig1 );
3239
3240 }
3241
3242 /*----------------------------------------------------------------------------
3243 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3244 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3245 | returns the proper quadruple-precision floating-point value corresponding
3246 | to the abstract input.  This routine is just like `roundAndPackFloat128'
3247 | except that the input significand has fewer bits and does not have to be
3248 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3249 | point exponent.
3250 *----------------------------------------------------------------------------*/
3251
3252 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3253                                               uint64_t zSig0, uint64_t zSig1,
3254                                               float_status *status)
3255 {
3256     int8_t shiftCount;
3257     uint64_t zSig2;
3258
3259     if ( zSig0 == 0 ) {
3260         zSig0 = zSig1;
3261         zSig1 = 0;
3262         zExp -= 64;
3263     }
3264     shiftCount = countLeadingZeros64( zSig0 ) - 15;
3265     if ( 0 <= shiftCount ) {
3266         zSig2 = 0;
3267         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3268     }
3269     else {
3270         shift128ExtraRightJamming(
3271             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3272     }
3273     zExp -= shiftCount;
3274     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3275
3276 }
3277
3278
3279 /*----------------------------------------------------------------------------
3280 | Returns the result of converting the 32-bit two's complement integer `a'
3281 | to the extended double-precision floating-point format.  The conversion
3282 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3283 | Arithmetic.
3284 *----------------------------------------------------------------------------*/
3285
3286 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3287 {
3288     flag zSign;
3289     uint32_t absA;
3290     int8_t shiftCount;
3291     uint64_t zSig;
3292
3293     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3294     zSign = ( a < 0 );
3295     absA = zSign ? - a : a;
3296     shiftCount = countLeadingZeros32( absA ) + 32;
3297     zSig = absA;
3298     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3299
3300 }
3301
3302 /*----------------------------------------------------------------------------
3303 | Returns the result of converting the 32-bit two's complement integer `a' to
3304 | the quadruple-precision floating-point format.  The conversion is performed
3305 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3306 *----------------------------------------------------------------------------*/
3307
3308 float128 int32_to_float128(int32_t a, float_status *status)
3309 {
3310     flag zSign;
3311     uint32_t absA;
3312     int8_t shiftCount;
3313     uint64_t zSig0;
3314
3315     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3316     zSign = ( a < 0 );
3317     absA = zSign ? - a : a;
3318     shiftCount = countLeadingZeros32( absA ) + 17;
3319     zSig0 = absA;
3320     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3321
3322 }
3323
3324 /*----------------------------------------------------------------------------
3325 | Returns the result of converting the 64-bit two's complement integer `a'
3326 | to the extended double-precision floating-point format.  The conversion
3327 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3328 | Arithmetic.
3329 *----------------------------------------------------------------------------*/
3330
3331 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3332 {
3333     flag zSign;
3334     uint64_t absA;
3335     int8_t shiftCount;
3336
3337     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3338     zSign = ( a < 0 );
3339     absA = zSign ? - a : a;
3340     shiftCount = countLeadingZeros64( absA );
3341     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3342
3343 }
3344
3345 /*----------------------------------------------------------------------------
3346 | Returns the result of converting the 64-bit two's complement integer `a' to
3347 | the quadruple-precision floating-point format.  The conversion is performed
3348 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3349 *----------------------------------------------------------------------------*/
3350
3351 float128 int64_to_float128(int64_t a, float_status *status)
3352 {
3353     flag zSign;
3354     uint64_t absA;
3355     int8_t shiftCount;
3356     int32_t zExp;
3357     uint64_t zSig0, zSig1;
3358
3359     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3360     zSign = ( a < 0 );
3361     absA = zSign ? - a : a;
3362     shiftCount = countLeadingZeros64( absA ) + 49;
3363     zExp = 0x406E - shiftCount;
3364     if ( 64 <= shiftCount ) {
3365         zSig1 = 0;
3366         zSig0 = absA;
3367         shiftCount -= 64;
3368     }
3369     else {
3370         zSig1 = absA;
3371         zSig0 = 0;
3372     }
3373     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3374     return packFloat128( zSign, zExp, zSig0, zSig1 );
3375
3376 }
3377
3378 /*----------------------------------------------------------------------------
3379 | Returns the result of converting the 64-bit unsigned integer `a'
3380 | to the quadruple-precision floating-point format.  The conversion is performed
3381 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3382 *----------------------------------------------------------------------------*/
3383
3384 float128 uint64_to_float128(uint64_t a, float_status *status)
3385 {
3386     if (a == 0) {
3387         return float128_zero;
3388     }
3389     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3390 }
3391
3392 /*----------------------------------------------------------------------------
3393 | Returns the result of converting the single-precision floating-point value
3394 | `a' to the extended double-precision floating-point format.  The conversion
3395 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3396 | Arithmetic.
3397 *----------------------------------------------------------------------------*/
3398
3399 floatx80 float32_to_floatx80(float32 a, float_status *status)
3400 {
3401     flag aSign;
3402     int aExp;
3403     uint32_t aSig;
3404
3405     a = float32_squash_input_denormal(a, status);
3406     aSig = extractFloat32Frac( a );
3407     aExp = extractFloat32Exp( a );
3408     aSign = extractFloat32Sign( a );
3409     if ( aExp == 0xFF ) {
3410         if (aSig) {
3411             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3412         }
3413         return packFloatx80(aSign,
3414                             floatx80_infinity_high,
3415                             floatx80_infinity_low);
3416     }
3417     if ( aExp == 0 ) {
3418         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3419         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3420     }
3421     aSig |= 0x00800000;
3422     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3423
3424 }
3425
3426 /*----------------------------------------------------------------------------
3427 | Returns the result of converting the single-precision floating-point value
3428 | `a' to the double-precision floating-point format.  The conversion is
3429 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3430 | Arithmetic.
3431 *----------------------------------------------------------------------------*/
3432
3433 float128 float32_to_float128(float32 a, float_status *status)
3434 {
3435     flag aSign;
3436     int aExp;
3437     uint32_t aSig;
3438
3439     a = float32_squash_input_denormal(a, status);
3440     aSig = extractFloat32Frac( a );
3441     aExp = extractFloat32Exp( a );
3442     aSign = extractFloat32Sign( a );
3443     if ( aExp == 0xFF ) {
3444         if (aSig) {
3445             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3446         }
3447         return packFloat128( aSign, 0x7FFF, 0, 0 );
3448     }
3449     if ( aExp == 0 ) {
3450         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3451         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3452         --aExp;
3453     }
3454     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3455
3456 }
3457
3458 /*----------------------------------------------------------------------------
3459 | Returns the remainder of the single-precision floating-point value `a'
3460 | with respect to the corresponding value `b'.  The operation is performed
3461 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3462 *----------------------------------------------------------------------------*/
3463
3464 float32 float32_rem(float32 a, float32 b, float_status *status)
3465 {
3466     flag aSign, zSign;
3467     int aExp, bExp, expDiff;
3468     uint32_t aSig, bSig;
3469     uint32_t q;
3470     uint64_t aSig64, bSig64, q64;
3471     uint32_t alternateASig;
3472     int32_t sigMean;
3473     a = float32_squash_input_denormal(a, status);
3474     b = float32_squash_input_denormal(b, status);
3475
3476     aSig = extractFloat32Frac( a );
3477     aExp = extractFloat32Exp( a );
3478     aSign = extractFloat32Sign( a );
3479     bSig = extractFloat32Frac( b );
3480     bExp = extractFloat32Exp( b );
3481     if ( aExp == 0xFF ) {
3482         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3483             return propagateFloat32NaN(a, b, status);
3484         }
3485         float_raise(float_flag_invalid, status);
3486         return float32_default_nan(status);
3487     }
3488     if ( bExp == 0xFF ) {
3489         if (bSig) {
3490             return propagateFloat32NaN(a, b, status);
3491         }
3492         return a;
3493     }
3494     if ( bExp == 0 ) {
3495         if ( bSig == 0 ) {
3496             float_raise(float_flag_invalid, status);
3497             return float32_default_nan(status);
3498         }
3499         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3500     }
3501     if ( aExp == 0 ) {
3502         if ( aSig == 0 ) return a;
3503         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3504     }
3505     expDiff = aExp - bExp;
3506     aSig |= 0x00800000;
3507     bSig |= 0x00800000;
3508     if ( expDiff < 32 ) {
3509         aSig <<= 8;
3510         bSig <<= 8;
3511         if ( expDiff < 0 ) {
3512             if ( expDiff < -1 ) return a;
3513             aSig >>= 1;
3514         }
3515         q = ( bSig <= aSig );
3516         if ( q ) aSig -= bSig;
3517         if ( 0 < expDiff ) {
3518             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3519             q >>= 32 - expDiff;
3520             bSig >>= 2;
3521             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3522         }
3523         else {
3524             aSig >>= 2;
3525             bSig >>= 2;
3526         }
3527     }
3528     else {
3529         if ( bSig <= aSig ) aSig -= bSig;
3530         aSig64 = ( (uint64_t) aSig )<<40;
3531         bSig64 = ( (uint64_t) bSig )<<40;
3532         expDiff -= 64;
3533         while ( 0 < expDiff ) {
3534             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3535             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3536             aSig64 = - ( ( bSig * q64 )<<38 );
3537             expDiff -= 62;
3538         }
3539         expDiff += 64;
3540         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3541         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3542         q = q64>>( 64 - expDiff );
3543         bSig <<= 6;
3544         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3545     }
3546     do {
3547         alternateASig = aSig;
3548         ++q;
3549         aSig -= bSig;
3550     } while ( 0 <= (int32_t) aSig );
3551     sigMean = aSig + alternateASig;
3552     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3553         aSig = alternateASig;
3554     }
3555     zSign = ( (int32_t) aSig < 0 );
3556     if ( zSign ) aSig = - aSig;
3557     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3558 }
3559
3560
3561
3562 /*----------------------------------------------------------------------------
3563 | Returns the binary exponential of the single-precision floating-point value
3564 | `a'. The operation is performed according to the IEC/IEEE Standard for
3565 | Binary Floating-Point Arithmetic.
3566 |
3567 | Uses the following identities:
3568 |
3569 | 1. -------------------------------------------------------------------------
3570 |      x    x*ln(2)
3571 |     2  = e
3572 |
3573 | 2. -------------------------------------------------------------------------
3574 |                      2     3     4     5           n
3575 |      x        x     x     x     x     x           x
3576 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3577 |               1!    2!    3!    4!    5!          n!
3578 *----------------------------------------------------------------------------*/
3579
3580 static const float64 float32_exp2_coefficients[15] =
3581 {
3582     const_float64( 0x3ff0000000000000ll ), /*  1 */
3583     const_float64( 0x3fe0000000000000ll ), /*  2 */
3584     const_float64( 0x3fc5555555555555ll ), /*  3 */
3585     const_float64( 0x3fa5555555555555ll ), /*  4 */
3586     const_float64( 0x3f81111111111111ll ), /*  5 */
3587     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3588     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3589     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3590     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3591     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3592     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3593     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3594     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3595     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3596     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3597 };
3598
3599 float32 float32_exp2(float32 a, float_status *status)
3600 {
3601     flag aSign;
3602     int aExp;
3603     uint32_t aSig;
3604     float64 r, x, xn;
3605     int i;
3606     a = float32_squash_input_denormal(a, status);
3607
3608     aSig = extractFloat32Frac( a );
3609     aExp = extractFloat32Exp( a );
3610     aSign = extractFloat32Sign( a );
3611
3612     if ( aExp == 0xFF) {
3613         if (aSig) {
3614             return propagateFloat32NaN(a, float32_zero, status);
3615         }
3616         return (aSign) ? float32_zero : a;
3617     }
3618     if (aExp == 0) {
3619         if (aSig == 0) return float32_one;
3620     }
3621
3622     float_raise(float_flag_inexact, status);
3623
3624     /* ******************************* */
3625     /* using float64 for approximation */
3626     /* ******************************* */
3627     x = float32_to_float64(a, status);
3628     x = float64_mul(x, float64_ln2, status);
3629
3630     xn = x;
3631     r = float64_one;
3632     for (i = 0 ; i < 15 ; i++) {
3633         float64 f;
3634
3635         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3636         r = float64_add(r, f, status);
3637
3638         xn = float64_mul(xn, x, status);
3639     }
3640
3641     return float64_to_float32(r, status);
3642 }
3643
3644 /*----------------------------------------------------------------------------
3645 | Returns the binary log of the single-precision floating-point value `a'.
3646 | The operation is performed according to the IEC/IEEE Standard for Binary
3647 | Floating-Point Arithmetic.
3648 *----------------------------------------------------------------------------*/
3649 float32 float32_log2(float32 a, float_status *status)
3650 {
3651     flag aSign, zSign;
3652     int aExp;
3653     uint32_t aSig, zSig, i;
3654
3655     a = float32_squash_input_denormal(a, status);
3656     aSig = extractFloat32Frac( a );
3657     aExp = extractFloat32Exp( a );
3658     aSign = extractFloat32Sign( a );
3659
3660     if ( aExp == 0 ) {
3661         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3662         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3663     }
3664     if ( aSign ) {
3665         float_raise(float_flag_invalid, status);
3666         return float32_default_nan(status);
3667     }
3668     if ( aExp == 0xFF ) {
3669         if (aSig) {
3670             return propagateFloat32NaN(a, float32_zero, status);
3671         }
3672         return a;
3673     }
3674
3675     aExp -= 0x7F;
3676     aSig |= 0x00800000;
3677     zSign = aExp < 0;
3678     zSig = aExp << 23;
3679
3680     for (i = 1 << 22; i > 0; i >>= 1) {
3681         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3682         if ( aSig & 0x01000000 ) {
3683             aSig >>= 1;
3684             zSig |= i;
3685         }
3686     }
3687
3688     if ( zSign )
3689         zSig = -zSig;
3690
3691     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3692 }
3693
3694 /*----------------------------------------------------------------------------
3695 | Returns 1 if the single-precision floating-point value `a' is equal to
3696 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3697 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3698 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3699 *----------------------------------------------------------------------------*/
3700
3701 int float32_eq(float32 a, float32 b, float_status *status)
3702 {
3703     uint32_t av, bv;
3704     a = float32_squash_input_denormal(a, status);
3705     b = float32_squash_input_denormal(b, status);
3706
3707     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3708          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3709        ) {
3710         float_raise(float_flag_invalid, status);
3711         return 0;
3712     }
3713     av = float32_val(a);
3714     bv = float32_val(b);
3715     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3716 }
3717
3718 /*----------------------------------------------------------------------------
3719 | Returns 1 if the single-precision floating-point value `a' is less than
3720 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3721 | exception is raised if either operand is a NaN.  The comparison is performed
3722 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3723 *----------------------------------------------------------------------------*/
3724
3725 int float32_le(float32 a, float32 b, float_status *status)
3726 {
3727     flag aSign, bSign;
3728     uint32_t av, bv;
3729     a = float32_squash_input_denormal(a, status);
3730     b = float32_squash_input_denormal(b, status);
3731
3732     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3733          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3734        ) {
3735         float_raise(float_flag_invalid, status);
3736         return 0;
3737     }
3738     aSign = extractFloat32Sign( a );
3739     bSign = extractFloat32Sign( b );
3740     av = float32_val(a);
3741     bv = float32_val(b);
3742     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3743     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3744
3745 }
3746
3747 /*----------------------------------------------------------------------------
3748 | Returns 1 if the single-precision floating-point value `a' is less than
3749 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3750 | raised if either operand is a NaN.  The comparison is performed according
3751 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3752 *----------------------------------------------------------------------------*/
3753
3754 int float32_lt(float32 a, float32 b, float_status *status)
3755 {
3756     flag aSign, bSign;
3757     uint32_t av, bv;
3758     a = float32_squash_input_denormal(a, status);
3759     b = float32_squash_input_denormal(b, status);
3760
3761     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3762          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3763        ) {
3764         float_raise(float_flag_invalid, status);
3765         return 0;
3766     }
3767     aSign = extractFloat32Sign( a );
3768     bSign = extractFloat32Sign( b );
3769     av = float32_val(a);
3770     bv = float32_val(b);
3771     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3772     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3773
3774 }
3775
3776 /*----------------------------------------------------------------------------
3777 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3778 | be compared, and 0 otherwise.  The invalid exception is raised if either
3779 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3780 | Standard for Binary Floating-Point Arithmetic.
3781 *----------------------------------------------------------------------------*/
3782
3783 int float32_unordered(float32 a, float32 b, float_status *status)
3784 {
3785     a = float32_squash_input_denormal(a, status);
3786     b = float32_squash_input_denormal(b, status);
3787
3788     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3789          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3790        ) {
3791         float_raise(float_flag_invalid, status);
3792         return 1;
3793     }
3794     return 0;
3795 }
3796
3797 /*----------------------------------------------------------------------------
3798 | Returns 1 if the single-precision floating-point value `a' is equal to
3799 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3800 | exception.  The comparison is performed according to the IEC/IEEE Standard
3801 | for Binary Floating-Point Arithmetic.
3802 *----------------------------------------------------------------------------*/
3803
3804 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3805 {
3806     a = float32_squash_input_denormal(a, status);
3807     b = float32_squash_input_denormal(b, status);
3808
3809     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3810          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3811        ) {
3812         if (float32_is_signaling_nan(a, status)
3813          || float32_is_signaling_nan(b, status)) {
3814             float_raise(float_flag_invalid, status);
3815         }
3816         return 0;
3817     }
3818     return ( float32_val(a) == float32_val(b) ) ||
3819             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3820 }
3821
3822 /*----------------------------------------------------------------------------
3823 | Returns 1 if the single-precision floating-point value `a' is less than or
3824 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3825 | cause an exception.  Otherwise, the comparison is performed according to the
3826 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3827 *----------------------------------------------------------------------------*/
3828
3829 int float32_le_quiet(float32 a, float32 b, float_status *status)
3830 {
3831     flag aSign, bSign;
3832     uint32_t av, bv;
3833     a = float32_squash_input_denormal(a, status);
3834     b = float32_squash_input_denormal(b, status);
3835
3836     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3837          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3838        ) {
3839         if (float32_is_signaling_nan(a, status)
3840          || float32_is_signaling_nan(b, status)) {
3841             float_raise(float_flag_invalid, status);
3842         }
3843         return 0;
3844     }
3845     aSign = extractFloat32Sign( a );
3846     bSign = extractFloat32Sign( b );
3847     av = float32_val(a);
3848     bv = float32_val(b);
3849     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3850     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3851
3852 }
3853
3854 /*----------------------------------------------------------------------------
3855 | Returns 1 if the single-precision floating-point value `a' is less than
3856 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3857 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3858 | Standard for Binary Floating-Point Arithmetic.
3859 *----------------------------------------------------------------------------*/
3860
3861 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3862 {
3863     flag aSign, bSign;
3864     uint32_t av, bv;
3865     a = float32_squash_input_denormal(a, status);
3866     b = float32_squash_input_denormal(b, status);
3867
3868     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3869          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3870        ) {
3871         if (float32_is_signaling_nan(a, status)
3872          || float32_is_signaling_nan(b, status)) {
3873             float_raise(float_flag_invalid, status);
3874         }
3875         return 0;
3876     }
3877     aSign = extractFloat32Sign( a );
3878     bSign = extractFloat32Sign( b );
3879     av = float32_val(a);
3880     bv = float32_val(b);
3881     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3882     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3883
3884 }
3885
3886 /*----------------------------------------------------------------------------
3887 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3888 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3889 | comparison is performed according to the IEC/IEEE Standard for Binary
3890 | Floating-Point Arithmetic.
3891 *----------------------------------------------------------------------------*/
3892
3893 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3894 {
3895     a = float32_squash_input_denormal(a, status);
3896     b = float32_squash_input_denormal(b, status);
3897
3898     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3899          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3900        ) {
3901         if (float32_is_signaling_nan(a, status)
3902          || float32_is_signaling_nan(b, status)) {
3903             float_raise(float_flag_invalid, status);
3904         }
3905         return 1;
3906     }
3907     return 0;
3908 }
3909
3910 /*----------------------------------------------------------------------------
3911 | If `a' is denormal and we are in flush-to-zero mode then set the
3912 | input-denormal exception and return zero. Otherwise just return the value.
3913 *----------------------------------------------------------------------------*/
3914 float16 float16_squash_input_denormal(float16 a, float_status *status)
3915 {
3916     if (status->flush_inputs_to_zero) {
3917         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3918             float_raise(float_flag_input_denormal, status);
3919             return make_float16(float16_val(a) & 0x8000);
3920         }
3921     }
3922     return a;
3923 }
3924
3925 /*----------------------------------------------------------------------------
3926 | Returns the result of converting the double-precision floating-point value
3927 | `a' to the extended double-precision floating-point format.  The conversion
3928 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3929 | Arithmetic.
3930 *----------------------------------------------------------------------------*/
3931
3932 floatx80 float64_to_floatx80(float64 a, float_status *status)
3933 {
3934     flag aSign;
3935     int aExp;
3936     uint64_t aSig;
3937
3938     a = float64_squash_input_denormal(a, status);
3939     aSig = extractFloat64Frac( a );
3940     aExp = extractFloat64Exp( a );
3941     aSign = extractFloat64Sign( a );
3942     if ( aExp == 0x7FF ) {
3943         if (aSig) {
3944             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3945         }
3946         return packFloatx80(aSign,
3947                             floatx80_infinity_high,
3948                             floatx80_infinity_low);
3949     }
3950     if ( aExp == 0 ) {
3951         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3952         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3953     }
3954     return
3955         packFloatx80(
3956             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3957
3958 }
3959
3960 /*----------------------------------------------------------------------------
3961 | Returns the result of converting the double-precision floating-point value
3962 | `a' to the quadruple-precision floating-point format.  The conversion is
3963 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3964 | Arithmetic.
3965 *----------------------------------------------------------------------------*/
3966
3967 float128 float64_to_float128(float64 a, float_status *status)
3968 {
3969     flag aSign;
3970     int aExp;
3971     uint64_t aSig, zSig0, zSig1;
3972
3973     a = float64_squash_input_denormal(a, status);
3974     aSig = extractFloat64Frac( a );
3975     aExp = extractFloat64Exp( a );
3976     aSign = extractFloat64Sign( a );
3977     if ( aExp == 0x7FF ) {
3978         if (aSig) {
3979             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3980         }
3981         return packFloat128( aSign, 0x7FFF, 0, 0 );
3982     }
3983     if ( aExp == 0 ) {
3984         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3985         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3986         --aExp;
3987     }
3988     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3989     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3990
3991 }
3992
3993
3994 /*----------------------------------------------------------------------------
3995 | Returns the remainder of the double-precision floating-point value `a'
3996 | with respect to the corresponding value `b'.  The operation is performed
3997 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3998 *----------------------------------------------------------------------------*/
3999
4000 float64 float64_rem(float64 a, float64 b, float_status *status)
4001 {
4002     flag aSign, zSign;
4003     int aExp, bExp, expDiff;
4004     uint64_t aSig, bSig;
4005     uint64_t q, alternateASig;
4006     int64_t sigMean;
4007
4008     a = float64_squash_input_denormal(a, status);
4009     b = float64_squash_input_denormal(b, status);
4010     aSig = extractFloat64Frac( a );
4011     aExp = extractFloat64Exp( a );
4012     aSign = extractFloat64Sign( a );
4013     bSig = extractFloat64Frac( b );
4014     bExp = extractFloat64Exp( b );
4015     if ( aExp == 0x7FF ) {
4016         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4017             return propagateFloat64NaN(a, b, status);
4018         }
4019         float_raise(float_flag_invalid, status);
4020         return float64_default_nan(status);
4021     }
4022     if ( bExp == 0x7FF ) {
4023         if (bSig) {
4024             return propagateFloat64NaN(a, b, status);
4025         }
4026         return a;
4027     }
4028     if ( bExp == 0 ) {
4029         if ( bSig == 0 ) {
4030             float_raise(float_flag_invalid, status);
4031             return float64_default_nan(status);
4032         }
4033         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4034     }
4035     if ( aExp == 0 ) {
4036         if ( aSig == 0 ) return a;
4037         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4038     }
4039     expDiff = aExp - bExp;
4040     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4041     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4042     if ( expDiff < 0 ) {
4043         if ( expDiff < -1 ) return a;
4044         aSig >>= 1;
4045     }
4046     q = ( bSig <= aSig );
4047     if ( q ) aSig -= bSig;
4048     expDiff -= 64;
4049     while ( 0 < expDiff ) {
4050         q = estimateDiv128To64( aSig, 0, bSig );
4051         q = ( 2 < q ) ? q - 2 : 0;
4052         aSig = - ( ( bSig>>2 ) * q );
4053         expDiff -= 62;
4054     }
4055     expDiff += 64;
4056     if ( 0 < expDiff ) {
4057         q = estimateDiv128To64( aSig, 0, bSig );
4058         q = ( 2 < q ) ? q - 2 : 0;
4059         q >>= 64 - expDiff;
4060         bSig >>= 2;
4061         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4062     }
4063     else {
4064         aSig >>= 2;
4065         bSig >>= 2;
4066     }
4067     do {
4068         alternateASig = aSig;
4069         ++q;
4070         aSig -= bSig;
4071     } while ( 0 <= (int64_t) aSig );
4072     sigMean = aSig + alternateASig;
4073     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4074         aSig = alternateASig;
4075     }
4076     zSign = ( (int64_t) aSig < 0 );
4077     if ( zSign ) aSig = - aSig;
4078     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4079
4080 }
4081
4082 /*----------------------------------------------------------------------------
4083 | Returns the binary log of the double-precision floating-point value `a'.
4084 | The operation is performed according to the IEC/IEEE Standard for Binary
4085 | Floating-Point Arithmetic.
4086 *----------------------------------------------------------------------------*/
4087 float64 float64_log2(float64 a, float_status *status)
4088 {
4089     flag aSign, zSign;
4090     int aExp;
4091     uint64_t aSig, aSig0, aSig1, zSig, i;
4092     a = float64_squash_input_denormal(a, status);
4093
4094     aSig = extractFloat64Frac( a );
4095     aExp = extractFloat64Exp( a );
4096     aSign = extractFloat64Sign( a );
4097
4098     if ( aExp == 0 ) {
4099         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4100         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4101     }
4102     if ( aSign ) {
4103         float_raise(float_flag_invalid, status);
4104         return float64_default_nan(status);
4105     }
4106     if ( aExp == 0x7FF ) {
4107         if (aSig) {
4108             return propagateFloat64NaN(a, float64_zero, status);
4109         }
4110         return a;
4111     }
4112
4113     aExp -= 0x3FF;
4114     aSig |= LIT64( 0x0010000000000000 );
4115     zSign = aExp < 0;
4116     zSig = (uint64_t)aExp << 52;
4117     for (i = 1LL << 51; i > 0; i >>= 1) {
4118         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4119         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4120         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4121             aSig >>= 1;
4122             zSig |= i;
4123         }
4124     }
4125
4126     if ( zSign )
4127         zSig = -zSig;
4128     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4129 }
4130
4131 /*----------------------------------------------------------------------------
4132 | Returns 1 if the double-precision floating-point value `a' is equal to the
4133 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4134 | if either operand is a NaN.  Otherwise, the comparison is performed
4135 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4136 *----------------------------------------------------------------------------*/
4137
4138 int float64_eq(float64 a, float64 b, float_status *status)
4139 {
4140     uint64_t av, bv;
4141     a = float64_squash_input_denormal(a, status);
4142     b = float64_squash_input_denormal(b, status);
4143
4144     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4145          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4146        ) {
4147         float_raise(float_flag_invalid, status);
4148         return 0;
4149     }
4150     av = float64_val(a);
4151     bv = float64_val(b);
4152     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4153
4154 }
4155
4156 /*----------------------------------------------------------------------------
4157 | Returns 1 if the double-precision floating-point value `a' is less than or
4158 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4159 | exception is raised if either operand is a NaN.  The comparison is performed
4160 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4161 *----------------------------------------------------------------------------*/
4162
4163 int float64_le(float64 a, float64 b, float_status *status)
4164 {
4165     flag aSign, bSign;
4166     uint64_t av, bv;
4167     a = float64_squash_input_denormal(a, status);
4168     b = float64_squash_input_denormal(b, status);
4169
4170     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4171          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4172        ) {
4173         float_raise(float_flag_invalid, status);
4174         return 0;
4175     }
4176     aSign = extractFloat64Sign( a );
4177     bSign = extractFloat64Sign( b );
4178     av = float64_val(a);
4179     bv = float64_val(b);
4180     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4181     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4182
4183 }
4184
4185 /*----------------------------------------------------------------------------
4186 | Returns 1 if the double-precision floating-point value `a' is less than
4187 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4188 | raised if either operand is a NaN.  The comparison is performed according
4189 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4190 *----------------------------------------------------------------------------*/
4191
4192 int float64_lt(float64 a, float64 b, float_status *status)
4193 {
4194     flag aSign, bSign;
4195     uint64_t av, bv;
4196
4197     a = float64_squash_input_denormal(a, status);
4198     b = float64_squash_input_denormal(b, status);
4199     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4200          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4201        ) {
4202         float_raise(float_flag_invalid, status);
4203         return 0;
4204     }
4205     aSign = extractFloat64Sign( a );
4206     bSign = extractFloat64Sign( b );
4207     av = float64_val(a);
4208     bv = float64_val(b);
4209     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4210     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4211
4212 }
4213
4214 /*----------------------------------------------------------------------------
4215 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4216 | be compared, and 0 otherwise.  The invalid exception is raised if either
4217 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4218 | Standard for Binary Floating-Point Arithmetic.
4219 *----------------------------------------------------------------------------*/
4220
4221 int float64_unordered(float64 a, float64 b, float_status *status)
4222 {
4223     a = float64_squash_input_denormal(a, status);
4224     b = float64_squash_input_denormal(b, status);
4225
4226     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4227          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4228        ) {
4229         float_raise(float_flag_invalid, status);
4230         return 1;
4231     }
4232     return 0;
4233 }
4234
4235 /*----------------------------------------------------------------------------
4236 | Returns 1 if the double-precision floating-point value `a' is equal to the
4237 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4238 | exception.The comparison is performed according to the IEC/IEEE Standard
4239 | for Binary Floating-Point Arithmetic.
4240 *----------------------------------------------------------------------------*/
4241
4242 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4243 {
4244     uint64_t av, bv;
4245     a = float64_squash_input_denormal(a, status);
4246     b = float64_squash_input_denormal(b, status);
4247
4248     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4249          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4250        ) {
4251         if (float64_is_signaling_nan(a, status)
4252          || float64_is_signaling_nan(b, status)) {
4253             float_raise(float_flag_invalid, status);
4254         }
4255         return 0;
4256     }
4257     av = float64_val(a);
4258     bv = float64_val(b);
4259     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4260
4261 }
4262
4263 /*----------------------------------------------------------------------------
4264 | Returns 1 if the double-precision floating-point value `a' is less than or
4265 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4266 | cause an exception.  Otherwise, the comparison is performed according to the
4267 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4268 *----------------------------------------------------------------------------*/
4269
4270 int float64_le_quiet(float64 a, float64 b, float_status *status)
4271 {
4272     flag aSign, bSign;
4273     uint64_t av, bv;
4274     a = float64_squash_input_denormal(a, status);
4275     b = float64_squash_input_denormal(b, status);
4276
4277     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4278          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4279        ) {
4280         if (float64_is_signaling_nan(a, status)
4281          || float64_is_signaling_nan(b, status)) {
4282             float_raise(float_flag_invalid, status);
4283         }
4284         return 0;
4285     }
4286     aSign = extractFloat64Sign( a );
4287     bSign = extractFloat64Sign( b );
4288     av = float64_val(a);
4289     bv = float64_val(b);
4290     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4291     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4292
4293 }
4294
4295 /*----------------------------------------------------------------------------
4296 | Returns 1 if the double-precision floating-point value `a' is less than
4297 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4298 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4299 | Standard for Binary Floating-Point Arithmetic.
4300 *----------------------------------------------------------------------------*/
4301
4302 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4303 {
4304     flag aSign, bSign;
4305     uint64_t av, bv;
4306     a = float64_squash_input_denormal(a, status);
4307     b = float64_squash_input_denormal(b, status);
4308
4309     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4310          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4311        ) {
4312         if (float64_is_signaling_nan(a, status)
4313          || float64_is_signaling_nan(b, status)) {
4314             float_raise(float_flag_invalid, status);
4315         }
4316         return 0;
4317     }
4318     aSign = extractFloat64Sign( a );
4319     bSign = extractFloat64Sign( b );
4320     av = float64_val(a);
4321     bv = float64_val(b);
4322     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4323     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4324
4325 }
4326
4327 /*----------------------------------------------------------------------------
4328 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4329 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4330 | comparison is performed according to the IEC/IEEE Standard for Binary
4331 | Floating-Point Arithmetic.
4332 *----------------------------------------------------------------------------*/
4333
4334 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4335 {
4336     a = float64_squash_input_denormal(a, status);
4337     b = float64_squash_input_denormal(b, status);
4338
4339     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4340          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4341        ) {
4342         if (float64_is_signaling_nan(a, status)
4343          || float64_is_signaling_nan(b, status)) {
4344             float_raise(float_flag_invalid, status);
4345         }
4346         return 1;
4347     }
4348     return 0;
4349 }
4350
4351 /*----------------------------------------------------------------------------
4352 | Returns the result of converting the extended double-precision floating-
4353 | point value `a' to the 32-bit two's complement integer format.  The
4354 | conversion is performed according to the IEC/IEEE Standard for Binary
4355 | Floating-Point Arithmetic---which means in particular that the conversion
4356 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4357 | largest positive integer is returned.  Otherwise, if the conversion
4358 | overflows, the largest integer with the same sign as `a' is returned.
4359 *----------------------------------------------------------------------------*/
4360
4361 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4362 {
4363     flag aSign;
4364     int32_t aExp, shiftCount;
4365     uint64_t aSig;
4366
4367     if (floatx80_invalid_encoding(a)) {
4368         float_raise(float_flag_invalid, status);
4369         return 1 << 31;
4370     }
4371     aSig = extractFloatx80Frac( a );
4372     aExp = extractFloatx80Exp( a );
4373     aSign = extractFloatx80Sign( a );
4374     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4375     shiftCount = 0x4037 - aExp;
4376     if ( shiftCount <= 0 ) shiftCount = 1;
4377     shift64RightJamming( aSig, shiftCount, &aSig );
4378     return roundAndPackInt32(aSign, aSig, status);
4379
4380 }
4381
4382 /*----------------------------------------------------------------------------
4383 | Returns the result of converting the extended double-precision floating-
4384 | point value `a' to the 32-bit two's complement integer format.  The
4385 | conversion is performed according to the IEC/IEEE Standard for Binary
4386 | Floating-Point Arithmetic, except that the conversion is always rounded
4387 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4388 | Otherwise, if the conversion overflows, the largest integer with the same
4389 | sign as `a' is returned.
4390 *----------------------------------------------------------------------------*/
4391
4392 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4393 {
4394     flag aSign;
4395     int32_t aExp, shiftCount;
4396     uint64_t aSig, savedASig;
4397     int32_t z;
4398
4399     if (floatx80_invalid_encoding(a)) {
4400         float_raise(float_flag_invalid, status);
4401         return 1 << 31;
4402     }
4403     aSig = extractFloatx80Frac( a );
4404     aExp = extractFloatx80Exp( a );
4405     aSign = extractFloatx80Sign( a );
4406     if ( 0x401E < aExp ) {
4407         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4408         goto invalid;
4409     }
4410     else if ( aExp < 0x3FFF ) {
4411         if (aExp || aSig) {
4412             status->float_exception_flags |= float_flag_inexact;
4413         }
4414         return 0;
4415     }
4416     shiftCount = 0x403E - aExp;
4417     savedASig = aSig;
4418     aSig >>= shiftCount;
4419     z = aSig;
4420     if ( aSign ) z = - z;
4421     if ( ( z < 0 ) ^ aSign ) {
4422  invalid:
4423         float_raise(float_flag_invalid, status);
4424         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4425     }
4426     if ( ( aSig<<shiftCount ) != savedASig ) {
4427         status->float_exception_flags |= float_flag_inexact;
4428     }
4429     return z;
4430
4431 }
4432
4433 /*----------------------------------------------------------------------------
4434 | Returns the result of converting the extended double-precision floating-
4435 | point value `a' to the 64-bit two's complement integer format.  The
4436 | conversion is performed according to the IEC/IEEE Standard for Binary
4437 | Floating-Point Arithmetic---which means in particular that the conversion
4438 | is rounded according to the current rounding mode.  If `a' is a NaN,
4439 | the largest positive integer is returned.  Otherwise, if the conversion
4440 | overflows, the largest integer with the same sign as `a' is returned.
4441 *----------------------------------------------------------------------------*/
4442
4443 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4444 {
4445     flag aSign;
4446     int32_t aExp, shiftCount;
4447     uint64_t aSig, aSigExtra;
4448
4449     if (floatx80_invalid_encoding(a)) {
4450         float_raise(float_flag_invalid, status);
4451         return 1ULL << 63;
4452     }
4453     aSig = extractFloatx80Frac( a );
4454     aExp = extractFloatx80Exp( a );
4455     aSign = extractFloatx80Sign( a );
4456     shiftCount = 0x403E - aExp;
4457     if ( shiftCount <= 0 ) {
4458         if ( shiftCount ) {
4459             float_raise(float_flag_invalid, status);
4460             if (!aSign || floatx80_is_any_nan(a)) {
4461                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4462             }
4463             return (int64_t) LIT64( 0x8000000000000000 );
4464         }
4465         aSigExtra = 0;
4466     }
4467     else {
4468         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4469     }
4470     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4471
4472 }
4473
4474 /*----------------------------------------------------------------------------
4475 | Returns the result of converting the extended double-precision floating-
4476 | point value `a' to the 64-bit two's complement integer format.  The
4477 | conversion is performed according to the IEC/IEEE Standard for Binary
4478 | Floating-Point Arithmetic, except that the conversion is always rounded
4479 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4480 | Otherwise, if the conversion overflows, the largest integer with the same
4481 | sign as `a' is returned.
4482 *----------------------------------------------------------------------------*/
4483
4484 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4485 {
4486     flag aSign;
4487     int32_t aExp, shiftCount;
4488     uint64_t aSig;
4489     int64_t z;
4490
4491     if (floatx80_invalid_encoding(a)) {
4492         float_raise(float_flag_invalid, status);
4493         return 1ULL << 63;
4494     }
4495     aSig = extractFloatx80Frac( a );
4496     aExp = extractFloatx80Exp( a );
4497     aSign = extractFloatx80Sign( a );
4498     shiftCount = aExp - 0x403E;
4499     if ( 0 <= shiftCount ) {
4500         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4501         if ( ( a.high != 0xC03E ) || aSig ) {
4502             float_raise(float_flag_invalid, status);
4503             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4504                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4505             }
4506         }
4507         return (int64_t) LIT64( 0x8000000000000000 );
4508     }
4509     else if ( aExp < 0x3FFF ) {
4510         if (aExp | aSig) {
4511             status->float_exception_flags |= float_flag_inexact;
4512         }
4513         return 0;
4514     }
4515     z = aSig>>( - shiftCount );
4516     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4517         status->float_exception_flags |= float_flag_inexact;
4518     }
4519     if ( aSign ) z = - z;
4520     return z;
4521
4522 }
4523
4524 /*----------------------------------------------------------------------------
4525 | Returns the result of converting the extended double-precision floating-
4526 | point value `a' to the single-precision floating-point format.  The
4527 | conversion is performed according to the IEC/IEEE Standard for Binary
4528 | Floating-Point Arithmetic.
4529 *----------------------------------------------------------------------------*/
4530
4531 float32 floatx80_to_float32(floatx80 a, float_status *status)
4532 {
4533     flag aSign;
4534     int32_t aExp;
4535     uint64_t aSig;
4536
4537     if (floatx80_invalid_encoding(a)) {
4538         float_raise(float_flag_invalid, status);
4539         return float32_default_nan(status);
4540     }
4541     aSig = extractFloatx80Frac( a );
4542     aExp = extractFloatx80Exp( a );
4543     aSign = extractFloatx80Sign( a );
4544     if ( aExp == 0x7FFF ) {
4545         if ( (uint64_t) ( aSig<<1 ) ) {
4546             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4547         }
4548         return packFloat32( aSign, 0xFF, 0 );
4549     }
4550     shift64RightJamming( aSig, 33, &aSig );
4551     if ( aExp || aSig ) aExp -= 0x3F81;
4552     return roundAndPackFloat32(aSign, aExp, aSig, status);
4553
4554 }
4555
4556 /*----------------------------------------------------------------------------
4557 | Returns the result of converting the extended double-precision floating-
4558 | point value `a' to the double-precision floating-point format.  The
4559 | conversion is performed according to the IEC/IEEE Standard for Binary
4560 | Floating-Point Arithmetic.
4561 *----------------------------------------------------------------------------*/
4562
4563 float64 floatx80_to_float64(floatx80 a, float_status *status)
4564 {
4565     flag aSign;
4566     int32_t aExp;
4567     uint64_t aSig, zSig;
4568
4569     if (floatx80_invalid_encoding(a)) {
4570         float_raise(float_flag_invalid, status);
4571         return float64_default_nan(status);
4572     }
4573     aSig = extractFloatx80Frac( a );
4574     aExp = extractFloatx80Exp( a );
4575     aSign = extractFloatx80Sign( a );
4576     if ( aExp == 0x7FFF ) {
4577         if ( (uint64_t) ( aSig<<1 ) ) {
4578             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4579         }
4580         return packFloat64( aSign, 0x7FF, 0 );
4581     }
4582     shift64RightJamming( aSig, 1, &zSig );
4583     if ( aExp || aSig ) aExp -= 0x3C01;
4584     return roundAndPackFloat64(aSign, aExp, zSig, status);
4585
4586 }
4587
4588 /*----------------------------------------------------------------------------
4589 | Returns the result of converting the extended double-precision floating-
4590 | point value `a' to the quadruple-precision floating-point format.  The
4591 | conversion is performed according to the IEC/IEEE Standard for Binary
4592 | Floating-Point Arithmetic.
4593 *----------------------------------------------------------------------------*/
4594
4595 float128 floatx80_to_float128(floatx80 a, float_status *status)
4596 {
4597     flag aSign;
4598     int aExp;
4599     uint64_t aSig, zSig0, zSig1;
4600
4601     if (floatx80_invalid_encoding(a)) {
4602         float_raise(float_flag_invalid, status);
4603         return float128_default_nan(status);
4604     }
4605     aSig = extractFloatx80Frac( a );
4606     aExp = extractFloatx80Exp( a );
4607     aSign = extractFloatx80Sign( a );
4608     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4609         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4610     }
4611     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4612     return packFloat128( aSign, aExp, zSig0, zSig1 );
4613
4614 }
4615
4616 /*----------------------------------------------------------------------------
4617 | Rounds the extended double-precision floating-point value `a'
4618 | to the precision provided by floatx80_rounding_precision and returns the
4619 | result as an extended double-precision floating-point value.
4620 | The operation is performed according to the IEC/IEEE Standard for Binary
4621 | Floating-Point Arithmetic.
4622 *----------------------------------------------------------------------------*/
4623
4624 floatx80 floatx80_round(floatx80 a, float_status *status)
4625 {
4626     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4627                                 extractFloatx80Sign(a),
4628                                 extractFloatx80Exp(a),
4629                                 extractFloatx80Frac(a), 0, status);
4630 }
4631
4632 /*----------------------------------------------------------------------------
4633 | Rounds the extended double-precision floating-point value `a' to an integer,
4634 | and returns the result as an extended quadruple-precision floating-point
4635 | value.  The operation is performed according to the IEC/IEEE Standard for
4636 | Binary Floating-Point Arithmetic.
4637 *----------------------------------------------------------------------------*/
4638
4639 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4640 {
4641     flag aSign;
4642     int32_t aExp;
4643     uint64_t lastBitMask, roundBitsMask;
4644     floatx80 z;
4645
4646     if (floatx80_invalid_encoding(a)) {
4647         float_raise(float_flag_invalid, status);
4648         return floatx80_default_nan(status);
4649     }
4650     aExp = extractFloatx80Exp( a );
4651     if ( 0x403E <= aExp ) {
4652         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4653             return propagateFloatx80NaN(a, a, status);
4654         }
4655         return a;
4656     }
4657     if ( aExp < 0x3FFF ) {
4658         if (    ( aExp == 0 )
4659              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4660             return a;
4661         }
4662         status->float_exception_flags |= float_flag_inexact;
4663         aSign = extractFloatx80Sign( a );
4664         switch (status->float_rounding_mode) {
4665          case float_round_nearest_even:
4666             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4667                ) {
4668                 return
4669                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4670             }
4671             break;
4672         case float_round_ties_away:
4673             if (aExp == 0x3FFE) {
4674                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4675             }
4676             break;
4677          case float_round_down:
4678             return
4679                   aSign ?
4680                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4681                 : packFloatx80( 0, 0, 0 );
4682          case float_round_up:
4683             return
4684                   aSign ? packFloatx80( 1, 0, 0 )
4685                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4686         }
4687         return packFloatx80( aSign, 0, 0 );
4688     }
4689     lastBitMask = 1;
4690     lastBitMask <<= 0x403E - aExp;
4691     roundBitsMask = lastBitMask - 1;
4692     z = a;
4693     switch (status->float_rounding_mode) {
4694     case float_round_nearest_even:
4695         z.low += lastBitMask>>1;
4696         if ((z.low & roundBitsMask) == 0) {
4697             z.low &= ~lastBitMask;
4698         }
4699         break;
4700     case float_round_ties_away:
4701         z.low += lastBitMask >> 1;
4702         break;
4703     case float_round_to_zero:
4704         break;
4705     case float_round_up:
4706         if (!extractFloatx80Sign(z)) {
4707             z.low += roundBitsMask;
4708         }
4709         break;
4710     case float_round_down:
4711         if (extractFloatx80Sign(z)) {
4712             z.low += roundBitsMask;
4713         }
4714         break;
4715     default:
4716         abort();
4717     }
4718     z.low &= ~ roundBitsMask;
4719     if ( z.low == 0 ) {
4720         ++z.high;
4721         z.low = LIT64( 0x8000000000000000 );
4722     }
4723     if (z.low != a.low) {
4724         status->float_exception_flags |= float_flag_inexact;
4725     }
4726     return z;
4727
4728 }
4729
4730 /*----------------------------------------------------------------------------
4731 | Returns the result of adding the absolute values of the extended double-
4732 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4733 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4734 | The addition is performed according to the IEC/IEEE Standard for Binary
4735 | Floating-Point Arithmetic.
4736 *----------------------------------------------------------------------------*/
4737
4738 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4739                                 float_status *status)
4740 {
4741     int32_t aExp, bExp, zExp;
4742     uint64_t aSig, bSig, zSig0, zSig1;
4743     int32_t expDiff;
4744
4745     aSig = extractFloatx80Frac( a );
4746     aExp = extractFloatx80Exp( a );
4747     bSig = extractFloatx80Frac( b );
4748     bExp = extractFloatx80Exp( b );
4749     expDiff = aExp - bExp;
4750     if ( 0 < expDiff ) {
4751         if ( aExp == 0x7FFF ) {
4752             if ((uint64_t)(aSig << 1)) {
4753                 return propagateFloatx80NaN(a, b, status);
4754             }
4755             return a;
4756         }
4757         if ( bExp == 0 ) --expDiff;
4758         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4759         zExp = aExp;
4760     }
4761     else if ( expDiff < 0 ) {
4762         if ( bExp == 0x7FFF ) {
4763             if ((uint64_t)(bSig << 1)) {
4764                 return propagateFloatx80NaN(a, b, status);
4765             }
4766             return packFloatx80(zSign,
4767                                 floatx80_infinity_high,
4768                                 floatx80_infinity_low);
4769         }
4770         if ( aExp == 0 ) ++expDiff;
4771         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4772         zExp = bExp;
4773     }
4774     else {
4775         if ( aExp == 0x7FFF ) {
4776             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4777                 return propagateFloatx80NaN(a, b, status);
4778             }
4779             return a;
4780         }
4781         zSig1 = 0;
4782         zSig0 = aSig + bSig;
4783         if ( aExp == 0 ) {
4784             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4785             goto roundAndPack;
4786         }
4787         zExp = aExp;
4788         goto shiftRight1;
4789     }
4790     zSig0 = aSig + bSig;
4791     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4792  shiftRight1:
4793     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4794     zSig0 |= LIT64( 0x8000000000000000 );
4795     ++zExp;
4796  roundAndPack:
4797     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4798                                 zSign, zExp, zSig0, zSig1, status);
4799 }
4800
4801 /*----------------------------------------------------------------------------
4802 | Returns the result of subtracting the absolute values of the extended
4803 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4804 | difference is negated before being returned.  `zSign' is ignored if the
4805 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4806 | Standard for Binary Floating-Point Arithmetic.
4807 *----------------------------------------------------------------------------*/
4808
4809 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4810                                 float_status *status)
4811 {
4812     int32_t aExp, bExp, zExp;
4813     uint64_t aSig, bSig, zSig0, zSig1;
4814     int32_t expDiff;
4815
4816     aSig = extractFloatx80Frac( a );
4817     aExp = extractFloatx80Exp( a );
4818     bSig = extractFloatx80Frac( b );
4819     bExp = extractFloatx80Exp( b );
4820     expDiff = aExp - bExp;
4821     if ( 0 < expDiff ) goto aExpBigger;
4822     if ( expDiff < 0 ) goto bExpBigger;
4823     if ( aExp == 0x7FFF ) {
4824         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4825             return propagateFloatx80NaN(a, b, status);
4826         }
4827         float_raise(float_flag_invalid, status);
4828         return floatx80_default_nan(status);
4829     }
4830     if ( aExp == 0 ) {
4831         aExp = 1;
4832         bExp = 1;
4833     }
4834     zSig1 = 0;
4835     if ( bSig < aSig ) goto aBigger;
4836     if ( aSig < bSig ) goto bBigger;
4837     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4838  bExpBigger:
4839     if ( bExp == 0x7FFF ) {
4840         if ((uint64_t)(bSig << 1)) {
4841             return propagateFloatx80NaN(a, b, status);
4842         }
4843         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4844                             floatx80_infinity_low);
4845     }
4846     if ( aExp == 0 ) ++expDiff;
4847     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4848  bBigger:
4849     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4850     zExp = bExp;
4851     zSign ^= 1;
4852     goto normalizeRoundAndPack;
4853  aExpBigger:
4854     if ( aExp == 0x7FFF ) {
4855         if ((uint64_t)(aSig << 1)) {
4856             return propagateFloatx80NaN(a, b, status);
4857         }
4858         return a;
4859     }
4860     if ( bExp == 0 ) --expDiff;
4861     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4862  aBigger:
4863     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4864     zExp = aExp;
4865  normalizeRoundAndPack:
4866     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4867                                          zSign, zExp, zSig0, zSig1, status);
4868 }
4869
4870 /*----------------------------------------------------------------------------
4871 | Returns the result of adding the extended double-precision floating-point
4872 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4873 | Standard for Binary Floating-Point Arithmetic.
4874 *----------------------------------------------------------------------------*/
4875
4876 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4877 {
4878     flag aSign, bSign;
4879
4880     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4881         float_raise(float_flag_invalid, status);
4882         return floatx80_default_nan(status);
4883     }
4884     aSign = extractFloatx80Sign( a );
4885     bSign = extractFloatx80Sign( b );
4886     if ( aSign == bSign ) {
4887         return addFloatx80Sigs(a, b, aSign, status);
4888     }
4889     else {
4890         return subFloatx80Sigs(a, b, aSign, status);
4891     }
4892
4893 }
4894
4895 /*----------------------------------------------------------------------------
4896 | Returns the result of subtracting the extended double-precision floating-
4897 | point values `a' and `b'.  The operation is performed according to the
4898 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4899 *----------------------------------------------------------------------------*/
4900
4901 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
4902 {
4903     flag aSign, bSign;
4904
4905     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4906         float_raise(float_flag_invalid, status);
4907         return floatx80_default_nan(status);
4908     }
4909     aSign = extractFloatx80Sign( a );
4910     bSign = extractFloatx80Sign( b );
4911     if ( aSign == bSign ) {
4912         return subFloatx80Sigs(a, b, aSign, status);
4913     }
4914     else {
4915         return addFloatx80Sigs(a, b, aSign, status);
4916     }
4917
4918 }
4919
4920 /*----------------------------------------------------------------------------
4921 | Returns the result of multiplying the extended double-precision floating-
4922 | point values `a' and `b'.  The operation is performed according to the
4923 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4924 *----------------------------------------------------------------------------*/
4925
4926 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
4927 {
4928     flag aSign, bSign, zSign;
4929     int32_t aExp, bExp, zExp;
4930     uint64_t aSig, bSig, zSig0, zSig1;
4931
4932     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4933         float_raise(float_flag_invalid, status);
4934         return floatx80_default_nan(status);
4935     }
4936     aSig = extractFloatx80Frac( a );
4937     aExp = extractFloatx80Exp( a );
4938     aSign = extractFloatx80Sign( a );
4939     bSig = extractFloatx80Frac( b );
4940     bExp = extractFloatx80Exp( b );
4941     bSign = extractFloatx80Sign( b );
4942     zSign = aSign ^ bSign;
4943     if ( aExp == 0x7FFF ) {
4944         if (    (uint64_t) ( aSig<<1 )
4945              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4946             return propagateFloatx80NaN(a, b, status);
4947         }
4948         if ( ( bExp | bSig ) == 0 ) goto invalid;
4949         return packFloatx80(zSign, floatx80_infinity_high,
4950                                    floatx80_infinity_low);
4951     }
4952     if ( bExp == 0x7FFF ) {
4953         if ((uint64_t)(bSig << 1)) {
4954             return propagateFloatx80NaN(a, b, status);
4955         }
4956         if ( ( aExp | aSig ) == 0 ) {
4957  invalid:
4958             float_raise(float_flag_invalid, status);
4959             return floatx80_default_nan(status);
4960         }
4961         return packFloatx80(zSign, floatx80_infinity_high,
4962                                    floatx80_infinity_low);
4963     }
4964     if ( aExp == 0 ) {
4965         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4966         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4967     }
4968     if ( bExp == 0 ) {
4969         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4970         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4971     }
4972     zExp = aExp + bExp - 0x3FFE;
4973     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4974     if ( 0 < (int64_t) zSig0 ) {
4975         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4976         --zExp;
4977     }
4978     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4979                                 zSign, zExp, zSig0, zSig1, status);
4980 }
4981
4982 /*----------------------------------------------------------------------------
4983 | Returns the result of dividing the extended double-precision floating-point
4984 | value `a' by the corresponding value `b'.  The operation is performed
4985 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4986 *----------------------------------------------------------------------------*/
4987
4988 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
4989 {
4990     flag aSign, bSign, zSign;
4991     int32_t aExp, bExp, zExp;
4992     uint64_t aSig, bSig, zSig0, zSig1;
4993     uint64_t rem0, rem1, rem2, term0, term1, term2;
4994
4995     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4996         float_raise(float_flag_invalid, status);
4997         return floatx80_default_nan(status);
4998     }
4999     aSig = extractFloatx80Frac( a );
5000     aExp = extractFloatx80Exp( a );
5001     aSign = extractFloatx80Sign( a );
5002     bSig = extractFloatx80Frac( b );
5003     bExp = extractFloatx80Exp( b );
5004     bSign = extractFloatx80Sign( b );
5005     zSign = aSign ^ bSign;
5006     if ( aExp == 0x7FFF ) {
5007         if ((uint64_t)(aSig << 1)) {
5008             return propagateFloatx80NaN(a, b, status);
5009         }
5010         if ( bExp == 0x7FFF ) {
5011             if ((uint64_t)(bSig << 1)) {
5012                 return propagateFloatx80NaN(a, b, status);
5013             }
5014             goto invalid;
5015         }
5016         return packFloatx80(zSign, floatx80_infinity_high,
5017                                    floatx80_infinity_low);
5018     }
5019     if ( bExp == 0x7FFF ) {
5020         if ((uint64_t)(bSig << 1)) {
5021             return propagateFloatx80NaN(a, b, status);
5022         }
5023         return packFloatx80( zSign, 0, 0 );
5024     }
5025     if ( bExp == 0 ) {
5026         if ( bSig == 0 ) {
5027             if ( ( aExp | aSig ) == 0 ) {
5028  invalid:
5029                 float_raise(float_flag_invalid, status);
5030                 return floatx80_default_nan(status);
5031             }
5032             float_raise(float_flag_divbyzero, status);
5033             return packFloatx80(zSign, floatx80_infinity_high,
5034                                        floatx80_infinity_low);
5035         }
5036         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5037     }
5038     if ( aExp == 0 ) {
5039         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5040         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5041     }
5042     zExp = aExp - bExp + 0x3FFE;
5043     rem1 = 0;
5044     if ( bSig <= aSig ) {
5045         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5046         ++zExp;
5047     }
5048     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5049     mul64To128( bSig, zSig0, &term0, &term1 );
5050     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5051     while ( (int64_t) rem0 < 0 ) {
5052         --zSig0;
5053         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5054     }
5055     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5056     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5057         mul64To128( bSig, zSig1, &term1, &term2 );
5058         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5059         while ( (int64_t) rem1 < 0 ) {
5060             --zSig1;
5061             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5062         }
5063         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5064     }
5065     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5066                                 zSign, zExp, zSig0, zSig1, status);
5067 }
5068
5069 /*----------------------------------------------------------------------------
5070 | Returns the remainder of the extended double-precision floating-point value
5071 | `a' with respect to the corresponding value `b'.  The operation is performed
5072 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5073 *----------------------------------------------------------------------------*/
5074
5075 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5076 {
5077     flag aSign, zSign;
5078     int32_t aExp, bExp, expDiff;
5079     uint64_t aSig0, aSig1, bSig;
5080     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5081
5082     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5083         float_raise(float_flag_invalid, status);
5084         return floatx80_default_nan(status);
5085     }
5086     aSig0 = extractFloatx80Frac( a );
5087     aExp = extractFloatx80Exp( a );
5088     aSign = extractFloatx80Sign( a );
5089     bSig = extractFloatx80Frac( b );
5090     bExp = extractFloatx80Exp( b );
5091     if ( aExp == 0x7FFF ) {
5092         if (    (uint64_t) ( aSig0<<1 )
5093              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5094             return propagateFloatx80NaN(a, b, status);
5095         }
5096         goto invalid;
5097     }
5098     if ( bExp == 0x7FFF ) {
5099         if ((uint64_t)(bSig << 1)) {
5100             return propagateFloatx80NaN(a, b, status);
5101         }
5102         return a;
5103     }
5104     if ( bExp == 0 ) {
5105         if ( bSig == 0 ) {
5106  invalid:
5107             float_raise(float_flag_invalid, status);
5108             return floatx80_default_nan(status);
5109         }
5110         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5111     }
5112     if ( aExp == 0 ) {
5113         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5114         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5115     }
5116     bSig |= LIT64( 0x8000000000000000 );
5117     zSign = aSign;
5118     expDiff = aExp - bExp;
5119     aSig1 = 0;
5120     if ( expDiff < 0 ) {
5121         if ( expDiff < -1 ) return a;
5122         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5123         expDiff = 0;
5124     }
5125     q = ( bSig <= aSig0 );
5126     if ( q ) aSig0 -= bSig;
5127     expDiff -= 64;
5128     while ( 0 < expDiff ) {
5129         q = estimateDiv128To64( aSig0, aSig1, bSig );
5130         q = ( 2 < q ) ? q - 2 : 0;
5131         mul64To128( bSig, q, &term0, &term1 );
5132         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5133         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5134         expDiff -= 62;
5135     }
5136     expDiff += 64;
5137     if ( 0 < expDiff ) {
5138         q = estimateDiv128To64( aSig0, aSig1, bSig );
5139         q = ( 2 < q ) ? q - 2 : 0;
5140         q >>= 64 - expDiff;
5141         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5142         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5143         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5144         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5145             ++q;
5146             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5147         }
5148     }
5149     else {
5150         term1 = 0;
5151         term0 = bSig;
5152     }
5153     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5154     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5155          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5156               && ( q & 1 ) )
5157        ) {
5158         aSig0 = alternateASig0;
5159         aSig1 = alternateASig1;
5160         zSign = ! zSign;
5161     }
5162     return
5163         normalizeRoundAndPackFloatx80(
5164             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5165
5166 }
5167
5168 /*----------------------------------------------------------------------------
5169 | Returns the square root of the extended double-precision floating-point
5170 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5171 | for Binary Floating-Point Arithmetic.
5172 *----------------------------------------------------------------------------*/
5173
5174 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5175 {
5176     flag aSign;
5177     int32_t aExp, zExp;
5178     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5179     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5180
5181     if (floatx80_invalid_encoding(a)) {
5182         float_raise(float_flag_invalid, status);
5183         return floatx80_default_nan(status);
5184     }
5185     aSig0 = extractFloatx80Frac( a );
5186     aExp = extractFloatx80Exp( a );
5187     aSign = extractFloatx80Sign( a );
5188     if ( aExp == 0x7FFF ) {
5189         if ((uint64_t)(aSig0 << 1)) {
5190             return propagateFloatx80NaN(a, a, status);
5191         }
5192         if ( ! aSign ) return a;
5193         goto invalid;
5194     }
5195     if ( aSign ) {
5196         if ( ( aExp | aSig0 ) == 0 ) return a;
5197  invalid:
5198         float_raise(float_flag_invalid, status);
5199         return floatx80_default_nan(status);
5200     }
5201     if ( aExp == 0 ) {
5202         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5203         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5204     }
5205     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5206     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5207     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5208     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5209     doubleZSig0 = zSig0<<1;
5210     mul64To128( zSig0, zSig0, &term0, &term1 );
5211     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5212     while ( (int64_t) rem0 < 0 ) {
5213         --zSig0;
5214         doubleZSig0 -= 2;
5215         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5216     }
5217     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5218     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5219         if ( zSig1 == 0 ) zSig1 = 1;
5220         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5221         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5222         mul64To128( zSig1, zSig1, &term2, &term3 );
5223         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5224         while ( (int64_t) rem1 < 0 ) {
5225             --zSig1;
5226             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5227             term3 |= 1;
5228             term2 |= doubleZSig0;
5229             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5230         }
5231         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5232     }
5233     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5234     zSig0 |= doubleZSig0;
5235     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5236                                 0, zExp, zSig0, zSig1, status);
5237 }
5238
5239 /*----------------------------------------------------------------------------
5240 | Returns 1 if the extended double-precision floating-point value `a' is equal
5241 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5242 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5243 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5244 *----------------------------------------------------------------------------*/
5245
5246 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5247 {
5248
5249     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5250         || (extractFloatx80Exp(a) == 0x7FFF
5251             && (uint64_t) (extractFloatx80Frac(a) << 1))
5252         || (extractFloatx80Exp(b) == 0x7FFF
5253             && (uint64_t) (extractFloatx80Frac(b) << 1))
5254        ) {
5255         float_raise(float_flag_invalid, status);
5256         return 0;
5257     }
5258     return
5259            ( a.low == b.low )
5260         && (    ( a.high == b.high )
5261              || (    ( a.low == 0 )
5262                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5263            );
5264
5265 }
5266
5267 /*----------------------------------------------------------------------------
5268 | Returns 1 if the extended double-precision floating-point value `a' is
5269 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5270 | invalid exception is raised if either operand is a NaN.  The comparison is
5271 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5272 | Arithmetic.
5273 *----------------------------------------------------------------------------*/
5274
5275 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5276 {
5277     flag aSign, bSign;
5278
5279     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5280         || (extractFloatx80Exp(a) == 0x7FFF
5281             && (uint64_t) (extractFloatx80Frac(a) << 1))
5282         || (extractFloatx80Exp(b) == 0x7FFF
5283             && (uint64_t) (extractFloatx80Frac(b) << 1))
5284        ) {
5285         float_raise(float_flag_invalid, status);
5286         return 0;
5287     }
5288     aSign = extractFloatx80Sign( a );
5289     bSign = extractFloatx80Sign( b );
5290     if ( aSign != bSign ) {
5291         return
5292                aSign
5293             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5294                  == 0 );
5295     }
5296     return
5297           aSign ? le128( b.high, b.low, a.high, a.low )
5298         : le128( a.high, a.low, b.high, b.low );
5299
5300 }
5301
5302 /*----------------------------------------------------------------------------
5303 | Returns 1 if the extended double-precision floating-point value `a' is
5304 | less than the corresponding value `b', and 0 otherwise.  The invalid
5305 | exception is raised if either operand is a NaN.  The comparison is performed
5306 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5307 *----------------------------------------------------------------------------*/
5308
5309 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5310 {
5311     flag aSign, bSign;
5312
5313     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5314         || (extractFloatx80Exp(a) == 0x7FFF
5315             && (uint64_t) (extractFloatx80Frac(a) << 1))
5316         || (extractFloatx80Exp(b) == 0x7FFF
5317             && (uint64_t) (extractFloatx80Frac(b) << 1))
5318        ) {
5319         float_raise(float_flag_invalid, status);
5320         return 0;
5321     }
5322     aSign = extractFloatx80Sign( a );
5323     bSign = extractFloatx80Sign( b );
5324     if ( aSign != bSign ) {
5325         return
5326                aSign
5327             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5328                  != 0 );
5329     }
5330     return
5331           aSign ? lt128( b.high, b.low, a.high, a.low )
5332         : lt128( a.high, a.low, b.high, b.low );
5333
5334 }
5335
5336 /*----------------------------------------------------------------------------
5337 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5338 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5339 | either operand is a NaN.   The comparison is performed according to the
5340 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5341 *----------------------------------------------------------------------------*/
5342 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5343 {
5344     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5345         || (extractFloatx80Exp(a) == 0x7FFF
5346             && (uint64_t) (extractFloatx80Frac(a) << 1))
5347         || (extractFloatx80Exp(b) == 0x7FFF
5348             && (uint64_t) (extractFloatx80Frac(b) << 1))
5349        ) {
5350         float_raise(float_flag_invalid, status);
5351         return 1;
5352     }
5353     return 0;
5354 }
5355
5356 /*----------------------------------------------------------------------------
5357 | Returns 1 if the extended double-precision floating-point value `a' is
5358 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5359 | cause an exception.  The comparison is performed according to the IEC/IEEE
5360 | Standard for Binary Floating-Point Arithmetic.
5361 *----------------------------------------------------------------------------*/
5362
5363 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5364 {
5365
5366     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5367         float_raise(float_flag_invalid, status);
5368         return 0;
5369     }
5370     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5371               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5372          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5373               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5374        ) {
5375         if (floatx80_is_signaling_nan(a, status)
5376          || floatx80_is_signaling_nan(b, status)) {
5377             float_raise(float_flag_invalid, status);
5378         }
5379         return 0;
5380     }
5381     return
5382            ( a.low == b.low )
5383         && (    ( a.high == b.high )
5384              || (    ( a.low == 0 )
5385                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5386            );
5387
5388 }
5389
5390 /*----------------------------------------------------------------------------
5391 | Returns 1 if the extended double-precision floating-point value `a' is less
5392 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5393 | do not cause an exception.  Otherwise, the comparison is performed according
5394 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5395 *----------------------------------------------------------------------------*/
5396
5397 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5398 {
5399     flag aSign, bSign;
5400
5401     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5402         float_raise(float_flag_invalid, status);
5403         return 0;
5404     }
5405     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5406               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5407          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5408               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5409        ) {
5410         if (floatx80_is_signaling_nan(a, status)
5411          || floatx80_is_signaling_nan(b, status)) {
5412             float_raise(float_flag_invalid, status);
5413         }
5414         return 0;
5415     }
5416     aSign = extractFloatx80Sign( a );
5417     bSign = extractFloatx80Sign( b );
5418     if ( aSign != bSign ) {
5419         return
5420                aSign
5421             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5422                  == 0 );
5423     }
5424     return
5425           aSign ? le128( b.high, b.low, a.high, a.low )
5426         : le128( a.high, a.low, b.high, b.low );
5427
5428 }
5429
5430 /*----------------------------------------------------------------------------
5431 | Returns 1 if the extended double-precision floating-point value `a' is less
5432 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5433 | an exception.  Otherwise, the comparison is performed according to the
5434 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5435 *----------------------------------------------------------------------------*/
5436
5437 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5438 {
5439     flag aSign, bSign;
5440
5441     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5442         float_raise(float_flag_invalid, status);
5443         return 0;
5444     }
5445     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5446               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5447          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5448               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5449        ) {
5450         if (floatx80_is_signaling_nan(a, status)
5451          || floatx80_is_signaling_nan(b, status)) {
5452             float_raise(float_flag_invalid, status);
5453         }
5454         return 0;
5455     }
5456     aSign = extractFloatx80Sign( a );
5457     bSign = extractFloatx80Sign( b );
5458     if ( aSign != bSign ) {
5459         return
5460                aSign
5461             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5462                  != 0 );
5463     }
5464     return
5465           aSign ? lt128( b.high, b.low, a.high, a.low )
5466         : lt128( a.high, a.low, b.high, b.low );
5467
5468 }
5469
5470 /*----------------------------------------------------------------------------
5471 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5472 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5473 | The comparison is performed according to the IEC/IEEE Standard for Binary
5474 | Floating-Point Arithmetic.
5475 *----------------------------------------------------------------------------*/
5476 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5477 {
5478     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5479         float_raise(float_flag_invalid, status);
5480         return 1;
5481     }
5482     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5483               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5484          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5485               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5486        ) {
5487         if (floatx80_is_signaling_nan(a, status)
5488          || floatx80_is_signaling_nan(b, status)) {
5489             float_raise(float_flag_invalid, status);
5490         }
5491         return 1;
5492     }
5493     return 0;
5494 }
5495
5496 /*----------------------------------------------------------------------------
5497 | Returns the result of converting the quadruple-precision floating-point
5498 | value `a' to the 32-bit two's complement integer format.  The conversion
5499 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5500 | Arithmetic---which means in particular that the conversion is rounded
5501 | according to the current rounding mode.  If `a' is a NaN, the largest
5502 | positive integer is returned.  Otherwise, if the conversion overflows, the
5503 | largest integer with the same sign as `a' is returned.
5504 *----------------------------------------------------------------------------*/
5505
5506 int32_t float128_to_int32(float128 a, float_status *status)
5507 {
5508     flag aSign;
5509     int32_t aExp, shiftCount;
5510     uint64_t aSig0, aSig1;
5511
5512     aSig1 = extractFloat128Frac1( a );
5513     aSig0 = extractFloat128Frac0( a );
5514     aExp = extractFloat128Exp( a );
5515     aSign = extractFloat128Sign( a );
5516     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5517     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5518     aSig0 |= ( aSig1 != 0 );
5519     shiftCount = 0x4028 - aExp;
5520     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5521     return roundAndPackInt32(aSign, aSig0, status);
5522
5523 }
5524
5525 /*----------------------------------------------------------------------------
5526 | Returns the result of converting the quadruple-precision floating-point
5527 | value `a' to the 32-bit two's complement integer format.  The conversion
5528 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5529 | Arithmetic, except that the conversion is always rounded toward zero.  If
5530 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5531 | conversion overflows, the largest integer with the same sign as `a' is
5532 | returned.
5533 *----------------------------------------------------------------------------*/
5534
5535 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5536 {
5537     flag aSign;
5538     int32_t aExp, shiftCount;
5539     uint64_t aSig0, aSig1, savedASig;
5540     int32_t z;
5541
5542     aSig1 = extractFloat128Frac1( a );
5543     aSig0 = extractFloat128Frac0( a );
5544     aExp = extractFloat128Exp( a );
5545     aSign = extractFloat128Sign( a );
5546     aSig0 |= ( aSig1 != 0 );
5547     if ( 0x401E < aExp ) {
5548         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5549         goto invalid;
5550     }
5551     else if ( aExp < 0x3FFF ) {
5552         if (aExp || aSig0) {
5553             status->float_exception_flags |= float_flag_inexact;
5554         }
5555         return 0;
5556     }
5557     aSig0 |= LIT64( 0x0001000000000000 );
5558     shiftCount = 0x402F - aExp;
5559     savedASig = aSig0;
5560     aSig0 >>= shiftCount;
5561     z = aSig0;
5562     if ( aSign ) z = - z;
5563     if ( ( z < 0 ) ^ aSign ) {
5564  invalid:
5565         float_raise(float_flag_invalid, status);
5566         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5567     }
5568     if ( ( aSig0<<shiftCount ) != savedASig ) {
5569         status->float_exception_flags |= float_flag_inexact;
5570     }
5571     return z;
5572
5573 }
5574
5575 /*----------------------------------------------------------------------------
5576 | Returns the result of converting the quadruple-precision floating-point
5577 | value `a' to the 64-bit two's complement integer format.  The conversion
5578 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5579 | Arithmetic---which means in particular that the conversion is rounded
5580 | according to the current rounding mode.  If `a' is a NaN, the largest
5581 | positive integer is returned.  Otherwise, if the conversion overflows, the
5582 | largest integer with the same sign as `a' is returned.
5583 *----------------------------------------------------------------------------*/
5584
5585 int64_t float128_to_int64(float128 a, float_status *status)
5586 {
5587     flag aSign;
5588     int32_t aExp, shiftCount;
5589     uint64_t aSig0, aSig1;
5590
5591     aSig1 = extractFloat128Frac1( a );
5592     aSig0 = extractFloat128Frac0( a );
5593     aExp = extractFloat128Exp( a );
5594     aSign = extractFloat128Sign( a );
5595     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5596     shiftCount = 0x402F - aExp;
5597     if ( shiftCount <= 0 ) {
5598         if ( 0x403E < aExp ) {
5599             float_raise(float_flag_invalid, status);
5600             if (    ! aSign
5601                  || (    ( aExp == 0x7FFF )
5602                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5603                     )
5604                ) {
5605                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5606             }
5607             return (int64_t) LIT64( 0x8000000000000000 );
5608         }
5609         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5610     }
5611     else {
5612         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5613     }
5614     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5615
5616 }
5617
5618 /*----------------------------------------------------------------------------
5619 | Returns the result of converting the quadruple-precision floating-point
5620 | value `a' to the 64-bit two's complement integer format.  The conversion
5621 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5622 | Arithmetic, except that the conversion is always rounded toward zero.
5623 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5624 | the conversion overflows, the largest integer with the same sign as `a' is
5625 | returned.
5626 *----------------------------------------------------------------------------*/
5627
5628 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5629 {
5630     flag aSign;
5631     int32_t aExp, shiftCount;
5632     uint64_t aSig0, aSig1;
5633     int64_t z;
5634
5635     aSig1 = extractFloat128Frac1( a );
5636     aSig0 = extractFloat128Frac0( a );
5637     aExp = extractFloat128Exp( a );
5638     aSign = extractFloat128Sign( a );
5639     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5640     shiftCount = aExp - 0x402F;
5641     if ( 0 < shiftCount ) {
5642         if ( 0x403E <= aExp ) {
5643             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5644             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5645                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5646                 if (aSig1) {
5647                     status->float_exception_flags |= float_flag_inexact;
5648                 }
5649             }
5650             else {
5651                 float_raise(float_flag_invalid, status);
5652                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5653                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5654                 }
5655             }
5656             return (int64_t) LIT64( 0x8000000000000000 );
5657         }
5658         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5659         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5660             status->float_exception_flags |= float_flag_inexact;
5661         }
5662     }
5663     else {
5664         if ( aExp < 0x3FFF ) {
5665             if ( aExp | aSig0 | aSig1 ) {
5666                 status->float_exception_flags |= float_flag_inexact;
5667             }
5668             return 0;
5669         }
5670         z = aSig0>>( - shiftCount );
5671         if (    aSig1
5672              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5673             status->float_exception_flags |= float_flag_inexact;
5674         }
5675     }
5676     if ( aSign ) z = - z;
5677     return z;
5678
5679 }
5680
5681 /*----------------------------------------------------------------------------
5682 | Returns the result of converting the quadruple-precision floating-point value
5683 | `a' to the 64-bit unsigned integer format.  The conversion is
5684 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5685 | Arithmetic---which means in particular that the conversion is rounded
5686 | according to the current rounding mode.  If `a' is a NaN, the largest
5687 | positive integer is returned.  If the conversion overflows, the
5688 | largest unsigned integer is returned.  If 'a' is negative, the value is
5689 | rounded and zero is returned; negative values that do not round to zero
5690 | will raise the inexact exception.
5691 *----------------------------------------------------------------------------*/
5692
5693 uint64_t float128_to_uint64(float128 a, float_status *status)
5694 {
5695     flag aSign;
5696     int aExp;
5697     int shiftCount;
5698     uint64_t aSig0, aSig1;
5699
5700     aSig0 = extractFloat128Frac0(a);
5701     aSig1 = extractFloat128Frac1(a);
5702     aExp = extractFloat128Exp(a);
5703     aSign = extractFloat128Sign(a);
5704     if (aSign && (aExp > 0x3FFE)) {
5705         float_raise(float_flag_invalid, status);
5706         if (float128_is_any_nan(a)) {
5707             return LIT64(0xFFFFFFFFFFFFFFFF);
5708         } else {
5709             return 0;
5710         }
5711     }
5712     if (aExp) {
5713         aSig0 |= LIT64(0x0001000000000000);
5714     }
5715     shiftCount = 0x402F - aExp;
5716     if (shiftCount <= 0) {
5717         if (0x403E < aExp) {
5718             float_raise(float_flag_invalid, status);
5719             return LIT64(0xFFFFFFFFFFFFFFFF);
5720         }
5721         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5722     } else {
5723         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5724     }
5725     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5726 }
5727
5728 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5729 {
5730     uint64_t v;
5731     signed char current_rounding_mode = status->float_rounding_mode;
5732
5733     set_float_rounding_mode(float_round_to_zero, status);
5734     v = float128_to_uint64(a, status);
5735     set_float_rounding_mode(current_rounding_mode, status);
5736
5737     return v;
5738 }
5739
5740 /*----------------------------------------------------------------------------
5741 | Returns the result of converting the quadruple-precision floating-point
5742 | value `a' to the 32-bit unsigned integer format.  The conversion
5743 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5744 | Arithmetic except that the conversion is always rounded toward zero.
5745 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5746 | if the conversion overflows, the largest unsigned integer is returned.
5747 | If 'a' is negative, the value is rounded and zero is returned; negative
5748 | values that do not round to zero will raise the inexact exception.
5749 *----------------------------------------------------------------------------*/
5750
5751 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5752 {
5753     uint64_t v;
5754     uint32_t res;
5755     int old_exc_flags = get_float_exception_flags(status);
5756
5757     v = float128_to_uint64_round_to_zero(a, status);
5758     if (v > 0xffffffff) {
5759         res = 0xffffffff;
5760     } else {
5761         return v;
5762     }
5763     set_float_exception_flags(old_exc_flags, status);
5764     float_raise(float_flag_invalid, status);
5765     return res;
5766 }
5767
5768 /*----------------------------------------------------------------------------
5769 | Returns the result of converting the quadruple-precision floating-point
5770 | value `a' to the single-precision floating-point format.  The conversion
5771 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5772 | Arithmetic.
5773 *----------------------------------------------------------------------------*/
5774
5775 float32 float128_to_float32(float128 a, float_status *status)
5776 {
5777     flag aSign;
5778     int32_t aExp;
5779     uint64_t aSig0, aSig1;
5780     uint32_t zSig;
5781
5782     aSig1 = extractFloat128Frac1( a );
5783     aSig0 = extractFloat128Frac0( a );
5784     aExp = extractFloat128Exp( a );
5785     aSign = extractFloat128Sign( a );
5786     if ( aExp == 0x7FFF ) {
5787         if ( aSig0 | aSig1 ) {
5788             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5789         }
5790         return packFloat32( aSign, 0xFF, 0 );
5791     }
5792     aSig0 |= ( aSig1 != 0 );
5793     shift64RightJamming( aSig0, 18, &aSig0 );
5794     zSig = aSig0;
5795     if ( aExp || zSig ) {
5796         zSig |= 0x40000000;
5797         aExp -= 0x3F81;
5798     }
5799     return roundAndPackFloat32(aSign, aExp, zSig, status);
5800
5801 }
5802
5803 /*----------------------------------------------------------------------------
5804 | Returns the result of converting the quadruple-precision floating-point
5805 | value `a' to the double-precision floating-point format.  The conversion
5806 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5807 | Arithmetic.
5808 *----------------------------------------------------------------------------*/
5809
5810 float64 float128_to_float64(float128 a, float_status *status)
5811 {
5812     flag aSign;
5813     int32_t aExp;
5814     uint64_t aSig0, aSig1;
5815
5816     aSig1 = extractFloat128Frac1( a );
5817     aSig0 = extractFloat128Frac0( a );
5818     aExp = extractFloat128Exp( a );
5819     aSign = extractFloat128Sign( a );
5820     if ( aExp == 0x7FFF ) {
5821         if ( aSig0 | aSig1 ) {
5822             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5823         }
5824         return packFloat64( aSign, 0x7FF, 0 );
5825     }
5826     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5827     aSig0 |= ( aSig1 != 0 );
5828     if ( aExp || aSig0 ) {
5829         aSig0 |= LIT64( 0x4000000000000000 );
5830         aExp -= 0x3C01;
5831     }
5832     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5833
5834 }
5835
5836 /*----------------------------------------------------------------------------
5837 | Returns the result of converting the quadruple-precision floating-point
5838 | value `a' to the extended double-precision floating-point format.  The
5839 | conversion is performed according to the IEC/IEEE Standard for Binary
5840 | Floating-Point Arithmetic.
5841 *----------------------------------------------------------------------------*/
5842
5843 floatx80 float128_to_floatx80(float128 a, float_status *status)
5844 {
5845     flag aSign;
5846     int32_t aExp;
5847     uint64_t aSig0, aSig1;
5848
5849     aSig1 = extractFloat128Frac1( a );
5850     aSig0 = extractFloat128Frac0( a );
5851     aExp = extractFloat128Exp( a );
5852     aSign = extractFloat128Sign( a );
5853     if ( aExp == 0x7FFF ) {
5854         if ( aSig0 | aSig1 ) {
5855             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5856         }
5857         return packFloatx80(aSign, floatx80_infinity_high,
5858                                    floatx80_infinity_low);
5859     }
5860     if ( aExp == 0 ) {
5861         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5862         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5863     }
5864     else {
5865         aSig0 |= LIT64( 0x0001000000000000 );
5866     }
5867     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5868     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5869
5870 }
5871
5872 /*----------------------------------------------------------------------------
5873 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5874 | returns the result as a quadruple-precision floating-point value.  The
5875 | operation is performed according to the IEC/IEEE Standard for Binary
5876 | Floating-Point Arithmetic.
5877 *----------------------------------------------------------------------------*/
5878
5879 float128 float128_round_to_int(float128 a, float_status *status)
5880 {
5881     flag aSign;
5882     int32_t aExp;
5883     uint64_t lastBitMask, roundBitsMask;
5884     float128 z;
5885
5886     aExp = extractFloat128Exp( a );
5887     if ( 0x402F <= aExp ) {
5888         if ( 0x406F <= aExp ) {
5889             if (    ( aExp == 0x7FFF )
5890                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5891                ) {
5892                 return propagateFloat128NaN(a, a, status);
5893             }
5894             return a;
5895         }
5896         lastBitMask = 1;
5897         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5898         roundBitsMask = lastBitMask - 1;
5899         z = a;
5900         switch (status->float_rounding_mode) {
5901         case float_round_nearest_even:
5902             if ( lastBitMask ) {
5903                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5904                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5905             }
5906             else {
5907                 if ( (int64_t) z.low < 0 ) {
5908                     ++z.high;
5909                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5910                 }
5911             }
5912             break;
5913         case float_round_ties_away:
5914             if (lastBitMask) {
5915                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5916             } else {
5917                 if ((int64_t) z.low < 0) {
5918                     ++z.high;
5919                 }
5920             }
5921             break;
5922         case float_round_to_zero:
5923             break;
5924         case float_round_up:
5925             if (!extractFloat128Sign(z)) {
5926                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5927             }
5928             break;
5929         case float_round_down:
5930             if (extractFloat128Sign(z)) {
5931                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5932             }
5933             break;
5934         default:
5935             abort();
5936         }
5937         z.low &= ~ roundBitsMask;
5938     }
5939     else {
5940         if ( aExp < 0x3FFF ) {
5941             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5942             status->float_exception_flags |= float_flag_inexact;
5943             aSign = extractFloat128Sign( a );
5944             switch (status->float_rounding_mode) {
5945              case float_round_nearest_even:
5946                 if (    ( aExp == 0x3FFE )
5947                      && (   extractFloat128Frac0( a )
5948                           | extractFloat128Frac1( a ) )
5949                    ) {
5950                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5951                 }
5952                 break;
5953             case float_round_ties_away:
5954                 if (aExp == 0x3FFE) {
5955                     return packFloat128(aSign, 0x3FFF, 0, 0);
5956                 }
5957                 break;
5958              case float_round_down:
5959                 return
5960                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5961                     : packFloat128( 0, 0, 0, 0 );
5962              case float_round_up:
5963                 return
5964                       aSign ? packFloat128( 1, 0, 0, 0 )
5965                     : packFloat128( 0, 0x3FFF, 0, 0 );
5966             }
5967             return packFloat128( aSign, 0, 0, 0 );
5968         }
5969         lastBitMask = 1;
5970         lastBitMask <<= 0x402F - aExp;
5971         roundBitsMask = lastBitMask - 1;
5972         z.low = 0;
5973         z.high = a.high;
5974         switch (status->float_rounding_mode) {
5975         case float_round_nearest_even:
5976             z.high += lastBitMask>>1;
5977             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5978                 z.high &= ~ lastBitMask;
5979             }
5980             break;
5981         case float_round_ties_away:
5982             z.high += lastBitMask>>1;
5983             break;
5984         case float_round_to_zero:
5985             break;
5986         case float_round_up:
5987             if (!extractFloat128Sign(z)) {
5988                 z.high |= ( a.low != 0 );
5989                 z.high += roundBitsMask;
5990             }
5991             break;
5992         case float_round_down:
5993             if (extractFloat128Sign(z)) {
5994                 z.high |= (a.low != 0);
5995                 z.high += roundBitsMask;
5996             }
5997             break;
5998         default:
5999             abort();
6000         }
6001         z.high &= ~ roundBitsMask;
6002     }
6003     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6004         status->float_exception_flags |= float_flag_inexact;
6005     }
6006     return z;
6007
6008 }
6009
6010 /*----------------------------------------------------------------------------
6011 | Returns the result of adding the absolute values of the quadruple-precision
6012 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6013 | before being returned.  `zSign' is ignored if the result is a NaN.
6014 | The addition is performed according to the IEC/IEEE Standard for Binary
6015 | Floating-Point Arithmetic.
6016 *----------------------------------------------------------------------------*/
6017
6018 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6019                                 float_status *status)
6020 {
6021     int32_t aExp, bExp, zExp;
6022     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6023     int32_t expDiff;
6024
6025     aSig1 = extractFloat128Frac1( a );
6026     aSig0 = extractFloat128Frac0( a );
6027     aExp = extractFloat128Exp( a );
6028     bSig1 = extractFloat128Frac1( b );
6029     bSig0 = extractFloat128Frac0( b );
6030     bExp = extractFloat128Exp( b );
6031     expDiff = aExp - bExp;
6032     if ( 0 < expDiff ) {
6033         if ( aExp == 0x7FFF ) {
6034             if (aSig0 | aSig1) {
6035                 return propagateFloat128NaN(a, b, status);
6036             }
6037             return a;
6038         }
6039         if ( bExp == 0 ) {
6040             --expDiff;
6041         }
6042         else {
6043             bSig0 |= LIT64( 0x0001000000000000 );
6044         }
6045         shift128ExtraRightJamming(
6046             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6047         zExp = aExp;
6048     }
6049     else if ( expDiff < 0 ) {
6050         if ( bExp == 0x7FFF ) {
6051             if (bSig0 | bSig1) {
6052                 return propagateFloat128NaN(a, b, status);
6053             }
6054             return packFloat128( zSign, 0x7FFF, 0, 0 );
6055         }
6056         if ( aExp == 0 ) {
6057             ++expDiff;
6058         }
6059         else {
6060             aSig0 |= LIT64( 0x0001000000000000 );
6061         }
6062         shift128ExtraRightJamming(
6063             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6064         zExp = bExp;
6065     }
6066     else {
6067         if ( aExp == 0x7FFF ) {
6068             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6069                 return propagateFloat128NaN(a, b, status);
6070             }
6071             return a;
6072         }
6073         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6074         if ( aExp == 0 ) {
6075             if (status->flush_to_zero) {
6076                 if (zSig0 | zSig1) {
6077                     float_raise(float_flag_output_denormal, status);
6078                 }
6079                 return packFloat128(zSign, 0, 0, 0);
6080             }
6081             return packFloat128( zSign, 0, zSig0, zSig1 );
6082         }
6083         zSig2 = 0;
6084         zSig0 |= LIT64( 0x0002000000000000 );
6085         zExp = aExp;
6086         goto shiftRight1;
6087     }
6088     aSig0 |= LIT64( 0x0001000000000000 );
6089     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6090     --zExp;
6091     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6092     ++zExp;
6093  shiftRight1:
6094     shift128ExtraRightJamming(
6095         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6096  roundAndPack:
6097     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6098
6099 }
6100
6101 /*----------------------------------------------------------------------------
6102 | Returns the result of subtracting the absolute values of the quadruple-
6103 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6104 | difference is negated before being returned.  `zSign' is ignored if the
6105 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6106 | Standard for Binary Floating-Point Arithmetic.
6107 *----------------------------------------------------------------------------*/
6108
6109 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6110                                 float_status *status)
6111 {
6112     int32_t aExp, bExp, zExp;
6113     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6114     int32_t expDiff;
6115
6116     aSig1 = extractFloat128Frac1( a );
6117     aSig0 = extractFloat128Frac0( a );
6118     aExp = extractFloat128Exp( a );
6119     bSig1 = extractFloat128Frac1( b );
6120     bSig0 = extractFloat128Frac0( b );
6121     bExp = extractFloat128Exp( b );
6122     expDiff = aExp - bExp;
6123     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6124     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6125     if ( 0 < expDiff ) goto aExpBigger;
6126     if ( expDiff < 0 ) goto bExpBigger;
6127     if ( aExp == 0x7FFF ) {
6128         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6129             return propagateFloat128NaN(a, b, status);
6130         }
6131         float_raise(float_flag_invalid, status);
6132         return float128_default_nan(status);
6133     }
6134     if ( aExp == 0 ) {
6135         aExp = 1;
6136         bExp = 1;
6137     }
6138     if ( bSig0 < aSig0 ) goto aBigger;
6139     if ( aSig0 < bSig0 ) goto bBigger;
6140     if ( bSig1 < aSig1 ) goto aBigger;
6141     if ( aSig1 < bSig1 ) goto bBigger;
6142     return packFloat128(status->float_rounding_mode == float_round_down,
6143                         0, 0, 0);
6144  bExpBigger:
6145     if ( bExp == 0x7FFF ) {
6146         if (bSig0 | bSig1) {
6147             return propagateFloat128NaN(a, b, status);
6148         }
6149         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6150     }
6151     if ( aExp == 0 ) {
6152         ++expDiff;
6153     }
6154     else {
6155         aSig0 |= LIT64( 0x4000000000000000 );
6156     }
6157     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6158     bSig0 |= LIT64( 0x4000000000000000 );
6159  bBigger:
6160     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6161     zExp = bExp;
6162     zSign ^= 1;
6163     goto normalizeRoundAndPack;
6164  aExpBigger:
6165     if ( aExp == 0x7FFF ) {
6166         if (aSig0 | aSig1) {
6167             return propagateFloat128NaN(a, b, status);
6168         }
6169         return a;
6170     }
6171     if ( bExp == 0 ) {
6172         --expDiff;
6173     }
6174     else {
6175         bSig0 |= LIT64( 0x4000000000000000 );
6176     }
6177     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6178     aSig0 |= LIT64( 0x4000000000000000 );
6179  aBigger:
6180     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6181     zExp = aExp;
6182  normalizeRoundAndPack:
6183     --zExp;
6184     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6185                                          status);
6186
6187 }
6188
6189 /*----------------------------------------------------------------------------
6190 | Returns the result of adding the quadruple-precision floating-point values
6191 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6192 | for Binary Floating-Point Arithmetic.
6193 *----------------------------------------------------------------------------*/
6194
6195 float128 float128_add(float128 a, float128 b, float_status *status)
6196 {
6197     flag aSign, bSign;
6198
6199     aSign = extractFloat128Sign( a );
6200     bSign = extractFloat128Sign( b );
6201     if ( aSign == bSign ) {
6202         return addFloat128Sigs(a, b, aSign, status);
6203     }
6204     else {
6205         return subFloat128Sigs(a, b, aSign, status);
6206     }
6207
6208 }
6209
6210 /*----------------------------------------------------------------------------
6211 | Returns the result of subtracting the quadruple-precision floating-point
6212 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6213 | Standard for Binary Floating-Point Arithmetic.
6214 *----------------------------------------------------------------------------*/
6215
6216 float128 float128_sub(float128 a, float128 b, float_status *status)
6217 {
6218     flag aSign, bSign;
6219
6220     aSign = extractFloat128Sign( a );
6221     bSign = extractFloat128Sign( b );
6222     if ( aSign == bSign ) {
6223         return subFloat128Sigs(a, b, aSign, status);
6224     }
6225     else {
6226         return addFloat128Sigs(a, b, aSign, status);
6227     }
6228
6229 }
6230
6231 /*----------------------------------------------------------------------------
6232 | Returns the result of multiplying the quadruple-precision floating-point
6233 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6234 | Standard for Binary Floating-Point Arithmetic.
6235 *----------------------------------------------------------------------------*/
6236
6237 float128 float128_mul(float128 a, float128 b, float_status *status)
6238 {
6239     flag aSign, bSign, zSign;
6240     int32_t aExp, bExp, zExp;
6241     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6242
6243     aSig1 = extractFloat128Frac1( a );
6244     aSig0 = extractFloat128Frac0( a );
6245     aExp = extractFloat128Exp( a );
6246     aSign = extractFloat128Sign( a );
6247     bSig1 = extractFloat128Frac1( b );
6248     bSig0 = extractFloat128Frac0( b );
6249     bExp = extractFloat128Exp( b );
6250     bSign = extractFloat128Sign( b );
6251     zSign = aSign ^ bSign;
6252     if ( aExp == 0x7FFF ) {
6253         if (    ( aSig0 | aSig1 )
6254              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6255             return propagateFloat128NaN(a, b, status);
6256         }
6257         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6258         return packFloat128( zSign, 0x7FFF, 0, 0 );
6259     }
6260     if ( bExp == 0x7FFF ) {
6261         if (bSig0 | bSig1) {
6262             return propagateFloat128NaN(a, b, status);
6263         }
6264         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6265  invalid:
6266             float_raise(float_flag_invalid, status);
6267             return float128_default_nan(status);
6268         }
6269         return packFloat128( zSign, 0x7FFF, 0, 0 );
6270     }
6271     if ( aExp == 0 ) {
6272         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6273         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6274     }
6275     if ( bExp == 0 ) {
6276         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6277         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6278     }
6279     zExp = aExp + bExp - 0x4000;
6280     aSig0 |= LIT64( 0x0001000000000000 );
6281     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6282     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6283     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6284     zSig2 |= ( zSig3 != 0 );
6285     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6286         shift128ExtraRightJamming(
6287             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6288         ++zExp;
6289     }
6290     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6291
6292 }
6293
6294 /*----------------------------------------------------------------------------
6295 | Returns the result of dividing the quadruple-precision floating-point value
6296 | `a' by the corresponding value `b'.  The operation is performed according to
6297 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6298 *----------------------------------------------------------------------------*/
6299
6300 float128 float128_div(float128 a, float128 b, float_status *status)
6301 {
6302     flag aSign, bSign, zSign;
6303     int32_t aExp, bExp, zExp;
6304     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6305     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6306
6307     aSig1 = extractFloat128Frac1( a );
6308     aSig0 = extractFloat128Frac0( a );
6309     aExp = extractFloat128Exp( a );
6310     aSign = extractFloat128Sign( a );
6311     bSig1 = extractFloat128Frac1( b );
6312     bSig0 = extractFloat128Frac0( b );
6313     bExp = extractFloat128Exp( b );
6314     bSign = extractFloat128Sign( b );
6315     zSign = aSign ^ bSign;
6316     if ( aExp == 0x7FFF ) {
6317         if (aSig0 | aSig1) {
6318             return propagateFloat128NaN(a, b, status);
6319         }
6320         if ( bExp == 0x7FFF ) {
6321             if (bSig0 | bSig1) {
6322                 return propagateFloat128NaN(a, b, status);
6323             }
6324             goto invalid;
6325         }
6326         return packFloat128( zSign, 0x7FFF, 0, 0 );
6327     }
6328     if ( bExp == 0x7FFF ) {
6329         if (bSig0 | bSig1) {
6330             return propagateFloat128NaN(a, b, status);
6331         }
6332         return packFloat128( zSign, 0, 0, 0 );
6333     }
6334     if ( bExp == 0 ) {
6335         if ( ( bSig0 | bSig1 ) == 0 ) {
6336             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6337  invalid:
6338                 float_raise(float_flag_invalid, status);
6339                 return float128_default_nan(status);
6340             }
6341             float_raise(float_flag_divbyzero, status);
6342             return packFloat128( zSign, 0x7FFF, 0, 0 );
6343         }
6344         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6345     }
6346     if ( aExp == 0 ) {
6347         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6348         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6349     }
6350     zExp = aExp - bExp + 0x3FFD;
6351     shortShift128Left(
6352         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6353     shortShift128Left(
6354         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6355     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6356         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6357         ++zExp;
6358     }
6359     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6360     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6361     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6362     while ( (int64_t) rem0 < 0 ) {
6363         --zSig0;
6364         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6365     }
6366     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6367     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6368         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6369         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6370         while ( (int64_t) rem1 < 0 ) {
6371             --zSig1;
6372             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6373         }
6374         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6375     }
6376     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6377     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6378
6379 }
6380
6381 /*----------------------------------------------------------------------------
6382 | Returns the remainder of the quadruple-precision floating-point value `a'
6383 | with respect to the corresponding value `b'.  The operation is performed
6384 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6385 *----------------------------------------------------------------------------*/
6386
6387 float128 float128_rem(float128 a, float128 b, float_status *status)
6388 {
6389     flag aSign, zSign;
6390     int32_t aExp, bExp, expDiff;
6391     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6392     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6393     int64_t sigMean0;
6394
6395     aSig1 = extractFloat128Frac1( a );
6396     aSig0 = extractFloat128Frac0( a );
6397     aExp = extractFloat128Exp( a );
6398     aSign = extractFloat128Sign( a );
6399     bSig1 = extractFloat128Frac1( b );
6400     bSig0 = extractFloat128Frac0( b );
6401     bExp = extractFloat128Exp( b );
6402     if ( aExp == 0x7FFF ) {
6403         if (    ( aSig0 | aSig1 )
6404              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6405             return propagateFloat128NaN(a, b, status);
6406         }
6407         goto invalid;
6408     }
6409     if ( bExp == 0x7FFF ) {
6410         if (bSig0 | bSig1) {
6411             return propagateFloat128NaN(a, b, status);
6412         }
6413         return a;
6414     }
6415     if ( bExp == 0 ) {
6416         if ( ( bSig0 | bSig1 ) == 0 ) {
6417  invalid:
6418             float_raise(float_flag_invalid, status);
6419             return float128_default_nan(status);
6420         }
6421         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6422     }
6423     if ( aExp == 0 ) {
6424         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6425         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6426     }
6427     expDiff = aExp - bExp;
6428     if ( expDiff < -1 ) return a;
6429     shortShift128Left(
6430         aSig0 | LIT64( 0x0001000000000000 ),
6431         aSig1,
6432         15 - ( expDiff < 0 ),
6433         &aSig0,
6434         &aSig1
6435     );
6436     shortShift128Left(
6437         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6438     q = le128( bSig0, bSig1, aSig0, aSig1 );
6439     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6440     expDiff -= 64;
6441     while ( 0 < expDiff ) {
6442         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6443         q = ( 4 < q ) ? q - 4 : 0;
6444         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6445         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6446         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6447         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6448         expDiff -= 61;
6449     }
6450     if ( -64 < expDiff ) {
6451         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6452         q = ( 4 < q ) ? q - 4 : 0;
6453         q >>= - expDiff;
6454         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6455         expDiff += 52;
6456         if ( expDiff < 0 ) {
6457             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6458         }
6459         else {
6460             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6461         }
6462         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6463         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6464     }
6465     else {
6466         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6467         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6468     }
6469     do {
6470         alternateASig0 = aSig0;
6471         alternateASig1 = aSig1;
6472         ++q;
6473         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6474     } while ( 0 <= (int64_t) aSig0 );
6475     add128(
6476         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6477     if (    ( sigMean0 < 0 )
6478          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6479         aSig0 = alternateASig0;
6480         aSig1 = alternateASig1;
6481     }
6482     zSign = ( (int64_t) aSig0 < 0 );
6483     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6484     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6485                                          status);
6486 }
6487
6488 /*----------------------------------------------------------------------------
6489 | Returns the square root of the quadruple-precision floating-point value `a'.
6490 | The operation is performed according to the IEC/IEEE Standard for Binary
6491 | Floating-Point Arithmetic.
6492 *----------------------------------------------------------------------------*/
6493
6494 float128 float128_sqrt(float128 a, float_status *status)
6495 {
6496     flag aSign;
6497     int32_t aExp, zExp;
6498     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6499     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6500
6501     aSig1 = extractFloat128Frac1( a );
6502     aSig0 = extractFloat128Frac0( a );
6503     aExp = extractFloat128Exp( a );
6504     aSign = extractFloat128Sign( a );
6505     if ( aExp == 0x7FFF ) {
6506         if (aSig0 | aSig1) {
6507             return propagateFloat128NaN(a, a, status);
6508         }
6509         if ( ! aSign ) return a;
6510         goto invalid;
6511     }
6512     if ( aSign ) {
6513         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6514  invalid:
6515         float_raise(float_flag_invalid, status);
6516         return float128_default_nan(status);
6517     }
6518     if ( aExp == 0 ) {
6519         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6520         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6521     }
6522     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6523     aSig0 |= LIT64( 0x0001000000000000 );
6524     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6525     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6526     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6527     doubleZSig0 = zSig0<<1;
6528     mul64To128( zSig0, zSig0, &term0, &term1 );
6529     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6530     while ( (int64_t) rem0 < 0 ) {
6531         --zSig0;
6532         doubleZSig0 -= 2;
6533         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6534     }
6535     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6536     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6537         if ( zSig1 == 0 ) zSig1 = 1;
6538         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6539         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6540         mul64To128( zSig1, zSig1, &term2, &term3 );
6541         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6542         while ( (int64_t) rem1 < 0 ) {
6543             --zSig1;
6544             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6545             term3 |= 1;
6546             term2 |= doubleZSig0;
6547             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6548         }
6549         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6550     }
6551     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6552     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6553
6554 }
6555
6556 /*----------------------------------------------------------------------------
6557 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6558 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6559 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6560 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6561 *----------------------------------------------------------------------------*/
6562
6563 int float128_eq(float128 a, float128 b, float_status *status)
6564 {
6565
6566     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6567               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6568          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6569               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6570        ) {
6571         float_raise(float_flag_invalid, status);
6572         return 0;
6573     }
6574     return
6575            ( a.low == b.low )
6576         && (    ( a.high == b.high )
6577              || (    ( a.low == 0 )
6578                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6579            );
6580
6581 }
6582
6583 /*----------------------------------------------------------------------------
6584 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6585 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6586 | exception is raised if either operand is a NaN.  The comparison is performed
6587 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6588 *----------------------------------------------------------------------------*/
6589
6590 int float128_le(float128 a, float128 b, float_status *status)
6591 {
6592     flag aSign, bSign;
6593
6594     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6595               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6596          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6597               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6598        ) {
6599         float_raise(float_flag_invalid, status);
6600         return 0;
6601     }
6602     aSign = extractFloat128Sign( a );
6603     bSign = extractFloat128Sign( b );
6604     if ( aSign != bSign ) {
6605         return
6606                aSign
6607             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6608                  == 0 );
6609     }
6610     return
6611           aSign ? le128( b.high, b.low, a.high, a.low )
6612         : le128( a.high, a.low, b.high, b.low );
6613
6614 }
6615
6616 /*----------------------------------------------------------------------------
6617 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6618 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6619 | raised if either operand is a NaN.  The comparison is performed according
6620 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6621 *----------------------------------------------------------------------------*/
6622
6623 int float128_lt(float128 a, float128 b, float_status *status)
6624 {
6625     flag aSign, bSign;
6626
6627     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6628               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6629          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6630               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6631        ) {
6632         float_raise(float_flag_invalid, status);
6633         return 0;
6634     }
6635     aSign = extractFloat128Sign( a );
6636     bSign = extractFloat128Sign( b );
6637     if ( aSign != bSign ) {
6638         return
6639                aSign
6640             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6641                  != 0 );
6642     }
6643     return
6644           aSign ? lt128( b.high, b.low, a.high, a.low )
6645         : lt128( a.high, a.low, b.high, b.low );
6646
6647 }
6648
6649 /*----------------------------------------------------------------------------
6650 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6651 | be compared, and 0 otherwise.  The invalid exception is raised if either
6652 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6653 | Standard for Binary Floating-Point Arithmetic.
6654 *----------------------------------------------------------------------------*/
6655
6656 int float128_unordered(float128 a, float128 b, float_status *status)
6657 {
6658     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6659               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6660          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6661               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6662        ) {
6663         float_raise(float_flag_invalid, status);
6664         return 1;
6665     }
6666     return 0;
6667 }
6668
6669 /*----------------------------------------------------------------------------
6670 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6671 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6672 | exception.  The comparison is performed according to the IEC/IEEE Standard
6673 | for Binary Floating-Point Arithmetic.
6674 *----------------------------------------------------------------------------*/
6675
6676 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6677 {
6678
6679     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6680               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6681          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6682               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6683        ) {
6684         if (float128_is_signaling_nan(a, status)
6685          || float128_is_signaling_nan(b, status)) {
6686             float_raise(float_flag_invalid, status);
6687         }
6688         return 0;
6689     }
6690     return
6691            ( a.low == b.low )
6692         && (    ( a.high == b.high )
6693              || (    ( a.low == 0 )
6694                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6695            );
6696
6697 }
6698
6699 /*----------------------------------------------------------------------------
6700 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6701 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6702 | cause an exception.  Otherwise, the comparison is performed according to the
6703 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6704 *----------------------------------------------------------------------------*/
6705
6706 int float128_le_quiet(float128 a, float128 b, float_status *status)
6707 {
6708     flag aSign, bSign;
6709
6710     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6711               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6712          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6713               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6714        ) {
6715         if (float128_is_signaling_nan(a, status)
6716          || float128_is_signaling_nan(b, status)) {
6717             float_raise(float_flag_invalid, status);
6718         }
6719         return 0;
6720     }
6721     aSign = extractFloat128Sign( a );
6722     bSign = extractFloat128Sign( b );
6723     if ( aSign != bSign ) {
6724         return
6725                aSign
6726             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6727                  == 0 );
6728     }
6729     return
6730           aSign ? le128( b.high, b.low, a.high, a.low )
6731         : le128( a.high, a.low, b.high, b.low );
6732
6733 }
6734
6735 /*----------------------------------------------------------------------------
6736 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6737 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6738 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6739 | Standard for Binary Floating-Point Arithmetic.
6740 *----------------------------------------------------------------------------*/
6741
6742 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6743 {
6744     flag aSign, bSign;
6745
6746     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6747               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6748          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6749               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6750        ) {
6751         if (float128_is_signaling_nan(a, status)
6752          || float128_is_signaling_nan(b, status)) {
6753             float_raise(float_flag_invalid, status);
6754         }
6755         return 0;
6756     }
6757     aSign = extractFloat128Sign( a );
6758     bSign = extractFloat128Sign( b );
6759     if ( aSign != bSign ) {
6760         return
6761                aSign
6762             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6763                  != 0 );
6764     }
6765     return
6766           aSign ? lt128( b.high, b.low, a.high, a.low )
6767         : lt128( a.high, a.low, b.high, b.low );
6768
6769 }
6770
6771 /*----------------------------------------------------------------------------
6772 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6773 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6774 | comparison is performed according to the IEC/IEEE Standard for Binary
6775 | Floating-Point Arithmetic.
6776 *----------------------------------------------------------------------------*/
6777
6778 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6779 {
6780     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6781               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6782          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6783               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6784        ) {
6785         if (float128_is_signaling_nan(a, status)
6786          || float128_is_signaling_nan(b, status)) {
6787             float_raise(float_flag_invalid, status);
6788         }
6789         return 1;
6790     }
6791     return 0;
6792 }
6793
6794 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6795                                             int is_quiet, float_status *status)
6796 {
6797     flag aSign, bSign;
6798
6799     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6800         float_raise(float_flag_invalid, status);
6801         return float_relation_unordered;
6802     }
6803     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6804           ( extractFloatx80Frac( a )<<1 ) ) ||
6805         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6806           ( extractFloatx80Frac( b )<<1 ) )) {
6807         if (!is_quiet ||
6808             floatx80_is_signaling_nan(a, status) ||
6809             floatx80_is_signaling_nan(b, status)) {
6810             float_raise(float_flag_invalid, status);
6811         }
6812         return float_relation_unordered;
6813     }
6814     aSign = extractFloatx80Sign( a );
6815     bSign = extractFloatx80Sign( b );
6816     if ( aSign != bSign ) {
6817
6818         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6819              ( ( a.low | b.low ) == 0 ) ) {
6820             /* zero case */
6821             return float_relation_equal;
6822         } else {
6823             return 1 - (2 * aSign);
6824         }
6825     } else {
6826         if (a.low == b.low && a.high == b.high) {
6827             return float_relation_equal;
6828         } else {
6829             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6830         }
6831     }
6832 }
6833
6834 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6835 {
6836     return floatx80_compare_internal(a, b, 0, status);
6837 }
6838
6839 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6840 {
6841     return floatx80_compare_internal(a, b, 1, status);
6842 }
6843
6844 static inline int float128_compare_internal(float128 a, float128 b,
6845                                             int is_quiet, float_status *status)
6846 {
6847     flag aSign, bSign;
6848
6849     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6850           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6851         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6852           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6853         if (!is_quiet ||
6854             float128_is_signaling_nan(a, status) ||
6855             float128_is_signaling_nan(b, status)) {
6856             float_raise(float_flag_invalid, status);
6857         }
6858         return float_relation_unordered;
6859     }
6860     aSign = extractFloat128Sign( a );
6861     bSign = extractFloat128Sign( b );
6862     if ( aSign != bSign ) {
6863         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6864             /* zero case */
6865             return float_relation_equal;
6866         } else {
6867             return 1 - (2 * aSign);
6868         }
6869     } else {
6870         if (a.low == b.low && a.high == b.high) {
6871             return float_relation_equal;
6872         } else {
6873             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6874         }
6875     }
6876 }
6877
6878 int float128_compare(float128 a, float128 b, float_status *status)
6879 {
6880     return float128_compare_internal(a, b, 0, status);
6881 }
6882
6883 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6884 {
6885     return float128_compare_internal(a, b, 1, status);
6886 }
6887
6888 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6889 {
6890     flag aSign;
6891     int32_t aExp;
6892     uint64_t aSig;
6893
6894     if (floatx80_invalid_encoding(a)) {
6895         float_raise(float_flag_invalid, status);
6896         return floatx80_default_nan(status);
6897     }
6898     aSig = extractFloatx80Frac( a );
6899     aExp = extractFloatx80Exp( a );
6900     aSign = extractFloatx80Sign( a );
6901
6902     if ( aExp == 0x7FFF ) {
6903         if ( aSig<<1 ) {
6904             return propagateFloatx80NaN(a, a, status);
6905         }
6906         return a;
6907     }
6908
6909     if (aExp == 0) {
6910         if (aSig == 0) {
6911             return a;
6912         }
6913         aExp++;
6914     }
6915
6916     if (n > 0x10000) {
6917         n = 0x10000;
6918     } else if (n < -0x10000) {
6919         n = -0x10000;
6920     }
6921
6922     aExp += n;
6923     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
6924                                          aSign, aExp, aSig, 0, status);
6925 }
6926
6927 float128 float128_scalbn(float128 a, int n, float_status *status)
6928 {
6929     flag aSign;
6930     int32_t aExp;
6931     uint64_t aSig0, aSig1;
6932
6933     aSig1 = extractFloat128Frac1( a );
6934     aSig0 = extractFloat128Frac0( a );
6935     aExp = extractFloat128Exp( a );
6936     aSign = extractFloat128Sign( a );
6937     if ( aExp == 0x7FFF ) {
6938         if ( aSig0 | aSig1 ) {
6939             return propagateFloat128NaN(a, a, status);
6940         }
6941         return a;
6942     }
6943     if (aExp != 0) {
6944         aSig0 |= LIT64( 0x0001000000000000 );
6945     } else if (aSig0 == 0 && aSig1 == 0) {
6946         return a;
6947     } else {
6948         aExp++;
6949     }
6950
6951     if (n > 0x10000) {
6952         n = 0x10000;
6953     } else if (n < -0x10000) {
6954         n = -0x10000;
6955     }
6956
6957     aExp += n - 1;
6958     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6959                                          , status);
6960
6961 }